natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,15 @@ from typing import List, Optional
|
|
4
4
|
|
5
5
|
logger = logging.getLogger(__name__)
|
6
6
|
|
7
|
+
|
7
8
|
@dataclass
|
8
9
|
class TextStyleOptions:
|
9
10
|
"""Options for configuring text style analysis."""
|
10
11
|
|
11
12
|
# Properties to consider when grouping elements by style
|
12
|
-
group_by: List[str] = field(
|
13
|
+
group_by: List[str] = field(
|
14
|
+
default_factory=lambda: ["size", "fontname", "is_bold", "is_italic", "color"]
|
15
|
+
)
|
13
16
|
|
14
17
|
# Tolerance for comparing font sizes (e.g., 0.5 rounds to nearest 0.5 point)
|
15
18
|
size_tolerance: float = 0.5
|
@@ -30,31 +33,35 @@ class TextStyleOptions:
|
|
30
33
|
# Format string for descriptive labels. Placeholders match keys in style_properties dict.
|
31
34
|
# Example: "{size}pt {weight}{style} {family} ({color})"
|
32
35
|
# Available keys: size, fontname, is_bold, is_italic, color, weight, style, family
|
33
|
-
label_format: str = "{size}pt {weight}{style} {family}"
|
34
|
-
|
36
|
+
label_format: str = "{size}pt {weight}{style} {family}" # Default format without color
|
35
37
|
|
36
38
|
def __post_init__(self):
|
37
39
|
# Validate size_tolerance
|
38
40
|
if self.size_tolerance <= 0:
|
39
|
-
logger.warning(
|
41
|
+
logger.warning(
|
42
|
+
f"size_tolerance must be positive, setting to 0.1. Original value: {self.size_tolerance}"
|
43
|
+
)
|
40
44
|
self.size_tolerance = 0.1
|
41
45
|
|
42
46
|
# Ensure 'size' is always considered if tolerance is relevant
|
43
|
-
if
|
47
|
+
if "size" not in self.group_by and self.size_tolerance > 0:
|
44
48
|
logger.debug("Adding 'size' to group_by keys because size_tolerance is set.")
|
45
|
-
if
|
49
|
+
if "size" not in self.group_by:
|
50
|
+
self.group_by.append("size")
|
46
51
|
|
47
|
-
if self.ignore_color and
|
52
|
+
if self.ignore_color and "color" in self.group_by:
|
48
53
|
logger.debug("Removing 'color' from group_by keys because ignore_color is True.")
|
49
|
-
self.group_by = [key for key in self.group_by if key !=
|
50
|
-
elif not self.ignore_color and
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
+
self.group_by = [key for key in self.group_by if key != "color"]
|
55
|
+
elif not self.ignore_color and "color" not in self.group_by:
|
56
|
+
# If color isn't ignored, ensure it's included if requested in label format?
|
57
|
+
# For now, just rely on explicit group_by setting.
|
58
|
+
pass
|
54
59
|
|
55
60
|
# Basic validation for group_by keys
|
56
|
-
allowed_keys = {
|
61
|
+
allowed_keys = {"size", "fontname", "is_bold", "is_italic", "color"}
|
57
62
|
invalid_keys = set(self.group_by) - allowed_keys
|
58
63
|
if invalid_keys:
|
59
|
-
logger.warning(
|
60
|
-
|
64
|
+
logger.warning(
|
65
|
+
f"Invalid keys found in group_by: {invalid_keys}. Allowed keys: {allowed_keys}. Ignoring invalid keys."
|
66
|
+
)
|
67
|
+
self.group_by = [key for key in self.group_by if key in allowed_keys]
|
@@ -1,18 +1,21 @@
|
|
1
1
|
"""
|
2
2
|
Text structure analyzer for natural-pdf.
|
3
3
|
"""
|
4
|
+
|
4
5
|
import logging
|
5
6
|
import re
|
6
|
-
from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
|
7
7
|
from collections import defaultdict
|
8
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
9
|
+
|
10
|
+
from natural_pdf.analyzers.text_options import TextStyleOptions
|
8
11
|
|
9
12
|
# Import ElementCollection and TextStyleOptions
|
10
13
|
from natural_pdf.elements.collections import ElementCollection
|
11
|
-
from natural_pdf.analyzers.text_options import TextStyleOptions
|
12
14
|
|
13
15
|
if TYPE_CHECKING:
|
14
16
|
from natural_pdf.core.page import Page
|
15
17
|
from natural_pdf.elements.base import Element
|
18
|
+
|
16
19
|
# Remove ElementCollection from here if imported above
|
17
20
|
|
18
21
|
logger = logging.getLogger(__name__)
|
@@ -21,63 +24,77 @@ logger = logging.getLogger(__name__)
|
|
21
24
|
FONT_PREFIX_RE = re.compile(r"^[A-Z]{6}\+")
|
22
25
|
|
23
26
|
# Common font weight/style keywords
|
24
|
-
FONT_WEIGHTS = {
|
27
|
+
FONT_WEIGHTS = {
|
28
|
+
"bold": "Bold",
|
29
|
+
"black": "Bold",
|
30
|
+
"heavy": "Bold",
|
31
|
+
"medium": "",
|
32
|
+
"light": "Light",
|
33
|
+
"thin": "Thin",
|
34
|
+
}
|
25
35
|
FONT_STYLES = {"italic": "Italic", "oblique": "Italic"}
|
26
36
|
|
37
|
+
|
27
38
|
class TextStyleAnalyzer:
|
28
39
|
"""
|
29
40
|
Analyzes and groups text elements by their style properties based on configuration.
|
30
|
-
|
41
|
+
|
31
42
|
This analyzer groups text elements based on specified font properties
|
32
43
|
(controlled by TextStyleOptions) and adds 'style_label', 'style_key',
|
33
44
|
and 'style_properties' attributes to each processed text element.
|
34
45
|
"""
|
35
|
-
|
46
|
+
|
36
47
|
def __init__(self, options: Optional[TextStyleOptions] = None):
|
37
48
|
"""
|
38
49
|
Initialize the text style analyzer.
|
39
|
-
|
50
|
+
|
40
51
|
Args:
|
41
52
|
options: Configuration options for the analysis. Uses default if None.
|
42
53
|
"""
|
43
54
|
self.options = options or TextStyleOptions()
|
44
55
|
logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")
|
45
56
|
|
46
|
-
def analyze(
|
57
|
+
def analyze(
|
58
|
+
self, page: "Page", options: Optional[TextStyleOptions] = None
|
59
|
+
) -> "ElementCollection":
|
47
60
|
"""
|
48
61
|
Analyze text styles on a page, group elements, and add style attributes.
|
49
|
-
|
62
|
+
|
50
63
|
Args:
|
51
64
|
page: The Page object to analyze.
|
52
65
|
options: Override the analyzer's default TextStyleOptions for this run.
|
53
|
-
|
66
|
+
|
54
67
|
Returns:
|
55
68
|
ElementCollection containing all processed text elements (typically words)
|
56
69
|
with added 'style_label', 'style_key', and 'style_properties' attributes.
|
57
70
|
"""
|
58
71
|
current_options = options or self.options
|
59
|
-
logger.info(
|
72
|
+
logger.info(
|
73
|
+
f"Starting text style analysis for page {page.number} with options: {current_options}"
|
74
|
+
)
|
60
75
|
|
61
76
|
# Use page.words for better granularity
|
62
77
|
text_elements = page.words
|
63
78
|
# Fallback if words are somehow empty/not generated
|
64
79
|
if not text_elements:
|
65
|
-
|
80
|
+
text_elements = page.find_all("text").elements # Get list from collection
|
66
81
|
|
67
82
|
# Skip empty pages or pages with no text elements
|
68
83
|
if not text_elements:
|
69
84
|
logger.warning(f"Page {page.number} has no text elements to analyze.")
|
70
85
|
return ElementCollection([])
|
71
86
|
|
72
|
-
style_cache: Dict[Tuple, Dict[str, Any]] =
|
73
|
-
|
87
|
+
style_cache: Dict[Tuple, Dict[str, Any]] = (
|
88
|
+
{}
|
89
|
+
) # Maps style_key_tuple -> {'label': str, 'properties': dict}
|
90
|
+
processed_elements: List["Element"] = []
|
74
91
|
|
75
92
|
# Ensure consistent ordering for style key creation
|
76
93
|
group_by_keys = sorted(current_options.group_by)
|
77
94
|
|
78
95
|
for element in text_elements:
|
79
96
|
# Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
|
80
|
-
if not hasattr(element,
|
97
|
+
if not hasattr(element, "text") or not hasattr(element, "size"):
|
81
98
|
logger.debug(f"Skipping element without text/size: {element}")
|
82
99
|
continue
|
83
100
|
|
@@ -86,37 +103,47 @@ class TextStyleAnalyzer:
|
|
86
103
|
style_key = self._create_style_key(style_properties, group_by_keys)
|
87
104
|
|
88
105
|
if style_key not in style_cache:
|
89
|
-
label = self._generate_style_label(
|
90
|
-
|
91
|
-
|
106
|
+
label = self._generate_style_label(
|
107
|
+
style_properties, current_options, len(style_cache) + 1
|
108
|
+
)
|
109
|
+
style_cache[style_key] = {"label": label, "properties": style_properties}
|
110
|
+
logger.debug(
|
111
|
+
f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}"
|
112
|
+
)
|
92
113
|
|
93
114
|
# Add attributes to the element
|
94
|
-
element.style_label = style_cache[style_key][
|
115
|
+
element.style_label = style_cache[style_key]["label"]
|
95
116
|
element.style_key = style_key
|
96
117
|
# Add the full properties dict for potential detailed inspection
|
97
|
-
element.style_properties = style_cache[style_key][
|
118
|
+
element.style_properties = style_cache[style_key]["properties"]
|
98
119
|
|
99
120
|
processed_elements.append(element)
|
100
121
|
|
101
122
|
except Exception as e:
|
102
|
-
|
103
|
-
|
104
|
-
|
123
|
+
logger.warning(
|
124
|
+
f"Error processing element {element} for text style: {e}", exc_info=True
|
125
|
+
)
|
126
|
+
# Optionally add element without style info or skip it
|
127
|
+
# processed_elements.append(element) # Add anyway?
|
105
128
|
|
106
129
|
# Optionally store a summary on the page
|
107
130
|
page._text_styles_summary = style_cache
|
108
|
-
logger.info(
|
131
|
+
logger.info(
|
132
|
+
f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles."
|
133
|
+
)
|
109
134
|
|
110
135
|
return ElementCollection(processed_elements)
|
111
|
-
|
112
|
-
def _extract_style_properties(
|
136
|
+
|
137
|
+
def _extract_style_properties(
|
138
|
+
self, element: "Element", options: TextStyleOptions
|
139
|
+
) -> Dict[str, Any]:
|
113
140
|
"""
|
114
141
|
Extract style properties from a text element based on options.
|
115
|
-
|
142
|
+
|
116
143
|
Args:
|
117
144
|
element: Text element.
|
118
145
|
options: TextStyleOptions driving the extraction.
|
119
|
-
|
146
|
+
|
120
147
|
Returns:
|
121
148
|
Dictionary of extracted style properties.
|
122
149
|
"""
|
@@ -124,68 +151,81 @@ class TextStyleAnalyzer:
|
|
124
151
|
|
125
152
|
# Font size
|
126
153
|
font_size = None
|
127
|
-
if hasattr(element,
|
154
|
+
if hasattr(element, "size") and element.size is not None:
|
128
155
|
# Round based on tolerance
|
129
156
|
rounding_factor = 1.0 / options.size_tolerance
|
130
157
|
font_size = round(element.size * rounding_factor) / rounding_factor
|
131
|
-
properties[
|
158
|
+
properties["size"] = font_size
|
132
159
|
|
133
160
|
# Font name
|
134
161
|
font_name = None
|
135
162
|
normalized_font_name = None
|
136
|
-
if hasattr(element,
|
163
|
+
if hasattr(element, "fontname") and element.fontname is not None:
|
137
164
|
font_name = element.fontname
|
138
165
|
normalized_font_name = self._normalize_font_name(font_name, options)
|
139
|
-
properties[
|
166
|
+
properties["fontname"] = normalized_font_name if options.normalize_fontname else font_name
|
140
167
|
|
141
168
|
# Font characteristics (derived from normalized name if available)
|
142
169
|
name_to_check = normalized_font_name or font_name or ""
|
143
170
|
name_lower = name_to_check.lower()
|
144
|
-
is_bold = (
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
171
|
+
is_bold = (
|
172
|
+
"bold" in name_lower
|
173
|
+
or "black" in name_lower
|
174
|
+
or "heavy" in name_lower
|
175
|
+
or name_to_check.endswith("-B")
|
176
|
+
)
|
177
|
+
is_italic = (
|
178
|
+
"italic" in name_lower or "oblique" in name_lower or name_to_check.endswith("-I")
|
179
|
+
)
|
180
|
+
|
181
|
+
properties["is_bold"] = is_bold
|
182
|
+
properties["is_italic"] = is_italic
|
149
183
|
|
150
184
|
# Text color
|
151
185
|
color = None
|
152
|
-
if
|
186
|
+
if (
|
187
|
+
not options.ignore_color
|
188
|
+
and hasattr(element, "non_stroking_color")
|
189
|
+
and element.non_stroking_color is not None
|
190
|
+
):
|
153
191
|
raw_color = element.non_stroking_color
|
154
192
|
# Convert color to a hashable form (tuple)
|
155
193
|
if isinstance(raw_color, (list, tuple)):
|
156
|
-
color = tuple(round(c, 3) for c in raw_color)
|
194
|
+
color = tuple(round(c, 3) for c in raw_color) # Round color components
|
157
195
|
else:
|
158
196
|
# Handle simple grayscale or other non-list representations if needed
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
197
|
+
try:
|
198
|
+
color = round(float(raw_color), 3)
|
199
|
+
except (ValueError, TypeError):
|
200
|
+
color = str(raw_color) # Fallback to string if cannot convert
|
163
201
|
# Normalize common colors (optional, could be complex)
|
164
202
|
# Example: (0.0, 0.0, 0.0) -> 'black', (1.0, 1.0, 1.0) -> 'white'
|
165
|
-
if color == (0.0, 0.0, 0.0) or color == 0.0:
|
166
|
-
|
167
|
-
|
203
|
+
if color == (0.0, 0.0, 0.0) or color == 0.0:
|
204
|
+
color = "black"
|
205
|
+
if color == (1.0, 1.0, 1.0) or color == 1.0:
|
206
|
+
color = "white"
|
207
|
+
properties["color"] = color
|
168
208
|
|
169
209
|
return properties
|
170
|
-
|
210
|
+
|
171
211
|
def _normalize_font_name(self, fontname: str, options: TextStyleOptions) -> str:
|
172
|
-
"""
|
212
|
+
"""Basic normalization of font names."""
|
173
213
|
if not options.normalize_fontname:
|
174
214
|
return fontname
|
175
215
|
# Remove common subset prefixes like "ABCDEF+"
|
176
216
|
name = FONT_PREFIX_RE.sub("", fontname)
|
177
217
|
# Could add more rules here, e.g., removing version numbers, standardizing separators
|
178
218
|
return name
|
179
|
-
|
219
|
+
|
180
220
|
def _parse_font_name(self, normalized_fontname: str) -> Dict[str, str]:
|
181
|
-
"""
|
221
|
+
"""Attempt to parse family, weight, and style from a font name. Very heuristic."""
|
182
222
|
if not normalized_fontname:
|
183
|
-
return {
|
223
|
+
return {"family": "Unknown", "weight": "", "style": ""}
|
184
224
|
|
185
|
-
parts = re.split(r
|
225
|
+
parts = re.split(r"[-,_ ]", normalized_fontname)
|
186
226
|
family_parts = []
|
187
|
-
weight =
|
188
|
-
style =
|
227
|
+
weight = ""
|
228
|
+
style = ""
|
189
229
|
|
190
230
|
for part in parts:
|
191
231
|
part_lower = part.lower()
|
@@ -196,7 +236,8 @@ class TextStyleAnalyzer:
|
|
196
236
|
weight = val
|
197
237
|
found = True
|
198
238
|
break
|
199
|
-
if found:
|
239
|
+
if found:
|
240
|
+
continue # Skip part if it was a weight
|
200
241
|
|
201
242
|
# Check styles
|
202
243
|
for key, val in FONT_STYLES.items():
|
@@ -204,67 +245,72 @@ class TextStyleAnalyzer:
|
|
204
245
|
style = val
|
205
246
|
found = True
|
206
247
|
break
|
207
|
-
if found:
|
248
|
+
if found:
|
249
|
+
continue # Skip part if it was a style
|
208
250
|
|
209
251
|
# If not weight or style, assume it's part of the family name
|
210
|
-
if part:
|
211
|
-
|
252
|
+
if part: # Avoid empty strings from multiple delimiters
|
253
|
+
family_parts.append(part)
|
212
254
|
|
213
|
-
family = "".join(family_parts) or "Unknown"
|
255
|
+
family = "".join(family_parts) or "Unknown" # Join remaining parts
|
214
256
|
# Simple cleanup: Remove "MT" often appended? Maybe too aggressive.
|
215
257
|
# if family.endswith("MT"): family = family[:-2]
|
216
258
|
|
217
|
-
return {
|
218
|
-
|
259
|
+
return {"family": family, "weight": weight, "style": style}
|
260
|
+
|
219
261
|
def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
|
220
|
-
"""
|
262
|
+
"""Create a hashable tuple key based on selected properties."""
|
221
263
|
key_parts = []
|
222
|
-
for key in group_by_keys:
|
264
|
+
for key in group_by_keys: # Use the pre-sorted list
|
223
265
|
value = properties.get(key)
|
224
266
|
# Ensure hashable - colors should already be tuples or basic types
|
225
|
-
if isinstance(value, list):
|
267
|
+
if isinstance(value, list): # Should not happen if _extract handled color correctly
|
226
268
|
value = tuple(value)
|
227
269
|
key_parts.append(value)
|
228
270
|
return tuple(key_parts)
|
229
|
-
|
230
|
-
def _generate_style_label(
|
231
|
-
|
271
|
+
|
272
|
+
def _generate_style_label(
|
273
|
+
self, properties: Dict[str, Any], options: TextStyleOptions, style_index: int
|
274
|
+
) -> str:
|
275
|
+
"""Generate a style label based on properties and options."""
|
232
276
|
if not options.descriptive_labels:
|
233
277
|
return f"{options.label_prefix} {style_index}"
|
234
278
|
|
235
279
|
try:
|
236
|
-
font_details = self._parse_font_name(properties.get(
|
280
|
+
font_details = self._parse_font_name(properties.get("fontname", ""))
|
237
281
|
|
238
282
|
label_data = {
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
283
|
+
"size": properties.get("size", "?"),
|
284
|
+
"fontname": properties.get("fontname", "Unknown"),
|
285
|
+
"is_bold": properties.get("is_bold", False),
|
286
|
+
"is_italic": properties.get("is_italic", False),
|
287
|
+
"color": properties.get("color", ""),
|
288
|
+
"family": font_details["family"],
|
245
289
|
# Use parsed weight/style if available, otherwise fallback to is_bold/is_italic flags
|
246
|
-
|
247
|
-
|
290
|
+
"weight": font_details["weight"] or ("Bold" if properties.get("is_bold") else ""),
|
291
|
+
"style": font_details["style"] or ("Italic" if properties.get("is_italic") else ""),
|
248
292
|
}
|
249
293
|
# Ensure style has a space separator if both weight and style exist
|
250
|
-
if label_data[
|
251
|
-
label_data[
|
294
|
+
if label_data["weight"] and label_data["style"]:
|
295
|
+
label_data["style"] = " " + label_data["style"]
|
252
296
|
|
253
297
|
# Handle color formatting for label
|
254
|
-
color_val = label_data[
|
298
|
+
color_val = label_data["color"]
|
255
299
|
if isinstance(color_val, tuple):
|
256
|
-
|
300
|
+
color_str = f"rgb{color_val}" # Basic tuple representation
|
257
301
|
elif isinstance(color_val, str):
|
258
|
-
|
302
|
+
color_str = color_val # Already string ('black', 'white', or fallback)
|
259
303
|
else:
|
260
|
-
|
261
|
-
label_data[
|
304
|
+
color_str = str(color_val) # Other types
|
305
|
+
label_data["color_str"] = color_str
|
262
306
|
|
263
307
|
# Format the label, handle potential missing keys in format string gracefully
|
264
308
|
label = options.label_format.format_map(defaultdict(str, label_data))
|
265
|
-
return label.strip().replace(" ", " ")
|
309
|
+
return label.strip().replace(" ", " ") # Cleanup extra spaces
|
266
310
|
|
267
311
|
except Exception as e:
|
268
|
-
logger.warning(
|
312
|
+
logger.warning(
|
313
|
+
f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label."
|
314
|
+
)
|
269
315
|
# Fallback to numeric label on error
|
270
|
-
return f"{options.label_prefix} {style_index}"
|
316
|
+
return f"{options.label_prefix} {style_index}"
|
natural_pdf/analyzers/utils.py
CHANGED
@@ -1,57 +1,64 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import
|
2
|
+
from typing import Any, Dict, List
|
3
|
+
|
3
4
|
from ..elements.region import Region
|
4
5
|
|
5
|
-
|
6
|
-
|
6
|
+
|
7
|
+
def convert_to_regions(
|
8
|
+
page: Any, detections: List[Dict[str, Any]], scale_factor: float = 1.0
|
9
|
+
) -> List[Region]:
|
7
10
|
"""
|
8
11
|
Convert layout detections to Region objects.
|
9
|
-
|
12
|
+
|
10
13
|
Args:
|
11
14
|
page: Page object to create regions for
|
12
15
|
detections: List of detection dictionaries
|
13
16
|
scale_factor: Factor to scale coordinates from image to PDF space
|
14
|
-
|
17
|
+
|
15
18
|
Returns:
|
16
19
|
List of Region objects with layout metadata
|
17
20
|
"""
|
18
21
|
conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
|
19
|
-
conversion_logger.debug(
|
22
|
+
conversion_logger.debug(
|
23
|
+
f"Converting {len(detections)} detections to regions with scale {scale_factor}"
|
24
|
+
)
|
20
25
|
regions = []
|
21
|
-
|
26
|
+
|
22
27
|
for det in detections:
|
23
28
|
# Extract detection info
|
24
|
-
x_min, y_min, x_max, y_max = det[
|
25
|
-
|
29
|
+
x_min, y_min, x_max, y_max = det["bbox"]
|
30
|
+
|
26
31
|
# Ensure coordinates are in proper order (min values are smaller)
|
27
32
|
if x_min > x_max:
|
28
33
|
x_min, x_max = x_max, x_min
|
29
34
|
if y_min > y_max:
|
30
35
|
y_min, y_max = y_max, y_min
|
31
|
-
|
36
|
+
|
32
37
|
# Scale coordinates from image to PDF space
|
33
38
|
if scale_factor != 1.0:
|
34
39
|
x_min *= scale_factor
|
35
40
|
y_min *= scale_factor
|
36
41
|
x_max *= scale_factor
|
37
42
|
y_max *= scale_factor
|
38
|
-
|
43
|
+
|
39
44
|
# Create region with metadata
|
40
45
|
region = Region(page, (x_min, y_min, x_max, y_max))
|
41
|
-
region.region_type = det[
|
42
|
-
region.confidence = det[
|
43
|
-
region.normalized_type = det[
|
44
|
-
|
46
|
+
region.region_type = det["class"]
|
47
|
+
region.confidence = det["confidence"]
|
48
|
+
region.normalized_type = det["normalized_class"]
|
49
|
+
|
45
50
|
# Add source info - important for filtering
|
46
|
-
region.source = det.get(
|
47
|
-
region.model = det.get(
|
48
|
-
|
51
|
+
region.source = det.get("source", "detected")
|
52
|
+
region.model = det.get("model", "unknown")
|
53
|
+
|
49
54
|
# Add additional metadata if available
|
50
55
|
for key, value in det.items():
|
51
|
-
if key not in (
|
56
|
+
if key not in ("bbox", "class", "confidence", "normalized_class", "source", "model"):
|
52
57
|
setattr(region, key, value)
|
53
|
-
|
58
|
+
|
54
59
|
regions.append(region)
|
55
|
-
|
56
|
-
conversion_logger.debug(
|
57
|
-
|
60
|
+
|
61
|
+
conversion_logger.debug(
|
62
|
+
f"Created {len(regions)} region objects from {len(detections)} detections"
|
63
|
+
)
|
64
|
+
return regions
|