natural-pdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +6 -0
- natural_pdf/analyzers/layout/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +151 -0
- natural_pdf/analyzers/layout/docling.py +247 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
- natural_pdf/analyzers/layout/layout_manager.py +200 -0
- natural_pdf/analyzers/layout/layout_options.py +78 -0
- natural_pdf/analyzers/layout/paddle.py +240 -0
- natural_pdf/analyzers/layout/surya.py +151 -0
- natural_pdf/analyzers/layout/tatr.py +251 -0
- natural_pdf/analyzers/layout/yolo.py +165 -0
- natural_pdf/analyzers/text_options.py +60 -0
- natural_pdf/analyzers/text_structure.py +270 -0
- natural_pdf/analyzers/utils.py +57 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/element_manager.py +457 -0
- natural_pdf/core/highlighting_service.py +698 -0
- natural_pdf/core/page.py +1444 -0
- natural_pdf/core/pdf.py +653 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +761 -0
- natural_pdf/elements/collections.py +1345 -0
- natural_pdf/elements/line.py +140 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1793 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +56 -0
- natural_pdf/ocr/engine.py +104 -0
- natural_pdf/ocr/engine_easyocr.py +179 -0
- natural_pdf/ocr/engine_paddle.py +204 -0
- natural_pdf/ocr/engine_surya.py +171 -0
- natural_pdf/ocr/ocr_manager.py +191 -0
- natural_pdf/ocr/ocr_options.py +114 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +396 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +354 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +3 -0
- natural_pdf/utils/highlighting.py +12 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +223 -0
- natural_pdf/widgets/__init__.py +4 -0
- natural_pdf/widgets/frontend/viewer.js +88 -0
- natural_pdf/widgets/viewer.py +765 -0
- natural_pdf-0.1.0.dist-info/METADATA +295 -0
- natural_pdf-0.1.0.dist-info/RECORD +52 -0
- natural_pdf-0.1.0.dist-info/WHEEL +5 -0
- natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
- natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,270 @@
|
|
1
|
+
"""
|
2
|
+
Text structure analyzer for natural-pdf.
|
3
|
+
"""
|
4
|
+
import logging
|
5
|
+
import re
|
6
|
+
from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
|
7
|
+
from collections import defaultdict
|
8
|
+
|
9
|
+
# Import ElementCollection and TextStyleOptions
|
10
|
+
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
from natural_pdf.analyzers.text_options import TextStyleOptions
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from natural_pdf.core.page import Page
|
15
|
+
from natural_pdf.elements.base import Element
|
16
|
+
# Remove ElementCollection from here if imported above
|
17
|
+
|
18
|
+
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)

# Matches a leading subset-style font prefix such as "ABCDEF+"
# (exactly six uppercase ASCII letters followed by a plus sign).
FONT_PREFIX_RE = re.compile(r"^[A-Z]{6}\+")

# Keyword -> canonical weight name. Keys are matched as lowercase substrings
# of a font-name token; iteration order matters because the first hit wins.
# "medium" deliberately maps to an empty weight.
FONT_WEIGHTS = {
    "bold": "Bold",
    "black": "Bold",
    "heavy": "Bold",
    "medium": "",
    "light": "Light",
    "thin": "Thin",
}

# Keyword -> canonical style name, matched the same way as FONT_WEIGHTS.
FONT_STYLES = {
    "italic": "Italic",
    "oblique": "Italic",
}
|
26
|
+
|
27
|
+
class TextStyleAnalyzer:
    """
    Analyzes and groups text elements by their style properties based on configuration.

    This analyzer groups text elements based on specified font properties
    (controlled by TextStyleOptions) and adds 'style_label', 'style_key',
    and 'style_properties' attributes to each processed text element.
    """

    def __init__(self, options: Optional['TextStyleOptions'] = None):
        """
        Initialize the text style analyzer.

        Args:
            options: Configuration options for the analysis. Uses default if None.
        """
        self.options = options or TextStyleOptions()
        logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")

    def analyze(self, page: 'Page', options: Optional['TextStyleOptions'] = None) -> 'ElementCollection':
        """
        Analyze text styles on a page, group elements, and add style attributes.

        Each element is processed independently: if extracting or labelling its
        style raises, the error is logged and that element is left out of the
        returned collection. The style summary (style key -> label/properties)
        is also stored on the page as ``_text_styles_summary``.

        Args:
            page: The Page object to analyze.
            options: Override the analyzer's default TextStyleOptions for this run.

        Returns:
            ElementCollection containing all processed text elements (typically words)
            with added 'style_label', 'style_key', and 'style_properties' attributes.
        """
        current_options = options or self.options
        logger.info(f"Starting text style analysis for page {page.number} with options: {current_options}")

        # Use page.words for better granularity
        text_elements = page.words
        # Fallback if words are somehow empty/not generated
        if not text_elements:
            text_elements = page.find_all('text').elements  # Get list from collection

        # Skip empty pages or pages with no text elements
        if not text_elements:
            logger.warning(f"Page {page.number} has no text elements to analyze.")
            return ElementCollection([])

        # Maps style_key_tuple -> {'label': str, 'properties': dict}
        style_cache: Dict[Tuple, Dict[str, Any]] = {}
        processed_elements: List['Element'] = []

        # Sort once so the same set of group_by names always yields the same key layout
        group_by_keys = sorted(current_options.group_by)

        for element in text_elements:
            # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
            if not hasattr(element, 'text') or not hasattr(element, 'size'):
                logger.debug(f"Skipping element without text/size: {element}")
                continue

            try:
                style_properties = self._extract_style_properties(element, current_options)
                style_key = self._create_style_key(style_properties, group_by_keys)

                if style_key not in style_cache:
                    # Styles are numbered in order of first appearance, starting at 1
                    label = self._generate_style_label(style_properties, current_options, len(style_cache) + 1)
                    style_cache[style_key] = {'label': label, 'properties': style_properties}
                    logger.debug(f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}")

                cached_style = style_cache[style_key]
                element.style_label = cached_style['label']
                element.style_key = style_key
                # The properties of the FIRST element seen with this key are shared by the group
                element.style_properties = cached_style['properties']

                processed_elements.append(element)

            except Exception as e:
                # Best effort: one bad element must not abort the whole page
                logger.warning(f"Error processing element {element} for text style: {e}", exc_info=True)

        # Store a summary on the page
        page._text_styles_summary = style_cache
        logger.info(f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles.")

        return ElementCollection(processed_elements)

    def _extract_style_properties(self, element: 'Element', options: 'TextStyleOptions') -> Dict[str, Any]:
        """
        Extract style properties from a text element based on options.

        Args:
            element: Text element. Its 'size', 'fontname' and 'non_stroking_color'
                attributes are read when present.
            options: TextStyleOptions driving the extraction ('size_tolerance',
                'normalize_fontname' and 'ignore_color' are read).

        Returns:
            Dictionary with the keys 'size', 'fontname', 'is_bold', 'is_italic'
            and 'color'. Values that cannot be determined are None (or False for
            the two boolean flags).
        """
        properties: Dict[str, Any] = {}

        # Font size, snapped to the nearest multiple of size_tolerance
        font_size = None
        raw_size = getattr(element, 'size', None)
        if raw_size is not None:
            tolerance = options.size_tolerance
            if tolerance and tolerance > 0:
                rounding_factor = 1.0 / tolerance
                font_size = round(raw_size * rounding_factor) / rounding_factor
            else:
                # A zero/negative/None tolerance means "no snapping"; dividing by it
                # would raise (or be meaningless) and drop every element on the page.
                font_size = raw_size
        properties['size'] = font_size

        # Font name
        font_name = getattr(element, 'fontname', None)
        normalized_font_name = None
        if font_name is not None:
            normalized_font_name = self._normalize_font_name(font_name, options)
        properties['fontname'] = normalized_font_name if options.normalize_fontname else font_name

        # Font characteristics (derived from normalized name if available)
        name_to_check = normalized_font_name or font_name or ""
        name_lower = name_to_check.lower()
        properties['is_bold'] = (
            any(keyword in name_lower for keyword in ('bold', 'black', 'heavy'))
            or name_to_check.endswith('-B')
        )
        properties['is_italic'] = (
            any(keyword in name_lower for keyword in ('italic', 'oblique'))
            or name_to_check.endswith('-I')
        )

        # Text color
        color = None
        raw_color = None if options.ignore_color else getattr(element, 'non_stroking_color', None)
        if raw_color is not None:
            # Convert color to a hashable form
            if isinstance(raw_color, (list, tuple)):
                try:
                    color = tuple(round(c, 3) for c in raw_color)  # Round color components
                except TypeError:
                    # Non-numeric component: fall back to a string, like the scalar branch
                    color = str(raw_color)
            else:
                try:
                    color = round(float(raw_color), 3)
                except (ValueError, TypeError):
                    color = str(raw_color)  # Fallback to string if cannot convert
            # Give the two most common colors readable names
            if color == (0.0, 0.0, 0.0) or color == 0.0:
                color = 'black'
            if color == (1.0, 1.0, 1.0) or color == 1.0:
                color = 'white'
        properties['color'] = color

        return properties

    def _normalize_font_name(self, fontname: str, options: 'TextStyleOptions') -> str:
        """
        Basic normalization of font names.

        Returns the name unchanged when options.normalize_fontname is falsy;
        otherwise strips a leading prefix matched by FONT_PREFIX_RE (e.g. "ABCDEF+").
        """
        if not options.normalize_fontname:
            return fontname
        # Remove common subset prefixes like "ABCDEF+"
        return FONT_PREFIX_RE.sub("", fontname)

    def _parse_font_name(self, normalized_fontname: str) -> Dict[str, str]:
        """
        Attempt to parse family, weight, and style from a font name. Very heuristic.

        The name is split on '-', ',', '_' and spaces. A token containing a
        FONT_WEIGHTS keyword sets the weight, a token containing a FONT_STYLES
        keyword sets the style, and every other non-empty token becomes part of
        the family (tokens are concatenated without a separator).

        Returns:
            Dict with 'family' ('Unknown' if nothing is left), 'weight' and 'style'.
        """
        if not normalized_fontname:
            return {'family': 'Unknown', 'weight': '', 'style': ''}

        family_parts = []
        weight = ''
        style = ''

        for part in re.split(r'[-,_ ]', normalized_fontname):
            if not part:  # Avoid empty strings from multiple delimiters
                continue
            part_lower = part.lower()

            # First matching keyword wins, in dict order
            matched_weight = next((val for key, val in FONT_WEIGHTS.items() if key in part_lower), None)
            matched_style = next((val for key, val in FONT_STYLES.items() if key in part_lower), None)

            # Check both tables for every token so fused tokens such as
            # "BoldItalic" contribute a weight AND a style.
            if matched_weight is not None:
                weight = matched_weight
            if matched_style is not None:
                style = matched_style

            # If not weight or style, assume it's part of the family name
            if matched_weight is None and matched_style is None:
                family_parts.append(part)

        family = "".join(family_parts) or "Unknown"
        return {'family': family, 'weight': weight, 'style': style}

    def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
        """
        Create a hashable tuple key based on selected properties.

        Args:
            properties: Style properties as returned by _extract_style_properties.
            group_by_keys: Property names to include, in the order they should
                appear in the key. Names missing from `properties` contribute None.

        Returns:
            Tuple with one entry per name in `group_by_keys`.
        """
        key_parts = []
        for key in group_by_keys:
            value = properties.get(key)
            # Lists are not hashable; should not happen if _extract handled color correctly
            if isinstance(value, list):
                value = tuple(value)
            key_parts.append(value)
        return tuple(key_parts)

    def _generate_style_label(self, properties: Dict[str, Any], options: 'TextStyleOptions', style_index: int) -> str:
        """
        Generate a style label based on properties and options.

        When options.descriptive_labels is falsy, or when building the
        descriptive label fails, the label is "<label_prefix> <style_index>".
        Otherwise options.label_format is filled with the fields 'size',
        'fontname', 'is_bold', 'is_italic', 'color', 'color_str', 'family',
        'weight' and 'style'; unknown placeholders render as empty strings.

        Args:
            properties: Style properties as returned by _extract_style_properties.
            options: TextStyleOptions providing the label settings.
            style_index: 1-based ordinal used for the numeric label.

        Returns:
            The label text with leading/trailing whitespace removed and runs of
            spaces collapsed to a single space.
        """
        if not options.descriptive_labels:
            return f"{options.label_prefix} {style_index}"

        try:
            # The keys normally exist with a None value, so dict.get defaults
            # alone would never apply; substitute the placeholders explicitly.
            size = properties.get('size')
            fontname = properties.get('fontname')
            color_val = properties.get('color')

            font_details = self._parse_font_name(fontname or '')

            label_data = {
                'size': '?' if size is None else size,
                'fontname': 'Unknown' if fontname is None else fontname,
                'is_bold': properties.get('is_bold', False),
                'is_italic': properties.get('is_italic', False),
                'color': '' if color_val is None else color_val,
                'family': font_details['family'],
                # Use parsed weight/style if available, otherwise fallback to is_bold/is_italic flags
                'weight': font_details['weight'] or ('Bold' if properties.get('is_bold') else ''),
                'style': font_details['style'] or ('Italic' if properties.get('is_italic') else ''),
            }
            # Ensure style has a space separator if both weight and style exist
            if label_data['weight'] and label_data['style']:
                label_data['style'] = " " + label_data['style']

            # Handle color formatting for label
            if color_val is None:
                color_str = ''
            elif isinstance(color_val, tuple):
                color_str = f"rgb{color_val}"  # Basic tuple representation
            elif isinstance(color_val, str):
                color_str = color_val  # Already string ('black', 'white', or fallback)
            else:
                color_str = str(color_val)  # Other types
            label_data['color_str'] = color_str

            # defaultdict(str) makes unknown placeholders in the format string render as ''
            label = options.label_format.format_map(defaultdict(str, label_data))
            # Empty fields leave gaps behind; collapse any run of spaces to one
            return re.sub(r" {2,}", " ", label.strip())

        except Exception as e:
            logger.warning(f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label.")
            # Fallback to numeric label on error
            return f"{options.label_prefix} {style_index}"
|
@@ -0,0 +1,57 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import List, Dict, Any
|
3
|
+
from ..elements.region import Region
|
4
|
+
|
5
|
+
def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
                       scale_factor: float = 1.0) -> List["Region"]:
    """
    Build one Region per layout detection.

    Every detection dict must provide 'bbox' (four values), 'class',
    'confidence' and 'normalized_class'; a missing one raises KeyError.
    'source' and 'model' are optional and default to 'detected' and
    'unknown'. Any other key in the dict is copied onto the region as an
    attribute of the same name.

    Args:
        page: Page object the regions belong to.
        detections: Detection dictionaries as described above.
        scale_factor: Multiplier applied to all four bbox coordinates
            (image space -> PDF space). A value of 1.0 leaves them untouched.

    Returns:
        List of Region objects, in the same order as `detections`.
    """
    log = logging.getLogger("natural_pdf.analyzers.layout.convert")
    log.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")

    # Keys that get dedicated handling and are therefore not copied verbatim
    handled_keys = ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model')
    needs_scaling = scale_factor != 1.0
    regions = []

    for detection in detections:
        left, top, right, bottom = detection['bbox']

        # Put each coordinate pair in ascending order
        if left > right:
            left, right = right, left
        if top > bottom:
            top, bottom = bottom, top

        # Map from image coordinates into PDF coordinates
        if needs_scaling:
            left = left * scale_factor
            top = top * scale_factor
            right = right * scale_factor
            bottom = bottom * scale_factor

        region = Region(page, (left, top, right, bottom))

        # Required metadata
        region.region_type = detection['class']
        region.confidence = detection['confidence']
        region.normalized_type = detection['normalized_class']

        # Provenance, used for filtering
        region.source = detection.get('source', 'detected')
        region.model = detection.get('model', 'unknown')

        # Everything else the detector reported is attached as-is
        for extra_key, extra_value in detection.items():
            if extra_key in handled_keys:
                continue
            setattr(region, extra_key, extra_value)

        regions.append(region)

    log.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
    return regions
|