natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,270 @@
1
+ """
2
+ Text structure analyzer for natural-pdf.
3
+ """
4
+ import logging
5
+ import re
6
+ from typing import List, Dict, Any, Optional, Tuple, Union, TYPE_CHECKING
7
+ from collections import defaultdict
8
+
9
+ # Import ElementCollection and TextStyleOptions
10
+ from natural_pdf.elements.collections import ElementCollection
11
+ from natural_pdf.analyzers.text_options import TextStyleOptions
12
+
13
+ if TYPE_CHECKING:
14
+ from natural_pdf.core.page import Page
15
+ from natural_pdf.elements.base import Element
16
+ # Remove ElementCollection from here if imported above
17
+
18
logger = logging.getLogger(__name__)

# Matches a leading font prefix of six uppercase letters followed by "+"
# (e.g. "ABCDEF+"), as commonly found on PDF font names.
FONT_PREFIX_RE = re.compile(r"^[A-Z]{6}\+")

# Keyword -> canonical weight label. Keywords are matched as lowercase
# substrings of font-name tokens; insertion order decides which keyword wins
# when several match. "medium" maps to an empty label on purpose.
FONT_WEIGHTS = {
    "bold": "Bold",
    "black": "Bold",
    "heavy": "Bold",
    "medium": "",
    "light": "Light",
    "thin": "Thin",
}

# Keyword -> canonical style label, matched the same way as FONT_WEIGHTS.
FONT_STYLES = {
    "italic": "Italic",
    "oblique": "Italic",
}
26
+
27
class TextStyleAnalyzer:
    """
    Analyzes and groups text elements by their style properties based on configuration.

    Elements are grouped by the properties named in ``TextStyleOptions.group_by``.
    Every processed element receives three attributes:

    * ``style_label``      - human-readable label shared by all elements of a style
    * ``style_key``        - hashable tuple identifying the style group
    * ``style_properties`` - dict of the extracted properties for that style
    """

    def __init__(self, options: Optional['TextStyleOptions'] = None):
        """
        Initialize the text style analyzer.

        Args:
            options: Configuration options for the analysis. Uses default if None.
        """
        self.options = options or TextStyleOptions()
        logger.debug(f"Initialized TextStyleAnalyzer with options: {self.options}")

    def analyze(self, page: 'Page', options: Optional['TextStyleOptions'] = None) -> 'ElementCollection':
        """
        Analyze text styles on a page, group elements, and add style attributes.

        Args:
            page: The Page object to analyze.
            options: Override the analyzer's default TextStyleOptions for this run.

        Returns:
            ElementCollection containing all processed text elements (typically words)
            with added 'style_label', 'style_key', and 'style_properties' attributes.
            Elements that fail processing are logged and left out of the result.
        """
        current_options = options or self.options
        logger.info(f"Starting text style analysis for page {page.number} with options: {current_options}")

        # Use page.words for better granularity
        text_elements = page.words
        # Fallback if words are somehow empty/not generated
        if not text_elements:
            text_elements = page.find_all('text').elements  # Get list from collection

        # Skip empty pages or pages with no text elements
        if not text_elements:
            logger.warning(f"Page {page.number} has no text elements to analyze.")
            return ElementCollection([])

        # Maps style_key_tuple -> {'label': str, 'properties': dict}
        style_cache: Dict[Tuple, Dict[str, Any]] = {}
        processed_elements: List['Element'] = []

        # Sort once so the position of each property in the style key is stable
        group_by_keys = sorted(current_options.group_by)

        for element in text_elements:
            # Skip elements without necessary attributes (e.g., non-text elements if find_all was used)
            if not hasattr(element, 'text') or not hasattr(element, 'size'):
                logger.debug(f"Skipping element without text/size: {element}")
                continue

            try:
                style_properties = self._extract_style_properties(element, current_options)
                style_key = self._create_style_key(style_properties, group_by_keys)

                if style_key not in style_cache:
                    # The first element seen for a key defines the label and stored properties
                    label = self._generate_style_label(style_properties, current_options, len(style_cache) + 1)
                    style_cache[style_key] = {'label': label, 'properties': style_properties}
                    logger.debug(f"New style detected (Key: {style_key}): Label='{label}', Props={style_properties}")

                # Add attributes to the element
                element.style_label = style_cache[style_key]['label']
                element.style_key = style_key
                # Add the full properties dict for potential detailed inspection
                element.style_properties = style_cache[style_key]['properties']

                processed_elements.append(element)

            except Exception as e:
                # Best effort: one bad element must not abort the whole page
                logger.warning(f"Error processing element {element} for text style: {e}", exc_info=True)

        # Store a summary of the detected styles on the page
        page._text_styles_summary = style_cache
        logger.info(f"Finished text style analysis for page {page.number}. Found {len(style_cache)} unique styles.")

        return ElementCollection(processed_elements)

    def _extract_style_properties(self, element: 'Element', options: 'TextStyleOptions') -> Dict[str, Any]:
        """
        Extract style properties from a text element based on options.

        Args:
            element: Text element.
            options: TextStyleOptions driving the extraction.

        Returns:
            Dictionary with the keys 'size', 'fontname', 'is_bold', 'is_italic'
            and 'color'. 'size', 'fontname' and 'color' are None when the
            element does not provide them (or color is ignored).
        """
        properties: Dict[str, Any] = {}

        # Font size, snapped to the nearest multiple of size_tolerance
        font_size = None
        raw_size = getattr(element, 'size', None)
        if raw_size is not None:
            tolerance = options.size_tolerance
            if tolerance:
                rounding_factor = 1.0 / tolerance
                font_size = round(raw_size * rounding_factor) / rounding_factor
            else:
                # A zero/unset tolerance means "no rounding"; avoids dividing by zero
                font_size = raw_size
        properties['size'] = font_size

        # Font name
        font_name = None
        normalized_font_name = None
        if hasattr(element, 'fontname') and element.fontname is not None:
            font_name = element.fontname
            normalized_font_name = self._normalize_font_name(font_name, options)
        properties['fontname'] = normalized_font_name if options.normalize_fontname else font_name

        # Font characteristics (derived from normalized name if available)
        name_to_check = normalized_font_name or font_name or ""
        name_lower = name_to_check.lower()
        is_bold = ('bold' in name_lower or 'black' in name_lower or 'heavy' in name_lower or name_to_check.endswith('-B'))
        is_italic = ('italic' in name_lower or 'oblique' in name_lower or name_to_check.endswith('-I'))

        properties['is_bold'] = is_bold
        properties['is_italic'] = is_italic

        # Text color
        color = None
        if not options.ignore_color and hasattr(element, 'non_stroking_color') and element.non_stroking_color is not None:
            raw_color = element.non_stroking_color
            # Convert color to a hashable form (tuple)
            if isinstance(raw_color, (list, tuple)):
                try:
                    color = tuple(round(c, 3) for c in raw_color)  # Round color components
                except TypeError:
                    # Non-numeric components: keep them as strings, mirroring the scalar fallback below
                    color = tuple(str(c) for c in raw_color)
            else:
                # Handle simple grayscale or other non-list representations
                try:
                    color = round(float(raw_color), 3)
                except (ValueError, TypeError):
                    color = str(raw_color)  # Fallback to string if cannot convert
            # Name the two most common colors for readable labels
            if color == (0.0, 0.0, 0.0) or color == 0.0:
                color = 'black'
            if color == (1.0, 1.0, 1.0) or color == 1.0:
                color = 'white'
        properties['color'] = color

        return properties

    def _normalize_font_name(self, fontname: str, options: 'TextStyleOptions') -> str:
        """
        Basic normalization of font names.

        Returns ``fontname`` unchanged when ``options.normalize_fontname`` is
        false; otherwise strips a leading prefix such as "ABCDEF+".
        """
        if not options.normalize_fontname:
            return fontname
        # Remove common subset prefixes like "ABCDEF+"
        name = FONT_PREFIX_RE.sub("", fontname)
        return name

    def _parse_font_name(self, normalized_fontname: str) -> Dict[str, str]:
        """
        Attempt to parse family, weight, and style from a font name. Very heuristic.

        The name is split on '-', ',', '_' and ' '. A token containing a
        FONT_WEIGHTS or FONT_STYLES keyword sets the weight and/or style;
        every other non-empty token is concatenated into the family.

        Returns:
            Dict with 'family' ('Unknown' if nothing remains), 'weight' and 'style'.
        """
        if not normalized_fontname:
            return {'family': 'Unknown', 'weight': '', 'style': ''}

        parts = re.split(r'[-,_ ]', normalized_fontname)
        family_parts = []
        weight = ''
        style = ''

        for part in parts:
            if not part:  # Avoid empty strings from multiple delimiters
                continue
            part_lower = part.lower()

            # First matching keyword wins, in dict order
            matched_weight = next((val for key, val in FONT_WEIGHTS.items() if key in part_lower), None)
            matched_style = next((val for key, val in FONT_STYLES.items() if key in part_lower), None)

            # Check weight and style independently so that a combined token
            # such as "BoldItalic" contributes both instead of only the weight.
            if matched_weight is not None:
                weight = matched_weight
            if matched_style is not None:
                style = matched_style

            # If not weight or style, assume it's part of the family name
            if matched_weight is None and matched_style is None:
                family_parts.append(part)

        family = "".join(family_parts) or "Unknown"  # Join remaining parts

        return {'family': family, 'weight': weight, 'style': style}

    def _create_style_key(self, properties: Dict[str, Any], group_by_keys: List[str]) -> Tuple:
        """
        Create a hashable tuple key based on selected properties.

        Args:
            properties: Extracted style properties.
            group_by_keys: Property names to include, in the order they appear in the key.

        Returns:
            Tuple of the selected property values (None for missing properties).
        """
        key_parts = []
        for key in group_by_keys:  # Use the pre-sorted list
            value = properties.get(key)
            # Ensure hashable - colors should already be tuples or basic types
            if isinstance(value, list):
                value = tuple(value)
            key_parts.append(value)
        return tuple(key_parts)

    def _generate_style_label(self, properties: Dict[str, Any], options: 'TextStyleOptions', style_index: int) -> str:
        """
        Generate a style label based on properties and options.

        Args:
            properties: Extracted style properties for the style.
            options: Supplies descriptive_labels, label_prefix and label_format.
            style_index: 1-based index used for the numeric label.

        Returns:
            "<label_prefix> <style_index>" when descriptive labels are disabled
            or formatting fails; otherwise ``options.label_format`` filled with
            size, fontname, is_bold, is_italic, color, color_str, family,
            weight and style (unknown placeholders render as empty strings).
        """
        if not options.descriptive_labels:
            return f"{options.label_prefix} {style_index}"

        try:
            # The property keys are always present but may hold None, so the
            # placeholder defaults have to be applied on None rather than on
            # a missing key; otherwise labels contain the literal "None".
            size = properties.get('size')
            fontname = properties.get('fontname')
            color = properties.get('color')

            font_details = self._parse_font_name(fontname or '')

            label_data = {
                'size': '?' if size is None else size,
                'fontname': fontname or 'Unknown',
                'is_bold': properties.get('is_bold', False),
                'is_italic': properties.get('is_italic', False),
                'color': '' if color is None else color,
                'family': font_details['family'],
                # Use parsed weight/style if available, otherwise fallback to is_bold/is_italic flags
                'weight': font_details['weight'] or ('Bold' if properties.get('is_bold') else ''),
                'style': font_details['style'] or ('Italic' if properties.get('is_italic') else ''),
            }
            # Ensure style has a space separator if both weight and style exist
            if label_data['weight'] and label_data['style']:
                label_data['style'] = " " + label_data['style']

            # Handle color formatting for label
            color_val = label_data['color']
            if isinstance(color_val, tuple):
                color_str = f"rgb{color_val}"  # Basic tuple representation
            elif isinstance(color_val, str):
                color_str = color_val  # Already string ('black', 'white', or fallback)
            else:
                color_str = str(color_val)  # Other types
            label_data['color_str'] = color_str

            # Unknown placeholders in the format string resolve to ''
            label = options.label_format.format_map(defaultdict(str, label_data))
            # Empty placeholders leave runs of spaces behind; collapse them
            return re.sub(r" {2,}", " ", label.strip())

        except Exception as e:
            logger.warning(f"Error generating descriptive label for style {properties}: {e}. Falling back to numeric label.")
            # Fallback to numeric label on error
            return f"{options.label_prefix} {style_index}"
@@ -0,0 +1,57 @@
1
+ import logging
2
+ from typing import List, Dict, Any
3
+ from ..elements.region import Region
4
+
5
def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
                       scale_factor: float = 1.0) -> List[Region]:
    """
    Build Region objects from layout detection dictionaries.

    Each detection must provide 'bbox' (four coordinates), 'class',
    'confidence' and 'normalized_class'. 'source' and 'model' are optional
    and default to 'detected' and 'unknown'. Any other key is copied onto
    the region as an attribute of the same name.

    Args:
        page: Page object to create regions for
        detections: List of detection dictionaries
        scale_factor: Factor to scale coordinates from image to PDF space

    Returns:
        List of Region objects with layout metadata
    """
    conversion_logger = logging.getLogger("natural_pdf.analyzers.layout.convert")
    conversion_logger.debug(f"Converting {len(detections)} detections to regions with scale {scale_factor}")

    # Keys handled explicitly below; everything else is passed through as-is
    reserved_keys = ('bbox', 'class', 'confidence', 'normalized_class', 'source', 'model')
    needs_scaling = scale_factor != 1.0
    regions = []

    for detection in detections:
        left, top, right, bottom = detection['bbox']

        # Normalize so that (left, top) is the smaller corner
        left, right = (right, left) if left > right else (left, right)
        top, bottom = (bottom, top) if top > bottom else (top, bottom)

        # Map image-space coordinates into PDF space
        if needs_scaling:
            left = left * scale_factor
            top = top * scale_factor
            right = right * scale_factor
            bottom = bottom * scale_factor

        region = Region(page, (left, top, right, bottom))
        region.region_type = detection['class']
        region.confidence = detection['confidence']
        region.normalized_type = detection['normalized_class']

        # Source info - important for filtering
        region.source = detection.get('source', 'detected')
        region.model = detection.get('model', 'unknown')

        # Carry over any additional metadata
        for key in detection:
            if key in reserved_keys:
                continue
            setattr(region, key, detection[key])

        regions.append(region)

    conversion_logger.debug(f"Created {len(regions)} region objects from {len(detections)} detections")
    return regions
@@ -0,0 +1,3 @@
1
+ """
2
+ Core classes for Natural PDF.
3
+ """