natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,354 @@
1
+ """
2
+ CSS-like selector parser for natural-pdf.
3
+ """
4
+ import re
5
+ import ast
6
+ from typing import Dict, Any, List, Optional, Union, Tuple
7
+ from colour import Color
8
+
9
+
10
def safe_parse_value(value_str: str) -> Any:
    """
    Safely parse a value string without using eval().

    A value wrapped in a matching pair of single or double quotes is returned
    with the quotes removed and is not interpreted any further. Anything else
    is handed to ``ast.literal_eval``; if that rejects the text, the stripped
    text itself is returned unchanged.

    Args:
        value_str: String representation of a value (number, tuple, string, etc.)

    Returns:
        Parsed value, or the stripped input string when it is not a Python literal
    """
    value_str = value_str.strip()

    # Require at least two characters so that a lone quote character is not
    # treated as an empty quoted string.
    if (
        len(value_str) >= 2
        and value_str[0] in ('"', "'")
        and value_str[-1] == value_str[0]
    ):
        return value_str[1:-1]

    # Try parsing as a Python literal (numbers, tuples, lists)
    try:
        return ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input (for example
        # TypeError for an unhashable dict key); treat all of them as
        # "not a literal" and return the text as is.
        return value_str
32
+
33
+
34
def safe_parse_color(value_str: str) -> tuple:
    """
    Parse a color value which could be an RGB tuple, color name, or hex code.

    The text is first tried as a Python literal. A list or tuple with at least
    three items yields its first three items. A quoted string literal (e.g.
    ``'red'``) or text that is not a literal at all (e.g. ``red``, ``#ff0000``)
    is passed to ``Color``. Everything else, including colors ``Color`` does
    not recognise, falls back to black.

    Args:
        value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")

    Returns:
        RGB tuple (r, g, b); (0, 0, 0) when the value cannot be interpreted
    """
    value_str = value_str.strip()

    try:
        literal = ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the raw text as a color name or hex code
        color_spec = value_str
    else:
        if isinstance(literal, (list, tuple)) and len(literal) >= 3:
            # Return just the first three components as a tuple
            return tuple(literal[:3])
        if not isinstance(literal, str):
            # A literal that is neither a usable sequence nor a string
            # (a bare number, a too-short tuple, ...) has no color meaning.
            return (0, 0, 0)
        # Quoted color name or hex code: look it up without the quotes
        color_spec = literal

    try:
        color = Color(color_spec)
        return (color.red, color.green, color.blue)
    except (ValueError, AttributeError):
        # If color parsing fails, return a default (black)
        return (0, 0, 0)
66
+
67
+
68
# Identifier used for element types, attribute names and pseudo-class names.
# Hyphens are allowed so that names such as "section-header", "starts-with"
# and "ends-with" are read as a single identifier.
_SELECTOR_IDENT = r'[a-zA-Z_][a-zA-Z0-9_-]*'

_SELECTOR_TYPE_RE = re.compile(r'^(' + _SELECTOR_IDENT + r')')

# One token is either an attribute filter "[name<op>value]" or a pseudo-class
# ":name" with optional "(args)". Scanning both with a single pattern means
# text already consumed by one token is never re-read as another, e.g. a colon
# inside an attribute value is not mistaken for a pseudo-class.
_SELECTOR_TOKEN_RE = re.compile(
    r'\[\s*(?P<attr>' + _SELECTOR_IDENT + r')\s*'
    r'(?P<op>>=|<=|>|<|[*~]?=)'
    r'(?P<value>[^\]]+)\]'
    r'|'
    r':(?P<pseudo>' + _SELECTOR_IDENT + r')'
    r'(?:\((?P<args>'
    r'\s*"[^"]*"\s*'                    # double-quoted text, may contain ')'
    r"|\s*'[^']*'\s*"                   # single-quoted text, may contain ')'
    r'|[^()]*(?:\([^()]*\)[^()]*)*'     # one level of nested parentheses
    r'|[^)]+'                           # fallback: everything up to first ')'
    r')\))?'
)

# Attribute names whose values are parsed as colors
_SELECTOR_COLOR_ATTRIBUTES = (
    'color', 'non_stroking_color', 'fill', 'stroke', 'strokeColor', 'fillColor',
)

# Pseudo-class names whose arguments are parsed as colors
_SELECTOR_COLOR_PSEUDO_CLASSES = ('color', 'background')


def parse_selector(selector: str) -> Dict[str, Any]:
    """
    Parse a CSS-like selector string into a structured selector object.

    Examples:
        - 'text:contains("Revenue")'
        - 'table:below("Financial Data")'
        - 'rect[fill=(1,0,0)]'

    Args:
        selector: CSS-like selector string

    Returns:
        Dict representing the parsed selector with the keys:
            - 'type': lower-cased element type, 'any' if none was given
            - 'filters': always an empty list
            - 'attributes': maps attribute name to {'op': ..., 'value': ...}
            - 'pseudo_classes': list of {'name': ..., 'args': ...} in the
              order they appear; 'args' is '' when none were given
    """
    # Basic structure for result
    result = {
        'type': 'any',  # Default to any element type
        'filters': [],
        'attributes': {},
        'pseudo_classes': [],
    }

    # Check if empty or None
    if not selector or not isinstance(selector, str):
        return result

    # Surrounding whitespace would otherwise hide the leading element type
    selector = selector.strip()

    # Parse element type
    type_match = _SELECTOR_TYPE_RE.match(selector)
    if type_match:
        result['type'] = type_match.group(1).lower()
        selector = selector[type_match.end():]

    # Walk the attribute and pseudo-class tokens left to right
    for token in _SELECTOR_TOKEN_RE.finditer(selector):
        attr_name = token.group('attr')
        if attr_name is not None:
            raw_value = token.group('value')
            if attr_name in _SELECTOR_COLOR_ATTRIBUTES:
                value = safe_parse_color(raw_value)
            else:
                value = safe_parse_value(raw_value)

            # Store attribute with operator; a repeated name overwrites
            result['attributes'][attr_name] = {
                'op': token.group('op'),
                'value': value,
            }
            continue

        pseudo_name = token.group('pseudo')
        raw_args = token.group('args') or ''

        # Arguments are only interpreted when present
        processed_args = raw_args
        if raw_args:
            if pseudo_name in _SELECTOR_COLOR_PSEUDO_CLASSES:
                processed_args = safe_parse_color(raw_args)
            else:
                processed_args = safe_parse_value(raw_args)

        result['pseudo_classes'].append({
            'name': pseudo_name,
            'args': processed_args,
        })

    return result
136
+
137
+
138
# Ordering operators; they only ever match when both operands are numbers.
_NUMERIC_COMPARATORS = {
    '>=': lambda left, right: left >= right,
    '<=': lambda left, right: left <= right,
    '>': lambda left, right: left > right,
    '<': lambda left, right: left < right,
}

# Pseudo-classes that simply require a truthy attribute on the element
_FLAG_PSEUDO_CLASSES = {
    'bold': 'bold',
    'italic': 'italic',
    'horizontal': 'is_horizontal',
    'vertical': 'is_vertical',
}

_TEXT_PSEUDO_CLASSES = ('contains', 'starts-with', 'ends-with')


def _element_matches_type(element, selector_type) -> bool:
    """Return True if the element satisfies the selector's element type."""
    if selector_type == 'any':
        return True
    if selector_type == 'text':
        # 'text' matches 'text', 'char' and 'word' elements
        return getattr(element, 'type', None) in ('text', 'char', 'word')
    if selector_type == 'region':
        # Any element exposing region_type counts; a 'type' attribute filter,
        # if present, is applied with the other attributes.
        return hasattr(element, 'region_type')
    if getattr(element, 'normalized_type', None) == selector_type:
        # Direct match with a normalized region type (section-header, etc.)
        return True
    # Otherwise, require exact match with the element's type attribute
    return getattr(element, 'type', None) == selector_type


def _element_attribute_value(element, name, selector_type):
    """Look up the element value that an attribute filter is compared against."""
    if selector_type == 'region' and name == 'type':
        # Use normalized_type for comparison if available
        normalized = getattr(element, 'normalized_type', None)
        if normalized:
            return normalized
        # Convert spaces to hyphens for consistency with the normalized format
        region_type = getattr(element, 'region_type', '') or ''
        return region_type.lower().replace(' ', '-')
    # Convert hyphenated attribute names to underscore for Python properties
    return getattr(element, name.replace('-', '_'), None)


def _attribute_matches(element_value, op, value) -> bool:
    """Apply one attribute operator to an element value that is not None."""
    if op == '=':
        return not (element_value != value)
    if op == '~=':
        # Approximate match (e.g., for colors)
        return _is_approximate_match(element_value, value)
    if op == '*=':
        # Case-sensitive substring match; only string attributes can match
        return isinstance(element_value, str) and str(value) in element_value
    compare = _NUMERIC_COMPARATORS.get(op)
    if compare is not None:
        return (
            isinstance(element_value, (int, float))
            and isinstance(value, (int, float))
            and compare(element_value, value)
        )
    # Unrecognised operators impose no constraint
    return True


def _text_contains(text, needle, use_regex, ignore_case) -> bool:
    """Substring or regex search of needle in text."""
    if use_regex:
        try:
            pattern = re.compile(needle, re.IGNORECASE if ignore_case else 0)
        except re.error:
            # If regex is invalid, fall back to literal text search
            pass
        else:
            return pattern.search(text) is not None
    if ignore_case:
        text = text.lower()
        needle = needle.lower()
    return needle in text


def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
    """
    Convert a parsed selector to a filter function.

    The returned function checks, in order, the element type, every attribute
    filter and every pseudo-class, and returns False at the first failure.

    Notes:
        - ``fontname*=`` is a case-insensitive substring match; ``*=`` on any
          other attribute is a case-sensitive substring match on string values.
        - ``:contains``, ``:starts-with`` and ``:ends-with`` are skipped for
          elements that have no ``text`` attribute, and fail for elements whose
          text is empty. Their argument is converted to ``str`` before use.
        - Only ``:contains`` honours the ``regex`` and ``case`` options.
        - Unknown pseudo-classes are ignored here (they may be handled by the
          caller).

    Args:
        selector: Parsed selector dictionary
        **kwargs: Additional filter parameters including:
            - regex: Whether to use regex for text search
            - case: Whether to do case-sensitive text search

    Returns:
        Function that takes an element and returns True if it matches
    """
    use_regex = kwargs.get('regex', False)
    ignore_case = not kwargs.get('case', True)

    def filter_func(element):
        selector_type = selector['type']

        # Check element type
        if not _element_matches_type(element, selector_type):
            return False

        # Check attributes
        for name, attr_info in selector['attributes'].items():
            op = attr_info['op']
            value = attr_info['value']

            # Special case for fontname attribute - allow matching part of the name
            if name == 'fontname' and op == '*=':
                fontname = getattr(element, name, None)
                if fontname is None or str(value).lower() not in str(fontname).lower():
                    return False
                continue

            element_value = _element_attribute_value(element, name, selector_type)
            if element_value is None:
                return False
            if not _attribute_matches(element_value, op, value):
                return False

        # Check pseudo-classes
        for pseudo in selector['pseudo_classes']:
            name = pseudo['name']
            args = pseudo['args']

            if name in _TEXT_PSEUDO_CLASSES:
                if not hasattr(element, 'text'):
                    # Elements without text are not filtered by text pseudo-classes
                    continue
                text = element.text
                if not text:
                    return False
                # Arguments such as 2023 are parsed to non-string values
                needle = args if isinstance(args, str) else str(args)
                if name == 'contains':
                    if not _text_contains(text, needle, use_regex, ignore_case):
                        return False
                elif name == 'starts-with':
                    if not text.startswith(needle):
                        return False
                elif not text.endswith(needle):
                    return False
            elif name in _FLAG_PSEUDO_CLASSES:
                if not getattr(element, _FLAG_PSEUDO_CLASSES[name], False):
                    return False
            # Anything else is unsupported here or handled elsewhere (like :not)

        # If we get here, all checks passed
        return True

    return filter_func
312
+
313
+
314
+ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
315
+ """
316
+ Check if two values approximately match.
317
+
318
+ This is mainly used for color comparisons with some tolerance.
319
+
320
+ Args:
321
+ value1: First value
322
+ value2: Second value
323
+ tolerance: Maximum difference allowed
324
+
325
+ Returns:
326
+ True if the values approximately match
327
+ """
328
+ # Handle string colors by converting them to RGB tuples
329
+ if isinstance(value1, str):
330
+ try:
331
+ value1 = tuple(Color(value1).rgb)
332
+ except:
333
+ pass
334
+
335
+ if isinstance(value2, str):
336
+ try:
337
+ value2 = tuple(Color(value2).rgb)
338
+ except:
339
+ pass
340
+
341
+ # If both are tuples/lists with the same length (e.g., colors)
342
+ if (isinstance(value1, (list, tuple)) and
343
+ isinstance(value2, (list, tuple)) and
344
+ len(value1) == len(value2)):
345
+
346
+ # Check if all components are within tolerance
347
+ return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
348
+
349
+ # If both are numbers
350
+ if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
351
+ return abs(value1 - value2) <= tolerance
352
+
353
+ # Default to exact match for other types
354
+ return value1 == value2
@@ -0,0 +1 @@
1
+ # Templates package