natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,360 @@
1
+ """
2
+ CSS-like selector parser for natural-pdf.
3
+ """
4
+ import re
5
+ import ast
6
+ from typing import Dict, Any, List, Optional, Union, Tuple
7
+ from colour import Color
8
+
9
+
def safe_parse_value(value_str: str) -> Any:
    """
    Safely parse a value string without using eval().

    Args:
        value_str: String representation of a value (number, tuple, string, etc.)

    Returns:
        - The text between the quotes when value_str is wrapped in a matching
          pair of single or double quotes.
        - The Python literal it denotes when ast.literal_eval accepts it
          (numbers, tuples, lists, ...).
        - Otherwise the stripped input string, unchanged.
    """
    value_str = value_str.strip()

    # A quoted string needs an opening AND a closing quote, so require at
    # least two characters; a lone quote character is not a quoted string and
    # must not collapse to ''.
    if len(value_str) >= 2 and value_str[0] in ('"', "'") and value_str[-1] == value_str[0]:
        return value_str[1:-1]

    # Try parsing as a Python literal (numbers, tuples, lists)
    try:
        return ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval also raises TypeError for things like an unhashable
        # dict key, and Memory/RecursionError for pathological input.
        # None of these should abort selector parsing: keep the raw text.
        return value_str
32
+
33
+
def safe_parse_color(value_str: str) -> tuple:
    """
    Parse a color value which could be an RGB tuple, color name, or hex code.

    Args:
        value_str: String representation of a color
            (e.g., "red", "#ff0000", "(1,0,0)", or a quoted name such as '"red"')

    Returns:
        RGB tuple (r, g, b). For tuple/list input the first three components
        are returned as given; for names and hex codes the components come
        from the colour library. Black (0, 0, 0) is returned when the value
        cannot be interpreted as a color.
    """
    default = (0, 0, 0)
    value_str = value_str.strip()

    # First see whether the text is a Python literal. Bare names ("red") and
    # hex codes ("#ff0000") are not literals and are kept as raw text.
    try:
        parsed = ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        parsed = value_str

    # RGB(A) sequence: keep just the RGB components.
    if isinstance(parsed, (list, tuple)):
        return tuple(parsed[:3]) if len(parsed) >= 3 else default

    # A quoted color name/hex evaluates to a plain string; it must go through
    # the color-name lookup rather than silently turning into black.
    if not isinstance(parsed, str):
        return default
    color_text = parsed.strip()
    if not color_text:
        return default

    try:
        # Use colour library to parse color names, hex values, etc.
        color = Color(color_text)
        return (color.red, color.green, color.blue)
    except (ValueError, AttributeError):
        # Unrecognised name or malformed hex code: best-effort default.
        return default
66
+
67
+
def parse_selector(selector: str) -> Dict[str, Any]:
    """
    Parse a CSS-like selector string into a structured selector object.

    Examples:
        - 'text:contains("Revenue")'
        - 'table:below("Financial Data")'
        - 'rect[fill=(1,0,0)]'
        - 'text[font-size>=12]:starts-with("Total")'

    Args:
        selector: CSS-like selector string

    Returns:
        Dict with the keys:
            'type': lower-cased element type, or 'any' when none is given
            'filters': always an empty list here
            'attributes': {name: {'op': operator, 'value': parsed value}}
            'pseudo_classes': list of {'name': ..., 'args': ...} in the order
                they appear; 'args' is '' for a pseudo-class without
                parentheses
        An empty, None or non-string selector yields the default structure.
    """
    # Basic structure for result
    result = {
        'type': 'any',  # Default to any element type
        'filters': [],
        'attributes': {},
        'pseudo_classes': [],
    }

    # Check if empty or None
    if not selector or not isinstance(selector, str):
        return result

    selector = selector.strip()

    # Parse element type
    type_match = re.match(r'^([a-zA-Z_]+)', selector)
    if type_match:
        result['type'] = type_match.group(1).lower()
        selector = selector[type_match.end():]

    # Names may contain hyphens and digits so that ':starts-with',
    # '[font-size>=12]' and '[x0>100]' are recognised in full instead of being
    # truncated or skipped.
    identifier = r'[a-zA-Z_][a-zA-Z0-9_-]*'
    attr_re = re.compile(r'\[(' + identifier + r')(>=|<=|>|<|[*~]?=)([^\]]+)\]')
    # NOTE: arguments end at the first ')', so a ')' inside the argument text
    # is not supported.
    pseudo_re = re.compile(r':(' + identifier + r')(?:\(([^)]+)\))?')

    color_attributes = {
        'color', 'non_stroking_color', 'fill', 'stroke', 'strokeColor', 'fillColor',
    }
    color_pseudo_classes = {'color', 'background'}

    # Scan left to right and consume each attribute / pseudo-class as a whole.
    # Running two independent searches over the full string would let a ':'
    # inside an attribute value be mistaken for a pseudo-class, or a '[a=b]'
    # inside pseudo-class arguments be mistaken for an attribute.
    position = 0
    length = len(selector)
    while position < length:
        char = selector[position]
        match = None

        if char == '[':
            match = attr_re.match(selector, position)
            if match:
                name, op, raw_value = match.groups()
                if name in color_attributes:
                    value = safe_parse_color(raw_value)
                else:
                    value = safe_parse_value(raw_value)
                # Store attribute with operator
                result['attributes'][name] = {
                    'op': op,
                    'value': value
                }
        elif char == ':':
            match = pseudo_re.match(selector, position)
            if match:
                name = match.group(1)
                # No parentheses -> empty string, not None
                args = match.group(2) or ''
                processed_args = args
                if args:
                    if name in color_pseudo_classes:
                        processed_args = safe_parse_color(args)
                    else:
                        processed_args = safe_parse_value(args)
                result['pseudo_classes'].append({
                    'name': name,
                    'args': processed_args
                })

        # Skip a single character when nothing recognisable starts here.
        position = match.end() if match else position + 1

    return result
136
+
137
+
def _selector_text_contains(element_text, needle, use_regex, ignore_case):
    """Return True if needle occurs in element_text (regex or literal search)."""
    if not element_text:
        return False

    # Arguments parsed from the selector may be numbers (':contains(2023)');
    # text search always works on their string form.
    needle = str(needle)

    if use_regex:
        try:
            flags = re.IGNORECASE if ignore_case else 0
            return re.search(needle, element_text, flags) is not None
        except re.error:
            # If regex is invalid, fall back to literal text search
            pass

    if ignore_case:
        element_text = element_text.lower()
        needle = needle.lower()
    return needle in element_text


def _selector_font_style_matches(element, flag_name, name_hints, suffix):
    """Check a font style via the element's flag, falling back to its font name."""
    # Prefer the element's own property (e.g. element.bold) if it exists
    if hasattr(element, flag_name):
        return bool(getattr(element, flag_name))

    # Otherwise guess from the font name; a missing/None name cannot match
    fontname = getattr(element, 'fontname', None)
    if not isinstance(fontname, str):
        return False
    font_lower = fontname.lower()
    return any(hint in font_lower for hint in name_hints) or fontname.endswith(suffix)


def _selector_attribute_matches(name, op, element_value, value):
    """Apply one attribute operator to an element value and a selector value."""
    if op == '=':
        # Colors may be stored as a list on the element while the selector
        # always carries a tuple; compare sequences by content.
        if isinstance(element_value, (list, tuple)) and isinstance(value, (list, tuple)):
            return tuple(element_value) == tuple(value)
        return not (element_value != value)

    if op == '~=':
        # Approximate match (e.g., for colors)
        return _is_approximate_match(element_value, value)

    if op == '*=':
        # Substring match. Font names are compared case-insensitively,
        # everything else case-sensitively.
        haystack = str(element_value)
        needle = str(value)
        if name == 'fontname':
            haystack = haystack.lower()
            needle = needle.lower()
        return needle in haystack

    if op in ('>=', '<=', '>', '<'):
        # Ordering operators only apply when both sides are numbers
        if not (isinstance(element_value, (int, float)) and isinstance(value, (int, float))):
            return False
        if op == '>=':
            return element_value >= value
        if op == '<=':
            return element_value <= value
        if op == '>':
            return element_value > value
        return element_value < value

    # Unknown operators impose no constraint
    return True


def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
    """
    Convert a parsed selector to a filter function.

    Args:
        selector: Parsed selector dictionary with the keys 'type',
            'attributes' ({name: {'op': ..., 'value': ...}}) and
            'pseudo_classes' (list of {'name': ..., 'args': ...})
        **kwargs: Additional filter parameters including:
            - regex: Whether to use regex for text search (default False)
            - case: Whether to do case-sensitive text search (default True)

    Returns:
        Function that takes an element and returns True if it matches.

        Matching rules:
            - Type 'text' accepts element types 'text', 'char' and 'word';
              type 'region' accepts any element with a region_type attribute;
              any other type must equal element.type exactly.
            - An attribute whose value on the element is None never matches.
            - Supported attribute operators: =, ~=, *=, >=, <=, >, <.
            - Supported pseudo-classes: contains, starts-with, ends-with,
              bold, italic. Text pseudo-classes are skipped for elements
              without a text attribute; unknown pseudo-classes are ignored
              here.
    """
    use_regex = kwargs.get('regex', False)
    ignore_case = not kwargs.get('case', True)

    def filter_func(element):
        selector_type = selector['type']

        # Check element type
        if selector_type != 'any':
            if selector_type == 'text':
                # 'text' matches 'text', 'char', and 'word' elements
                if getattr(element, 'type', None) not in ['text', 'char', 'word']:
                    return False
            elif selector_type == 'region':
                # Detected layout regions are recognised by their region_type;
                # a specific type, if requested, is checked as an attribute.
                if not hasattr(element, 'region_type'):
                    return False
            elif getattr(element, 'type', None) != selector_type:
                return False

        # Check attributes
        for name, attr_info in selector['attributes'].items():
            op = attr_info['op']
            value = attr_info['value']

            # Convert hyphenated attribute names to underscore for Python properties
            python_name = name.replace('-', '_')

            if selector_type == 'region' and name == 'type':
                # Use normalized_type for comparison if available
                normalized = getattr(element, 'normalized_type', None)
                if normalized:
                    element_value = normalized
                else:
                    # Convert spaces to hyphens for consistency with the
                    # normalized format; tolerate a None region_type.
                    region_type = getattr(element, 'region_type', None) or ''
                    element_value = region_type.lower().replace(' ', '-')
            else:
                element_value = getattr(element, python_name, None)

            if element_value is None:
                return False

            if not _selector_attribute_matches(name, op, element_value, value):
                return False

        # Check pseudo-classes
        for pseudo in selector['pseudo_classes']:
            name = pseudo['name']
            args = pseudo['args']

            if name in ('contains', 'starts-with', 'ends-with'):
                # Elements without text are not constrained by text pseudo-classes
                if not hasattr(element, 'text'):
                    continue
                text = element.text
                if name == 'contains':
                    if not _selector_text_contains(text, args, use_regex, ignore_case):
                        return False
                elif name == 'starts-with':
                    if not text or not text.startswith(str(args)):
                        return False
                else:
                    if not text or not text.endswith(str(args)):
                        return False
            elif name == 'bold':
                if not _selector_font_style_matches(element, 'bold', ('bold', 'black'), '-B'):
                    return False
            elif name == 'italic':
                if not _selector_font_style_matches(element, 'italic', ('italic', 'oblique'), '-I'):
                    return False
            # Other pseudo-classes (e.g. spatial ones) are not evaluated here.

        # If we get here, all checks passed
        return True

    return filter_func
318
+
319
+
320
+ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
321
+ """
322
+ Check if two values approximately match.
323
+
324
+ This is mainly used for color comparisons with some tolerance.
325
+
326
+ Args:
327
+ value1: First value
328
+ value2: Second value
329
+ tolerance: Maximum difference allowed
330
+
331
+ Returns:
332
+ True if the values approximately match
333
+ """
334
+ # Handle string colors by converting them to RGB tuples
335
+ if isinstance(value1, str) and (value1.startswith('#') or value1.lower() in Color.COLOR_NAME_TO_RGB):
336
+ try:
337
+ value1 = tuple(Color(value1).rgb)
338
+ except (ValueError, AttributeError):
339
+ pass
340
+
341
+ if isinstance(value2, str) and (value2.startswith('#') or value2.lower() in Color.COLOR_NAME_TO_RGB):
342
+ try:
343
+ value2 = tuple(Color(value2).rgb)
344
+ except (ValueError, AttributeError):
345
+ pass
346
+
347
+ # If both are tuples/lists with the same length (e.g., colors)
348
+ if (isinstance(value1, (list, tuple)) and
349
+ isinstance(value2, (list, tuple)) and
350
+ len(value1) == len(value2)):
351
+
352
+ # Check if all components are within tolerance
353
+ return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
354
+
355
+ # If both are numbers
356
+ if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
357
+ return abs(value1 - value2) <= tolerance
358
+
359
+ # Default to exact match for other types
360
+ return value1 == value2
@@ -0,0 +1 @@
1
+ # Templates package