natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,360 @@
|
|
1
|
+
"""
|
2
|
+
CSS-like selector parser for natural-pdf.
|
3
|
+
"""
|
4
|
+
import re
|
5
|
+
import ast
|
6
|
+
from typing import Dict, Any, List, Optional, Union, Tuple
|
7
|
+
from colour import Color
|
8
|
+
|
9
|
+
|
10
|
+
def safe_parse_value(value_str: str) -> Any:
    """
    Safely parse a value string without using eval().

    Surrounding whitespace is stripped first. A string wrapped in matching
    single or double quotes is returned with the quotes removed. Anything
    else is handed to ``ast.literal_eval`` so that numbers, tuples, lists,
    etc. become Python objects. Text that is not a valid Python literal is
    returned unchanged (minus the stripped whitespace).

    Args:
        value_str: String representation of a value (number, tuple, string, etc.)

    Returns:
        Parsed value
    """
    value_str = value_str.strip()

    # A quoted string needs both an opening and a closing quote, so a lone
    # quote character must not be mistaken for an empty quoted string.
    if (
        len(value_str) >= 2
        and value_str[0] in ('"', "'")
        and value_str[-1] == value_str[0]
    ):
        return value_str[1:-1]

    try:
        return ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval raises TypeError for things like unhashable dict keys
        # and MemoryError/RecursionError for pathological nesting; in every
        # case the input is simply "not a literal", so return it as is.
        return value_str
|
32
|
+
|
33
|
+
|
34
|
+
def safe_parse_color(value_str: str) -> tuple:
    """
    Parse a color value which could be an RGB tuple, color name, or hex code.

    The text is first tried as a Python literal. A list/tuple with at least
    three components yields its first three components. A quoted string
    (e.g. ``"red"`` or ``'#ff0000'``) is unwrapped and then treated like an
    unquoted color name / hex code, which is parsed with ``Color``. Anything
    that cannot be interpreted as a color yields black.

    Args:
        value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")

    Returns:
        RGB tuple (r, g, b); values produced by ``Color`` are in the 0..1
        range, tuple/list input is returned as given (first three items).
    """
    default_color = (0, 0, 0)
    value_str = value_str.strip()

    try:
        literal = ast.literal_eval(value_str)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the raw text as a color name or hex code.
        color_text = value_str
    else:
        if isinstance(literal, (list, tuple)) and len(literal) >= 3:
            # Keep just the RGB components (drops e.g. an alpha channel).
            return tuple(literal[:3])
        if not isinstance(literal, str):
            # Numbers, short tuples, dicts, ... are not colors.
            return default_color
        # A quoted color name/hex evaluates to a plain string; parse that
        # instead of silently falling back to black.
        color_text = literal.strip()

    try:
        color = Color(color_text)
        return (color.red, color.green, color.blue)
    except (ValueError, AttributeError):
        # Unknown color name or malformed hex code.
        return default_color
|
66
|
+
|
67
|
+
|
68
|
+
def parse_selector(selector: str) -> Dict[str, Any]:
    """
    Parse a CSS-like selector string into a structured selector object.

    Examples:
        - 'text:contains("Revenue")'
        - 'table:below("Financial Data")'
        - 'rect[fill=(1,0,0)]'
        - 'text[x0>100]:starts-with("Total (USD)")'

    Attribute and pseudo-class names may contain letters, digits,
    underscores and hyphens (they must start with a letter or underscore),
    so names such as ``x0``, ``font-size``, ``starts-with`` and
    ``ends-with`` are recognised. A pseudo-class argument that is entirely
    one quoted string or one parenthesised tuple may itself contain ``)``.

    Args:
        selector: CSS-like selector string

    Returns:
        Dict with the keys:
            - 'type': lower-cased element type, or 'any' when none is given
            - 'filters': always an empty list (never populated here)
            - 'attributes': mapping of attribute name -> {'op', 'value'}
            - 'pseudo_classes': list of {'name', 'args'} dicts in source order
    """
    result = {
        'type': 'any',  # Default to any element type
        'filters': [],
        'attributes': {},
        'pseudo_classes': [],
    }

    # Nothing to parse for empty / non-string input
    if not selector or not isinstance(selector, str):
        return result

    # Leading whitespace would otherwise hide the element type from the
    # anchored match below.
    selector = selector.strip()

    # Element type: a leading run of letters/underscores
    type_match = re.match(r'^([a-zA-Z_]+)', selector)
    if type_match:
        result['type'] = type_match.group(1).lower()
        selector = selector[type_match.end():]

    # Identifier used for attribute and pseudo-class names.
    ident = r'[a-zA-Z_][a-zA-Z0-9_-]*'

    # Attribute values that must be interpreted as colors
    color_attributes = (
        'color', 'non_stroking_color', 'fill', 'stroke', 'strokeColor', 'fillColor',
    )

    # Attributes, e.g. [color=(1,0,0)] or [x0>=100]
    attr_pattern = r'\[(' + ident + r')(>=|<=|>|<|[*~]?=)([^\]]+)\]'
    for name, op, raw_value in re.findall(attr_pattern, selector):
        if name in color_attributes:
            value = safe_parse_color(raw_value)
        else:
            value = safe_parse_value(raw_value)

        # A later occurrence of the same attribute name overwrites an earlier one
        result['attributes'][name] = {
            'op': op,
            'value': value,
        }

    # Pseudo-classes, e.g. :contains("text"). The argument alternatives are
    # tried in order: one whole double-quoted string, one whole
    # single-quoted string, one whole parenthesised group (a tuple), and
    # finally the original "everything up to the first ')'" rule.
    pseudo_args = r'''\s*"[^"]*"\s*|\s*'[^']*'\s*|\s*\([^()]*\)\s*|[^)]+'''
    pseudo_pattern = r':(' + ident + r')(?:\((' + pseudo_args + r')\))?'
    for name, args in re.findall(pseudo_pattern, selector):
        processed_args = args  # '' when the pseudo-class has no arguments
        if args:
            if name in ('color', 'background'):
                processed_args = safe_parse_color(args)
            else:
                processed_args = safe_parse_value(args)

        result['pseudo_classes'].append({
            'name': name,
            'args': processed_args,
        })

    return result
|
136
|
+
|
137
|
+
|
138
|
+
def _selector_attribute_value(element, selector_type, name):
    """Return the element value a selector attribute refers to, or None if absent."""
    if selector_type == 'region':
        if name == 'type':
            # Prefer normalized_type when the element has a non-empty one
            normalized = getattr(element, 'normalized_type', None)
            if normalized:
                return normalized
            # Otherwise derive a hyphenated, lower-case form from region_type;
            # a missing or None region_type is treated as an empty string.
            return (getattr(element, 'region_type', '') or '').lower().replace(' ', '-')
        if name == 'model':
            return getattr(element, 'model', None)
    # Hyphenated selector names map to underscore attribute names
    return getattr(element, name.replace('-', '_'), None)


def _selector_attribute_matches(element_value, op, value):
    """Apply a selector comparison operator to an element value."""
    if op == '=':
        # A list and a tuple with the same items never compare equal in
        # Python, so normalise sequences before comparing (e.g. colors).
        if isinstance(element_value, (list, tuple)) and isinstance(value, (list, tuple)):
            return tuple(element_value) == tuple(value)
        return element_value == value
    if op == '~=':
        # Approximate match (e.g., for colors)
        return _is_approximate_match(element_value, value)
    if op == '*=':
        # Case-insensitive substring match, for any attribute
        return str(value).lower() in str(element_value).lower()
    if op in ('>=', '<=', '>', '<'):
        # Ordering comparisons only make sense between two numbers
        if not (isinstance(element_value, (int, float)) and isinstance(value, (int, float))):
            return False
        if op == '>=':
            return element_value >= value
        if op == '<=':
            return element_value <= value
        if op == '>':
            return element_value > value
        return element_value < value
    # Any other operator imposes no constraint
    return True


def _selector_text_contains(text, needle, use_regex, ignore_case):
    """Search ``text`` for ``needle`` as a regex or as a literal substring."""
    if use_regex:
        try:
            pattern = re.compile(needle, re.IGNORECASE if ignore_case else 0)
        except re.error:
            # Invalid regex: fall through to the literal search below
            pass
        else:
            return pattern.search(text) is not None
    if ignore_case:
        text = text.lower()
        needle = needle.lower()
    return needle in text


def _selector_font_style_matches(element, flag_attr, keywords, suffix):
    """Check a font style via the element's flag attribute, else via its fontname."""
    if hasattr(element, flag_attr):
        return bool(getattr(element, flag_attr))
    fontname = getattr(element, 'fontname', None)
    if not isinstance(fontname, str):
        # No usable font name (missing or None): the style cannot be confirmed
        return False
    lowered = fontname.lower()
    return any(keyword in lowered for keyword in keywords) or fontname.endswith(suffix)


def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
    """
    Convert a parsed selector to a filter function.

    The returned function checks, in order, the element type, every
    attribute condition and every supported pseudo-class (``contains``,
    ``starts-with``, ``ends-with``, ``bold``, ``italic``). Other
    pseudo-classes are ignored here. The text pseudo-classes are skipped
    for elements that have no ``text`` attribute.

    Args:
        selector: Parsed selector dictionary
        **kwargs: Additional filter parameters including:
            - regex: Whether to use regex for text search
            - case: Whether to do case-sensitive text search

    Returns:
        Function that takes an element and returns True if it matches
    """
    def filter_func(element):
        selector_type = selector['type']

        # Element type: 'text' also covers 'char' and 'word'; 'region' only
        # requires a region_type attribute; anything else is an exact match.
        if selector_type == 'text':
            if element.type not in ('text', 'char', 'word'):
                return False
        elif selector_type == 'region':
            if not hasattr(element, 'region_type'):
                return False
        elif selector_type != 'any' and element.type != selector_type:
            return False

        # Attributes: every condition must hold, and the attribute must exist
        for name, attr_info in selector['attributes'].items():
            element_value = _selector_attribute_value(element, selector_type, name)
            if element_value is None:
                return False
            if not _selector_attribute_matches(element_value, attr_info['op'], attr_info['value']):
                return False

        # Pseudo-classes
        for pseudo in selector['pseudo_classes']:
            name = pseudo['name']
            args = pseudo['args']

            if name in ('contains', 'starts-with', 'ends-with'):
                if not hasattr(element, 'text'):
                    continue
                text = element.text
                if not text:
                    return False
                # Arguments may have been parsed into numbers/tuples
                # (e.g. :contains(2023)); text searches need a string.
                needle = str(args)
                if name == 'contains':
                    matched = _selector_text_contains(
                        text,
                        needle,
                        kwargs.get('regex', False),
                        not kwargs.get('case', True),
                    )
                elif name == 'starts-with':
                    matched = text.startswith(needle)
                else:
                    matched = text.endswith(needle)
                if not matched:
                    return False
            elif name == 'bold':
                if not _selector_font_style_matches(element, 'bold', ('bold', 'black'), '-B'):
                    return False
            elif name == 'italic':
                if not _selector_font_style_matches(element, 'italic', ('italic', 'oblique'), '-I'):
                    return False
            # Spatial pseudo-classes are handled at a higher level (in _apply_selector)

        # If we get here, all checks passed
        return True

    return filter_func
|
318
|
+
|
319
|
+
|
320
|
+
def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
321
|
+
"""
|
322
|
+
Check if two values approximately match.
|
323
|
+
|
324
|
+
This is mainly used for color comparisons with some tolerance.
|
325
|
+
|
326
|
+
Args:
|
327
|
+
value1: First value
|
328
|
+
value2: Second value
|
329
|
+
tolerance: Maximum difference allowed
|
330
|
+
|
331
|
+
Returns:
|
332
|
+
True if the values approximately match
|
333
|
+
"""
|
334
|
+
# Handle string colors by converting them to RGB tuples
|
335
|
+
if isinstance(value1, str) and (value1.startswith('#') or value1.lower() in Color.COLOR_NAME_TO_RGB):
|
336
|
+
try:
|
337
|
+
value1 = tuple(Color(value1).rgb)
|
338
|
+
except (ValueError, AttributeError):
|
339
|
+
pass
|
340
|
+
|
341
|
+
if isinstance(value2, str) and (value2.startswith('#') or value2.lower() in Color.COLOR_NAME_TO_RGB):
|
342
|
+
try:
|
343
|
+
value2 = tuple(Color(value2).rgb)
|
344
|
+
except (ValueError, AttributeError):
|
345
|
+
pass
|
346
|
+
|
347
|
+
# If both are tuples/lists with the same length (e.g., colors)
|
348
|
+
if (isinstance(value1, (list, tuple)) and
|
349
|
+
isinstance(value2, (list, tuple)) and
|
350
|
+
len(value1) == len(value2)):
|
351
|
+
|
352
|
+
# Check if all components are within tolerance
|
353
|
+
return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
|
354
|
+
|
355
|
+
# If both are numbers
|
356
|
+
if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
|
357
|
+
return abs(value1 - value2) <= tolerance
|
358
|
+
|
359
|
+
# Default to exact match for other types
|
360
|
+
return value1 == value2
|
@@ -0,0 +1 @@
|
|
1
|
+
# Templates package
|