natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -1,28 +1,31 @@
|
|
1
1
|
"""
|
2
2
|
CSS-like selector parser for natural-pdf.
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
5
|
import ast
|
6
|
-
|
6
|
+
import re
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
8
|
+
|
7
9
|
from colour import Color
|
8
10
|
|
9
11
|
|
10
12
|
def safe_parse_value(value_str: str) -> Any:
|
11
13
|
"""
|
12
14
|
Safely parse a value string without using eval().
|
13
|
-
|
15
|
+
|
14
16
|
Args:
|
15
17
|
value_str: String representation of a value (number, tuple, string, etc.)
|
16
|
-
|
18
|
+
|
17
19
|
Returns:
|
18
20
|
Parsed value
|
19
21
|
"""
|
20
22
|
# Strip quotes first if it's a quoted string
|
21
23
|
value_str = value_str.strip()
|
22
|
-
if (value_str.startswith('"') and value_str.endswith('"')) or
|
23
|
-
|
24
|
+
if (value_str.startswith('"') and value_str.endswith('"')) or (
|
25
|
+
value_str.startswith("'") and value_str.endswith("'")
|
26
|
+
):
|
24
27
|
return value_str[1:-1]
|
25
|
-
|
28
|
+
|
26
29
|
# Try parsing as a Python literal (numbers, tuples, lists)
|
27
30
|
try:
|
28
31
|
return ast.literal_eval(value_str)
|
@@ -34,15 +37,15 @@ def safe_parse_value(value_str: str) -> Any:
|
|
34
37
|
def safe_parse_color(value_str: str) -> tuple:
|
35
38
|
"""
|
36
39
|
Parse a color value which could be an RGB tuple, color name, or hex code.
|
37
|
-
|
40
|
+
|
38
41
|
Args:
|
39
42
|
value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")
|
40
|
-
|
43
|
+
|
41
44
|
Returns:
|
42
45
|
RGB tuple (r, g, b) with values from 0 to 1
|
43
46
|
"""
|
44
47
|
value_str = value_str.strip()
|
45
|
-
|
48
|
+
|
46
49
|
# Try parsing as a Python literal (for RGB tuples)
|
47
50
|
try:
|
48
51
|
# If it's already a valid tuple or list, parse it
|
@@ -60,7 +63,7 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
60
63
|
except (ValueError, AttributeError):
|
61
64
|
# If color parsing fails, return a default (black)
|
62
65
|
return (0, 0, 0)
|
63
|
-
|
66
|
+
|
64
67
|
# If we got here with a non-tuple, return default
|
65
68
|
return (0, 0, 0)
|
66
69
|
|
@@ -68,260 +71,78 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
68
71
|
def parse_selector(selector: str) -> Dict[str, Any]:
|
69
72
|
"""
|
70
73
|
Parse a CSS-like selector string into a structured selector object.
|
71
|
-
|
74
|
+
|
72
75
|
Examples:
|
73
76
|
- 'text:contains("Revenue")'
|
74
77
|
- 'table:below("Financial Data")'
|
75
78
|
- 'rect[fill=(1,0,0)]'
|
76
|
-
|
79
|
+
|
77
80
|
Args:
|
78
81
|
selector: CSS-like selector string
|
79
|
-
|
82
|
+
|
80
83
|
Returns:
|
81
84
|
Dict representing the parsed selector
|
82
85
|
"""
|
83
86
|
# Basic structure for result
|
84
87
|
result = {
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
88
|
+
"type": "any", # Default to any element type
|
89
|
+
"filters": [],
|
90
|
+
"attributes": {},
|
91
|
+
"pseudo_classes": [],
|
89
92
|
}
|
90
|
-
|
93
|
+
|
91
94
|
# Check if empty or None
|
92
95
|
if not selector or not isinstance(selector, str):
|
93
96
|
return result
|
94
|
-
|
97
|
+
|
95
98
|
# Parse element type
|
96
|
-
type_match = re.match(r
|
99
|
+
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
97
100
|
if type_match:
|
98
|
-
result[
|
99
|
-
selector = selector[len(type_match.group(0)):]
|
100
|
-
|
101
|
+
result["type"] = type_match.group(1).lower()
|
102
|
+
selector = selector[len(type_match.group(0)) :]
|
103
|
+
|
101
104
|
# Parse attributes (e.g., [color=(1,0,0)])
|
102
|
-
attr_pattern = r
|
105
|
+
attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
|
103
106
|
attr_matches = re.findall(attr_pattern, selector)
|
104
107
|
for name, op, value in attr_matches:
|
105
108
|
# Handle special parsing for color attributes
|
106
|
-
if name in [
|
109
|
+
if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
|
107
110
|
value = safe_parse_color(value)
|
108
111
|
else:
|
109
112
|
# Safe parsing for other attributes
|
110
113
|
value = safe_parse_value(value)
|
111
|
-
|
114
|
+
|
112
115
|
# Store attribute with operator
|
113
|
-
result[
|
114
|
-
|
115
|
-
'value': value
|
116
|
-
}
|
117
|
-
|
116
|
+
result["attributes"][name] = {"op": op, "value": value}
|
117
|
+
|
118
118
|
# Parse pseudo-classes (e.g., :contains("text"))
|
119
|
-
pseudo_pattern = r
|
119
|
+
pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
|
120
120
|
pseudo_matches = re.findall(pseudo_pattern, selector)
|
121
121
|
for name, args in pseudo_matches:
|
122
122
|
# Process arguments
|
123
123
|
processed_args = args
|
124
124
|
if args:
|
125
|
-
if name in [
|
125
|
+
if name in ["color", "background"]:
|
126
126
|
processed_args = safe_parse_color(args)
|
127
127
|
else:
|
128
128
|
processed_args = safe_parse_value(args)
|
129
|
-
|
130
|
-
result['pseudo_classes'].append({
|
131
|
-
'name': name,
|
132
|
-
'args': processed_args
|
133
|
-
})
|
134
|
-
|
135
|
-
return result
|
136
129
|
|
130
|
+
result["pseudo_classes"].append({"name": name, "args": processed_args})
|
137
131
|
|
138
|
-
|
139
|
-
"""
|
140
|
-
Convert a parsed selector to a filter function.
|
141
|
-
|
142
|
-
Args:
|
143
|
-
selector: Parsed selector dictionary
|
144
|
-
**kwargs: Additional filter parameters including:
|
145
|
-
- regex: Whether to use regex for text search
|
146
|
-
- case: Whether to do case-sensitive text search
|
147
|
-
|
148
|
-
Returns:
|
149
|
-
Function that takes an element and returns True if it matches
|
150
|
-
"""
|
151
|
-
def filter_func(element):
|
152
|
-
# Check element type
|
153
|
-
if selector['type'] != 'any':
|
154
|
-
# Special handling for 'text' type to match both 'text', 'char', and 'word'
|
155
|
-
if selector['type'] == 'text':
|
156
|
-
if element.type not in ['text', 'char', 'word']:
|
157
|
-
return False
|
158
|
-
# Special handling for 'region' type to check for detected layout regions
|
159
|
-
elif selector['type'] == 'region':
|
160
|
-
# Check if this is a Region with region_type property
|
161
|
-
if not hasattr(element, 'region_type'):
|
162
|
-
return False
|
163
|
-
|
164
|
-
# If 'type' attribute specified, it will be checked in the attributes section
|
165
|
-
# Check for Docling-specific types (section-header, etc.)
|
166
|
-
elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
|
167
|
-
# This is a direct match with a Docling region type
|
168
|
-
pass
|
169
|
-
# Otherwise, require exact match with the element's type attribute
|
170
|
-
elif not hasattr(element, 'type') or element.type != selector['type']:
|
171
|
-
return False
|
172
|
-
|
173
|
-
# Check attributes
|
174
|
-
for name, attr_info in selector['attributes'].items():
|
175
|
-
op = attr_info['op']
|
176
|
-
value = attr_info['value']
|
177
|
-
|
178
|
-
# Special case for fontname attribute - allow matching part of the name
|
179
|
-
if name == 'fontname' and op == '*=':
|
180
|
-
element_value = getattr(element, name, None)
|
181
|
-
if element_value is None or value.lower() not in element_value.lower():
|
182
|
-
return False
|
183
|
-
continue
|
184
|
-
|
185
|
-
# Convert hyphenated attribute names to underscore for Python properties
|
186
|
-
python_name = name.replace('-', '_')
|
187
|
-
|
188
|
-
# Special case for region attributes
|
189
|
-
if selector['type'] == 'region':
|
190
|
-
if name == 'type':
|
191
|
-
# Use normalized_type for comparison if available
|
192
|
-
if hasattr(element, 'normalized_type') and element.normalized_type:
|
193
|
-
element_value = element.normalized_type
|
194
|
-
else:
|
195
|
-
# Convert spaces to hyphens for consistency with the normalized format
|
196
|
-
element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
|
197
|
-
elif name == 'model':
|
198
|
-
# Special handling for model attribute in regions
|
199
|
-
element_value = getattr(element, 'model', None)
|
200
|
-
else:
|
201
|
-
# Get the attribute value from the element normally
|
202
|
-
element_value = getattr(element, python_name, None)
|
203
|
-
else:
|
204
|
-
# Get the attribute value from the element normally for non-region elements
|
205
|
-
element_value = getattr(element, python_name, None)
|
206
|
-
|
207
|
-
if element_value is None:
|
208
|
-
return False
|
209
|
-
|
210
|
-
# Apply operator
|
211
|
-
if op == '=':
|
212
|
-
if element_value != value:
|
213
|
-
return False
|
214
|
-
elif op == '~=':
|
215
|
-
# Approximate match (e.g., for colors)
|
216
|
-
if not _is_approximate_match(element_value, value):
|
217
|
-
return False
|
218
|
-
elif op == '>=':
|
219
|
-
# Greater than or equal (element value must be >= specified value)
|
220
|
-
if not (isinstance(element_value, (int, float)) and
|
221
|
-
isinstance(value, (int, float)) and
|
222
|
-
element_value >= value):
|
223
|
-
return False
|
224
|
-
elif op == '<=':
|
225
|
-
# Less than or equal (element value must be <= specified value)
|
226
|
-
if not (isinstance(element_value, (int, float)) and
|
227
|
-
isinstance(value, (int, float)) and
|
228
|
-
element_value <= value):
|
229
|
-
return False
|
230
|
-
elif op == '>':
|
231
|
-
# Greater than (element value must be > specified value)
|
232
|
-
if not (isinstance(element_value, (int, float)) and
|
233
|
-
isinstance(value, (int, float)) and
|
234
|
-
element_value > value):
|
235
|
-
return False
|
236
|
-
elif op == '<':
|
237
|
-
# Less than (element value must be < specified value)
|
238
|
-
if not (isinstance(element_value, (int, float)) and
|
239
|
-
isinstance(value, (int, float)) and
|
240
|
-
element_value < value):
|
241
|
-
return False
|
242
|
-
|
243
|
-
# Check pseudo-classes
|
244
|
-
for pseudo in selector['pseudo_classes']:
|
245
|
-
name = pseudo['name']
|
246
|
-
args = pseudo['args']
|
247
|
-
|
248
|
-
# Handle various pseudo-classes
|
249
|
-
if name == 'contains' and hasattr(element, 'text'):
|
250
|
-
use_regex = kwargs.get('regex', False)
|
251
|
-
ignore_case = not kwargs.get('case', True)
|
252
|
-
|
253
|
-
if use_regex:
|
254
|
-
import re
|
255
|
-
if not element.text:
|
256
|
-
return False
|
257
|
-
try:
|
258
|
-
pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
|
259
|
-
if not pattern.search(element.text):
|
260
|
-
return False
|
261
|
-
except re.error:
|
262
|
-
# If regex is invalid, fall back to literal text search
|
263
|
-
element_text = element.text
|
264
|
-
search_text = args
|
265
|
-
|
266
|
-
if ignore_case:
|
267
|
-
element_text = element_text.lower()
|
268
|
-
search_text = search_text.lower()
|
269
|
-
|
270
|
-
if search_text not in element_text:
|
271
|
-
return False
|
272
|
-
else:
|
273
|
-
# String comparison with case sensitivity option
|
274
|
-
if not element.text:
|
275
|
-
return False
|
276
|
-
|
277
|
-
element_text = element.text
|
278
|
-
search_text = args
|
279
|
-
|
280
|
-
if ignore_case:
|
281
|
-
element_text = element_text.lower()
|
282
|
-
search_text = search_text.lower()
|
283
|
-
|
284
|
-
if search_text not in element_text:
|
285
|
-
return False
|
286
|
-
elif name == 'starts-with' and hasattr(element, 'text'):
|
287
|
-
if not element.text or not element.text.startswith(args):
|
288
|
-
return False
|
289
|
-
elif name == 'ends-with' and hasattr(element, 'text'):
|
290
|
-
if not element.text or not element.text.endswith(args):
|
291
|
-
return False
|
292
|
-
elif name == 'bold':
|
293
|
-
if not (hasattr(element, 'bold') and element.bold):
|
294
|
-
return False
|
295
|
-
elif name == 'italic':
|
296
|
-
if not (hasattr(element, 'italic') and element.italic):
|
297
|
-
return False
|
298
|
-
elif name == 'horizontal':
|
299
|
-
if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
|
300
|
-
return False
|
301
|
-
elif name == 'vertical':
|
302
|
-
if not (hasattr(element, 'is_vertical') and element.is_vertical):
|
303
|
-
return False
|
304
|
-
else:
|
305
|
-
# Potentially unsupported pseudo-class, or one handled elsewhere (like :not)
|
306
|
-
pass
|
307
|
-
|
308
|
-
# If we get here, all checks passed
|
309
|
-
return True
|
310
|
-
|
311
|
-
return filter_func
|
132
|
+
return result
|
312
133
|
|
313
134
|
|
314
135
|
def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
315
136
|
"""
|
316
137
|
Check if two values approximately match.
|
317
|
-
|
138
|
+
|
318
139
|
This is mainly used for color comparisons with some tolerance.
|
319
|
-
|
140
|
+
|
320
141
|
Args:
|
321
142
|
value1: First value
|
322
143
|
value2: Second value
|
323
144
|
tolerance: Maximum difference allowed
|
324
|
-
|
145
|
+
|
325
146
|
Returns:
|
326
147
|
True if the values approximately match
|
327
148
|
"""
|
@@ -331,157 +152,177 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
|
331
152
|
value1 = tuple(Color(value1).rgb)
|
332
153
|
except:
|
333
154
|
pass
|
334
|
-
|
155
|
+
|
335
156
|
if isinstance(value2, str):
|
336
157
|
try:
|
337
158
|
value2 = tuple(Color(value2).rgb)
|
338
159
|
except:
|
339
160
|
pass
|
340
|
-
|
161
|
+
|
341
162
|
# If both are tuples/lists with the same length (e.g., colors)
|
342
|
-
if (
|
343
|
-
isinstance(
|
344
|
-
|
345
|
-
|
163
|
+
if (
|
164
|
+
isinstance(value1, (list, tuple))
|
165
|
+
and isinstance(value2, (list, tuple))
|
166
|
+
and len(value1) == len(value2)
|
167
|
+
):
|
168
|
+
|
346
169
|
# Check if all components are within tolerance
|
347
170
|
return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
|
348
|
-
|
171
|
+
|
349
172
|
# If both are numbers
|
350
173
|
if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
|
351
174
|
return abs(value1 - value2) <= tolerance
|
352
|
-
|
175
|
+
|
353
176
|
# Default to exact match for other types
|
354
177
|
return value1 == value2
|
355
178
|
|
356
179
|
|
357
180
|
PSEUDO_CLASS_FUNCTIONS = {
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
181
|
+
"bold": lambda el: hasattr(el, "bold") and el.bold,
|
182
|
+
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
183
|
+
"first-child": lambda el: hasattr(el, "parent")
|
184
|
+
and el.parent
|
185
|
+
and el.parent.children[0] == el, # Example placeholder
|
186
|
+
"last-child": lambda el: hasattr(el, "parent")
|
187
|
+
and el.parent
|
188
|
+
and el.parent.children[-1] == el, # Example placeholder
|
362
189
|
# Add the new pseudo-classes for negation
|
363
|
-
|
364
|
-
|
190
|
+
"not-bold": lambda el: hasattr(el, "bold") and not el.bold,
|
191
|
+
"not-italic": lambda el: hasattr(el, "italic") and not el.italic,
|
365
192
|
}
|
366
193
|
|
367
194
|
|
368
195
|
def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
369
196
|
"""
|
370
197
|
Convert a parsed selector to a filter function.
|
371
|
-
|
198
|
+
|
372
199
|
Args:
|
373
200
|
selector: Parsed selector dictionary
|
374
201
|
**kwargs: Additional filter parameters including:
|
375
202
|
- regex: Whether to use regex for text search
|
376
203
|
- case: Whether to do case-sensitive text search
|
377
|
-
|
204
|
+
|
378
205
|
Returns:
|
379
206
|
Function that takes an element and returns True if it matches
|
380
207
|
"""
|
208
|
+
|
381
209
|
def filter_func(element):
|
382
210
|
# Check element type
|
383
|
-
if selector[
|
211
|
+
if selector["type"] != "any":
|
384
212
|
# Special handling for 'text' type to match both 'text', 'char', and 'word'
|
385
|
-
if selector[
|
386
|
-
if element.type not in [
|
213
|
+
if selector["type"] == "text":
|
214
|
+
if element.type not in ["text", "char", "word"]:
|
387
215
|
return False
|
388
216
|
# Special handling for 'region' type to check for detected layout regions
|
389
|
-
elif selector[
|
217
|
+
elif selector["type"] == "region":
|
390
218
|
# Check if this is a Region with region_type property
|
391
|
-
if not hasattr(element,
|
219
|
+
if not hasattr(element, "region_type"):
|
392
220
|
return False
|
393
|
-
|
221
|
+
|
394
222
|
# If 'type' attribute specified, it will be checked in the attributes section
|
395
223
|
# Check for Docling-specific types (section-header, etc.)
|
396
|
-
elif
|
224
|
+
elif (
|
225
|
+
hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
|
226
|
+
):
|
397
227
|
# This is a direct match with a Docling region type
|
398
228
|
pass
|
399
229
|
# Otherwise, require exact match with the element's type attribute
|
400
|
-
elif not hasattr(element,
|
230
|
+
elif not hasattr(element, "type") or element.type != selector["type"]:
|
401
231
|
return False
|
402
|
-
|
232
|
+
|
403
233
|
# Check attributes
|
404
|
-
for name, attr_info in selector[
|
405
|
-
op = attr_info[
|
406
|
-
value = attr_info[
|
407
|
-
|
234
|
+
for name, attr_info in selector["attributes"].items():
|
235
|
+
op = attr_info["op"]
|
236
|
+
value = attr_info["value"]
|
237
|
+
|
408
238
|
# Special case for fontname attribute - allow matching part of the name
|
409
|
-
if name ==
|
239
|
+
if name == "fontname" and op == "*=":
|
410
240
|
element_value = getattr(element, name, None)
|
411
241
|
if element_value is None or value.lower() not in element_value.lower():
|
412
242
|
return False
|
413
243
|
continue
|
414
|
-
|
244
|
+
|
415
245
|
# Convert hyphenated attribute names to underscore for Python properties
|
416
|
-
python_name = name.replace(
|
417
|
-
|
246
|
+
python_name = name.replace("-", "_")
|
247
|
+
|
418
248
|
# Special case for region attributes
|
419
|
-
if selector[
|
420
|
-
if name ==
|
249
|
+
if selector["type"] == "region":
|
250
|
+
if name == "type":
|
421
251
|
# Use normalized_type for comparison if available
|
422
|
-
if hasattr(element,
|
252
|
+
if hasattr(element, "normalized_type") and element.normalized_type:
|
423
253
|
element_value = element.normalized_type
|
424
254
|
else:
|
425
255
|
# Convert spaces to hyphens for consistency with the normalized format
|
426
|
-
element_value =
|
427
|
-
|
256
|
+
element_value = (
|
257
|
+
getattr(element, "region_type", "").lower().replace(" ", "_")
|
258
|
+
)
|
259
|
+
elif name == "model":
|
428
260
|
# Special handling for model attribute in regions
|
429
|
-
element_value = getattr(element,
|
261
|
+
element_value = getattr(element, "model", None)
|
430
262
|
else:
|
431
263
|
# Get the attribute value from the element normally
|
432
264
|
element_value = getattr(element, python_name, None)
|
433
265
|
else:
|
434
266
|
# Get the attribute value from the element normally for non-region elements
|
435
267
|
element_value = getattr(element, python_name, None)
|
436
|
-
|
268
|
+
|
437
269
|
if element_value is None:
|
438
270
|
return False
|
439
|
-
|
271
|
+
|
440
272
|
# Apply operator
|
441
|
-
if op ==
|
273
|
+
if op == "=":
|
442
274
|
if element_value != value:
|
443
275
|
return False
|
444
|
-
elif op ==
|
276
|
+
elif op == "~=":
|
445
277
|
# Approximate match (e.g., for colors)
|
446
278
|
if not _is_approximate_match(element_value, value):
|
447
279
|
return False
|
448
|
-
elif op ==
|
280
|
+
elif op == ">=":
|
449
281
|
# Greater than or equal (element value must be >= specified value)
|
450
|
-
if not (
|
451
|
-
|
452
|
-
|
282
|
+
if not (
|
283
|
+
isinstance(element_value, (int, float))
|
284
|
+
and isinstance(value, (int, float))
|
285
|
+
and element_value >= value
|
286
|
+
):
|
453
287
|
return False
|
454
|
-
elif op ==
|
288
|
+
elif op == "<=":
|
455
289
|
# Less than or equal (element value must be <= specified value)
|
456
|
-
if not (
|
457
|
-
|
458
|
-
|
290
|
+
if not (
|
291
|
+
isinstance(element_value, (int, float))
|
292
|
+
and isinstance(value, (int, float))
|
293
|
+
and element_value <= value
|
294
|
+
):
|
459
295
|
return False
|
460
|
-
elif op ==
|
296
|
+
elif op == ">":
|
461
297
|
# Greater than (element value must be > specified value)
|
462
|
-
if not (
|
463
|
-
|
464
|
-
|
298
|
+
if not (
|
299
|
+
isinstance(element_value, (int, float))
|
300
|
+
and isinstance(value, (int, float))
|
301
|
+
and element_value > value
|
302
|
+
):
|
465
303
|
return False
|
466
|
-
elif op ==
|
304
|
+
elif op == "<":
|
467
305
|
# Less than (element value must be < specified value)
|
468
|
-
if not (
|
469
|
-
|
470
|
-
|
306
|
+
if not (
|
307
|
+
isinstance(element_value, (int, float))
|
308
|
+
and isinstance(value, (int, float))
|
309
|
+
and element_value < value
|
310
|
+
):
|
471
311
|
return False
|
472
|
-
|
312
|
+
|
473
313
|
# Check pseudo-classes
|
474
|
-
for pseudo in selector[
|
475
|
-
name = pseudo[
|
476
|
-
args = pseudo[
|
477
|
-
|
314
|
+
for pseudo in selector["pseudo_classes"]:
|
315
|
+
name = pseudo["name"]
|
316
|
+
args = pseudo["args"]
|
317
|
+
|
478
318
|
# Handle various pseudo-classes
|
479
|
-
if name ==
|
480
|
-
use_regex = kwargs.get(
|
481
|
-
ignore_case = not kwargs.get(
|
482
|
-
|
319
|
+
if name == "contains" and hasattr(element, "text"):
|
320
|
+
use_regex = kwargs.get("regex", False)
|
321
|
+
ignore_case = not kwargs.get("case", True)
|
322
|
+
|
483
323
|
if use_regex:
|
484
324
|
import re
|
325
|
+
|
485
326
|
if not element.text:
|
486
327
|
return False
|
487
328
|
try:
|
@@ -492,77 +333,79 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
|
492
333
|
# If regex is invalid, fall back to literal text search
|
493
334
|
element_text = element.text
|
494
335
|
search_text = args
|
495
|
-
|
336
|
+
|
496
337
|
if ignore_case:
|
497
338
|
element_text = element_text.lower()
|
498
339
|
search_text = search_text.lower()
|
499
|
-
|
340
|
+
|
500
341
|
if search_text not in element_text:
|
501
342
|
return False
|
502
343
|
else:
|
503
344
|
# String comparison with case sensitivity option
|
504
345
|
if not element.text:
|
505
346
|
return False
|
506
|
-
|
347
|
+
|
507
348
|
element_text = element.text
|
508
349
|
search_text = args
|
509
|
-
|
350
|
+
|
510
351
|
if ignore_case:
|
511
352
|
element_text = element_text.lower()
|
512
353
|
search_text = search_text.lower()
|
513
|
-
|
354
|
+
|
514
355
|
if search_text not in element_text:
|
515
356
|
return False
|
516
|
-
elif name ==
|
357
|
+
elif name == "starts-with" and hasattr(element, "text"):
|
517
358
|
if not element.text or not element.text.startswith(args):
|
518
359
|
return False
|
519
|
-
elif name ==
|
360
|
+
elif name == "ends-with" and hasattr(element, "text"):
|
520
361
|
if not element.text or not element.text.endswith(args):
|
521
362
|
return False
|
522
|
-
elif name ==
|
523
|
-
if not (hasattr(element,
|
363
|
+
elif name == "bold":
|
364
|
+
if not (hasattr(element, "bold") and element.bold):
|
524
365
|
return False
|
525
|
-
elif name ==
|
526
|
-
if not (hasattr(element,
|
366
|
+
elif name == "italic":
|
367
|
+
if not (hasattr(element, "italic") and element.italic):
|
527
368
|
return False
|
528
|
-
elif name ==
|
529
|
-
if not (hasattr(element,
|
369
|
+
elif name == "horizontal":
|
370
|
+
if not (hasattr(element, "is_horizontal") and element.is_horizontal):
|
530
371
|
return False
|
531
|
-
elif name ==
|
532
|
-
if not (hasattr(element,
|
372
|
+
elif name == "vertical":
|
373
|
+
if not (hasattr(element, "is_vertical") and element.is_vertical):
|
533
374
|
return False
|
534
375
|
else:
|
535
376
|
# Check pseudo-classes (basic ones like :bold, :italic)
|
536
377
|
if name in PSEUDO_CLASS_FUNCTIONS:
|
537
378
|
if not PSEUDO_CLASS_FUNCTIONS[name](element):
|
538
379
|
return False
|
539
|
-
elif name ==
|
540
|
-
if not hasattr(element,
|
380
|
+
elif name == "contains":
|
381
|
+
if not hasattr(element, "text") or not element.text:
|
541
382
|
return False
|
542
383
|
text_to_check = element.text
|
543
384
|
search_term = args
|
544
|
-
if not kwargs.get(
|
385
|
+
if not kwargs.get("case", True): # Check case flag from kwargs
|
545
386
|
text_to_check = text_to_check.lower()
|
546
387
|
search_term = search_term.lower()
|
547
|
-
|
548
|
-
if kwargs.get(
|
388
|
+
|
389
|
+
if kwargs.get("regex", False): # Check regex flag from kwargs
|
549
390
|
try:
|
550
391
|
if not re.search(search_term, text_to_check):
|
551
392
|
return False
|
552
393
|
except re.error as e:
|
553
|
-
|
554
|
-
|
394
|
+
logger.warning(
|
395
|
+
f"Invalid regex in :contains selector '{search_term}': {e}"
|
396
|
+
)
|
397
|
+
return False # Invalid regex cannot match
|
555
398
|
else:
|
556
399
|
if search_term not in text_to_check:
|
557
400
|
return False
|
558
401
|
# Skip complex pseudo-classes like :near, :above here, handled later
|
559
|
-
elif name in (
|
560
|
-
pass
|
402
|
+
elif name in ("above", "below", "near", "left-of", "right-of"):
|
403
|
+
pass # Handled separately after initial filtering
|
561
404
|
else:
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
return True # Element passes all attribute and simple pseudo-class filters
|
405
|
+
# Optionally log unknown pseudo-classes
|
406
|
+
# logger.warning(f"Unknown pseudo-class: {name}")
|
407
|
+
pass
|
567
408
|
|
568
|
-
|
409
|
+
return True # Element passes all attribute and simple pseudo-class filters
|
410
|
+
|
411
|
+
return filter_func
|