natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -3,11 +3,11 @@ CSS-like selector parser for natural-pdf.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import ast
|
6
|
+
import logging
|
6
7
|
import re
|
7
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
8
9
|
|
9
10
|
from colour import Color
|
10
|
-
import logging
|
11
11
|
|
12
12
|
logger = logging.getLogger(__name__)
|
13
13
|
|
@@ -89,31 +89,37 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
89
89
|
"""
|
90
90
|
result = {
|
91
91
|
"type": "any",
|
92
|
-
"attributes":
|
92
|
+
"attributes": [],
|
93
93
|
"pseudo_classes": [],
|
94
|
-
"filters": [],
|
94
|
+
"filters": [], # Keep this for potential future use
|
95
95
|
}
|
96
96
|
|
97
|
-
original_selector_for_error = selector
|
97
|
+
original_selector_for_error = selector # Keep for error messages
|
98
98
|
if not selector or not isinstance(selector, str):
|
99
99
|
return result
|
100
100
|
|
101
101
|
selector = selector.strip()
|
102
102
|
|
103
|
-
# ---
|
103
|
+
# --- Handle wildcard selector explicitly ---
|
104
104
|
if selector == "*":
|
105
105
|
# Wildcard matches any type, already the default.
|
106
106
|
# Clear selector so the loop doesn't run and error out.
|
107
|
-
selector = ""
|
107
|
+
selector = ""
|
108
108
|
# --- END NEW ---
|
109
109
|
|
110
110
|
# 1. Extract type (optional, at the beginning)
|
111
111
|
# Only run if selector wasn't '*'
|
112
|
-
if selector:
|
112
|
+
if selector:
|
113
|
+
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
114
|
+
if type_match:
|
115
|
+
result["type"] = type_match.group(1).lower()
|
116
|
+
selector = selector[len(type_match.group(0)) :].strip()
|
117
|
+
# Only run if selector wasn't '*'
|
118
|
+
if selector:
|
113
119
|
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
114
120
|
if type_match:
|
115
121
|
result["type"] = type_match.group(1).lower()
|
116
|
-
selector = selector[len(type_match.group(0)):].strip()
|
122
|
+
selector = selector[len(type_match.group(0)) :].strip()
|
117
123
|
|
118
124
|
# Regexes for parts at the START of the remaining string
|
119
125
|
# Attribute: Starts with [, ends with ], content is non-greedy non-] chars
|
@@ -133,58 +139,74 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
133
139
|
block_content = attr_match.group(1).strip()
|
134
140
|
# Parse the content inside the block
|
135
141
|
# Pattern: name, optional op, optional value
|
136
|
-
detail_match = re.match(
|
142
|
+
detail_match = re.match(
|
143
|
+
r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
|
144
|
+
)
|
137
145
|
if not detail_match:
|
138
|
-
raise ValueError(
|
146
|
+
raise ValueError(
|
147
|
+
f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
|
148
|
+
)
|
139
149
|
|
140
150
|
name, op, value_str = detail_match.groups()
|
141
151
|
|
142
152
|
if op is None:
|
143
|
-
|
144
|
-
|
153
|
+
# Presence selector [attr]
|
154
|
+
result["attributes"].append({"name": name, "op": "exists", "value": None})
|
145
155
|
else:
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
156
|
+
# Operator exists, value must also exist (even if empty via quotes)
|
157
|
+
if value_str is None: # Catches invalid [attr=]
|
158
|
+
raise ValueError(
|
159
|
+
f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
|
160
|
+
)
|
161
|
+
# Parse value
|
162
|
+
parsed_value: Any
|
163
|
+
if name in [
|
164
|
+
"color",
|
165
|
+
"non_stroking_color",
|
166
|
+
"fill",
|
167
|
+
"stroke",
|
168
|
+
"strokeColor",
|
169
|
+
"fillColor",
|
170
|
+
]:
|
171
|
+
parsed_value = safe_parse_color(value_str)
|
172
|
+
else:
|
173
|
+
parsed_value = safe_parse_value(value_str) # Handles quotes
|
174
|
+
result["attributes"].append({"name": name, "op": op, "value": parsed_value})
|
175
|
+
|
176
|
+
selector = selector[attr_match.end() :].strip()
|
160
177
|
processed_chunk = True
|
161
178
|
continue
|
162
179
|
|
163
180
|
# Check for :not(...) block
|
164
181
|
if selector.lower().startswith(not_pseudo_prefix):
|
165
|
-
start_index = len(not_pseudo_prefix) - 1
|
182
|
+
start_index = len(not_pseudo_prefix) - 1 # Index of '('
|
166
183
|
nesting = 1
|
167
184
|
end_index = -1
|
168
185
|
for i in range(start_index + 1, len(selector)):
|
169
|
-
if selector[i] ==
|
170
|
-
|
186
|
+
if selector[i] == "(":
|
187
|
+
nesting += 1
|
188
|
+
elif selector[i] == ")":
|
171
189
|
nesting -= 1
|
172
190
|
if nesting == 0:
|
173
191
|
end_index = i
|
174
192
|
break
|
175
193
|
|
176
194
|
if end_index == -1:
|
177
|
-
raise ValueError(
|
195
|
+
raise ValueError(
|
196
|
+
f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
|
197
|
+
)
|
178
198
|
|
179
199
|
inner_selector_str = selector[start_index + 1 : end_index].strip()
|
180
200
|
if not inner_selector_str:
|
181
|
-
|
201
|
+
raise ValueError(
|
202
|
+
f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
|
203
|
+
)
|
182
204
|
|
183
205
|
# Recursively parse the inner selector
|
184
206
|
parsed_inner_selector = parse_selector(inner_selector_str)
|
185
|
-
result["pseudo_classes"].append({
|
207
|
+
result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
|
186
208
|
|
187
|
-
selector = selector[end_index + 1:].strip()
|
209
|
+
selector = selector[end_index + 1 :].strip()
|
188
210
|
processed_chunk = True
|
189
211
|
continue
|
190
212
|
|
@@ -192,25 +214,27 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
192
214
|
pseudo_match = pseudo_pattern.match(selector)
|
193
215
|
if pseudo_match:
|
194
216
|
name, args_str = pseudo_match.groups()
|
195
|
-
name = name.lower()
|
196
|
-
processed_args = args_str
|
217
|
+
name = name.lower() # Normalize pseudo-class name
|
218
|
+
processed_args = args_str # Keep as string initially, or None
|
197
219
|
|
198
220
|
if args_str is not None:
|
199
221
|
# Only parse args if they exist and based on the pseudo-class type
|
200
|
-
|
222
|
+
if name in ["color", "background"]:
|
201
223
|
processed_args = safe_parse_color(args_str)
|
202
|
-
|
224
|
+
else:
|
203
225
|
processed_args = safe_parse_value(args_str)
|
204
226
|
# else: args remain None
|
205
227
|
|
206
228
|
result["pseudo_classes"].append({"name": name, "args": processed_args})
|
207
|
-
selector = selector[pseudo_match.end():].strip()
|
229
|
+
selector = selector[pseudo_match.end() :].strip()
|
208
230
|
processed_chunk = True
|
209
231
|
continue
|
210
232
|
|
211
233
|
# If we reach here and the selector string is not empty, something is wrong
|
212
234
|
if not processed_chunk and selector:
|
213
|
-
raise ValueError(
|
235
|
+
raise ValueError(
|
236
|
+
f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
|
237
|
+
)
|
214
238
|
|
215
239
|
return result
|
216
240
|
|
@@ -263,12 +287,8 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
|
263
287
|
PSEUDO_CLASS_FUNCTIONS = {
|
264
288
|
"bold": lambda el: hasattr(el, "bold") and el.bold,
|
265
289
|
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
266
|
-
"first-child": lambda el: hasattr(el, "parent")
|
267
|
-
and el.parent
|
268
|
-
and el.parent.children[0] == el,
|
269
|
-
"last-child": lambda el: hasattr(el, "parent")
|
270
|
-
and el.parent
|
271
|
-
and el.parent.children[-1] == el,
|
290
|
+
"first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
|
291
|
+
"last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
|
272
292
|
"empty": lambda el: not el.text,
|
273
293
|
"not-empty": lambda el: el.text,
|
274
294
|
"not-bold": lambda el: hasattr(el, "bold") and not el.bold,
|
@@ -308,34 +328,44 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
308
328
|
func = lambda el: (
|
309
329
|
hasattr(el, "normalized_type") and el.normalized_type == selector_type
|
310
330
|
) or (
|
311
|
-
not hasattr(
|
312
|
-
|
331
|
+
not hasattr(
|
332
|
+
el, "normalized_type"
|
333
|
+
) # Only check element.type if normalized_type doesn't exist/match
|
334
|
+
and hasattr(el, "type")
|
335
|
+
and el.type == selector_type
|
313
336
|
)
|
314
337
|
filters.append({"name": filter_name, "func": func})
|
315
338
|
|
316
|
-
|
317
339
|
# Filter by attributes
|
318
|
-
for
|
319
|
-
|
320
|
-
|
321
|
-
|
340
|
+
for attr_filter in selector["attributes"]:
|
341
|
+
name = attr_filter["name"]
|
342
|
+
op = attr_filter["op"]
|
343
|
+
value = attr_filter["value"]
|
344
|
+
python_name = name.replace("-", "_") # Convert CSS-style names
|
322
345
|
|
323
346
|
# --- Define the core value retrieval logic ---
|
324
|
-
def get_element_value(
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
347
|
+
def get_element_value(
|
348
|
+
element, name=name, python_name=python_name, selector_type=selector_type
|
349
|
+
):
|
350
|
+
bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
|
351
|
+
if name in bbox_mapping:
|
352
|
+
bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
|
353
|
+
return bbox[bbox_mapping[name]]
|
354
|
+
|
355
|
+
# Special case for region attributes
|
356
|
+
if selector_type == "region":
|
357
|
+
if name == "type":
|
358
|
+
if hasattr(element, "normalized_type") and element.normalized_type:
|
359
|
+
return element.normalized_type
|
360
|
+
else:
|
361
|
+
return getattr(element, "region_type", "").lower().replace(" ", "_")
|
362
|
+
elif name == "model":
|
363
|
+
return getattr(element, "model", None)
|
364
|
+
else:
|
365
|
+
return getattr(element, python_name, None)
|
366
|
+
else:
|
367
|
+
# General case for non-region elements
|
368
|
+
return getattr(element, python_name, None)
|
339
369
|
|
340
370
|
# --- Define the comparison function or direct check ---
|
341
371
|
filter_lambda: Callable[[Any], bool]
|
@@ -345,14 +375,11 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
345
375
|
# Special handling for attribute presence check [attr]
|
346
376
|
filter_name = f"attribute [{name} exists]"
|
347
377
|
# Lambda checks that the retrieved value is not None
|
348
|
-
filter_lambda = (
|
349
|
-
lambda el, get_val=get_element_value:
|
350
|
-
get_val(el) is not None
|
351
|
-
)
|
378
|
+
filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
|
352
379
|
else:
|
353
380
|
# Handle operators with values (e.g., =, !=, *=, etc.)
|
354
381
|
compare_func: Callable[[Any, Any], bool]
|
355
|
-
op_desc = f"{op} {value!r}"
|
382
|
+
op_desc = f"{op} {value!r}" # Default description
|
356
383
|
|
357
384
|
# Determine compare_func based on op (reuse existing logic)
|
358
385
|
if op == "=":
|
@@ -363,40 +390,76 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
363
390
|
op_desc = f"~= {value!r} (approx)"
|
364
391
|
compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
|
365
392
|
elif op == "^=":
|
366
|
-
compare_func =
|
393
|
+
compare_func = (
|
394
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
395
|
+
and isinstance(sel_val, str)
|
396
|
+
and el_val.startswith(sel_val)
|
397
|
+
)
|
367
398
|
elif op == "$=":
|
368
|
-
compare_func =
|
399
|
+
compare_func = (
|
400
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
401
|
+
and isinstance(sel_val, str)
|
402
|
+
and el_val.endswith(sel_val)
|
403
|
+
)
|
369
404
|
elif op == "*=":
|
370
405
|
if name == "fontname":
|
371
|
-
|
372
|
-
|
406
|
+
op_desc = f"*= {value!r} (contains, case-insensitive)"
|
407
|
+
compare_func = (
|
408
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
409
|
+
and isinstance(sel_val, str)
|
410
|
+
and sel_val.lower() in el_val.lower()
|
411
|
+
)
|
373
412
|
else:
|
374
|
-
|
375
|
-
|
413
|
+
op_desc = f"*= {value!r} (contains)"
|
414
|
+
compare_func = (
|
415
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
416
|
+
and isinstance(sel_val, str)
|
417
|
+
and sel_val in el_val
|
418
|
+
)
|
376
419
|
elif op == ">=":
|
377
|
-
compare_func =
|
420
|
+
compare_func = (
|
421
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
422
|
+
and isinstance(sel_val, (int, float))
|
423
|
+
and el_val >= sel_val
|
424
|
+
)
|
378
425
|
elif op == "<=":
|
379
|
-
compare_func =
|
426
|
+
compare_func = (
|
427
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
428
|
+
and isinstance(sel_val, (int, float))
|
429
|
+
and el_val <= sel_val
|
430
|
+
)
|
380
431
|
elif op == ">":
|
381
|
-
compare_func =
|
432
|
+
compare_func = (
|
433
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
434
|
+
and isinstance(sel_val, (int, float))
|
435
|
+
and el_val > sel_val
|
436
|
+
)
|
382
437
|
elif op == "<":
|
383
|
-
compare_func =
|
438
|
+
compare_func = (
|
439
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
440
|
+
and isinstance(sel_val, (int, float))
|
441
|
+
and el_val < sel_val
|
442
|
+
)
|
384
443
|
else:
|
385
444
|
# Should not happen with current parsing logic
|
386
|
-
logger.warning(
|
387
|
-
|
445
|
+
logger.warning(
|
446
|
+
f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
|
447
|
+
)
|
448
|
+
continue # Skip this attribute filter
|
388
449
|
|
389
450
|
# --- Create the final filter function for operators with values ---
|
390
451
|
filter_name = f"attribute [{name}{op_desc}]"
|
391
452
|
# Capture loop variables correctly in the lambda
|
392
453
|
filter_lambda = (
|
393
|
-
lambda el, get_val=get_element_value, compare=compare_func, expected_val=value:
|
394
|
-
|
454
|
+
lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
|
455
|
+
element_value := get_val(el)
|
456
|
+
)
|
457
|
+
is not None
|
458
|
+
and compare(element_value, expected_val)
|
395
459
|
)
|
396
460
|
|
397
461
|
filters.append({"name": filter_name, "func": filter_lambda})
|
398
462
|
|
399
|
-
|
400
463
|
# Filter by pseudo-classes
|
401
464
|
for pseudo in selector["pseudo_classes"]:
|
402
465
|
name = pseudo["name"]
|
@@ -407,62 +470,75 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
407
470
|
|
408
471
|
# Relational pseudo-classes are handled separately by the caller
|
409
472
|
if name in ("above", "below", "near", "left-of", "right-of"):
|
410
|
-
|
473
|
+
continue
|
411
474
|
|
412
|
-
# --- Handle :not() ---
|
475
|
+
# --- Handle :not() ---
|
413
476
|
elif name == "not":
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
477
|
+
if not isinstance(args, dict): # args should be the parsed inner selector
|
478
|
+
logger.error(f"Invalid arguments for :not pseudo-class: {args}")
|
479
|
+
raise TypeError(
|
480
|
+
"Internal error: :not pseudo-class requires a parsed selector dictionary as args."
|
481
|
+
)
|
482
|
+
|
483
|
+
# Recursively get the filter function for the inner selector
|
484
|
+
# Pass kwargs down in case regex/case flags affect the inner selector
|
485
|
+
inner_filter_func = selector_to_filter_func(args, **kwargs)
|
486
|
+
|
487
|
+
# The filter lambda applies the inner function and inverts the result
|
488
|
+
filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
|
489
|
+
|
490
|
+
# Try to create a descriptive name (can be long)
|
491
|
+
# Maybe simplify this later if needed
|
492
|
+
inner_filter_list = _build_filter_list(args, **kwargs)
|
493
|
+
inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
|
494
|
+
filter_name = f"pseudo-class :not({inner_filter_names})"
|
495
|
+
|
496
|
+
# --- Handle text-based pseudo-classes ---
|
432
497
|
elif name == "contains" and args is not None:
|
433
498
|
use_regex = kwargs.get("regex", False)
|
434
|
-
ignore_case = not kwargs.get("case", True)
|
435
|
-
filter_name =
|
499
|
+
ignore_case = not kwargs.get("case", True) # Default case sensitive
|
500
|
+
filter_name = (
|
501
|
+
f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
|
502
|
+
)
|
436
503
|
|
437
504
|
def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
505
|
+
if not hasattr(element, "text") or not element.text:
|
506
|
+
return False # Element must have non-empty text
|
507
|
+
|
508
|
+
element_text = element.text
|
509
|
+
search_term = str(args) # Ensure args is string
|
510
|
+
|
511
|
+
if use_regex:
|
512
|
+
try:
|
513
|
+
pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
|
514
|
+
return bool(pattern.search(element_text))
|
515
|
+
except re.error as e:
|
516
|
+
logger.warning(
|
517
|
+
f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
|
518
|
+
)
|
519
|
+
# Fallback to literal search on regex error
|
520
|
+
if ignore_case:
|
521
|
+
return search_term.lower() in element_text.lower()
|
522
|
+
else:
|
523
|
+
return search_term in element_text
|
524
|
+
else: # Literal search
|
525
|
+
if ignore_case:
|
526
|
+
return search_term.lower() in element_text.lower()
|
527
|
+
else:
|
528
|
+
return search_term in element_text
|
529
|
+
|
460
530
|
filter_lambda = contains_check
|
461
531
|
|
462
532
|
elif name == "starts-with" and args is not None:
|
463
|
-
filter_lambda =
|
533
|
+
filter_lambda = (
|
534
|
+
lambda el, arg=args: hasattr(el, "text")
|
535
|
+
and el.text
|
536
|
+
and el.text.startswith(str(arg))
|
537
|
+
)
|
464
538
|
elif name == "ends-with" and args is not None:
|
465
|
-
|
539
|
+
filter_lambda = (
|
540
|
+
lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
|
541
|
+
)
|
466
542
|
|
467
543
|
# Boolean attribute pseudo-classes
|
468
544
|
elif name == "bold":
|
@@ -477,11 +553,10 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
477
553
|
# Check predefined lambda functions (e.g., :first-child, :empty)
|
478
554
|
elif name in PSEUDO_CLASS_FUNCTIONS:
|
479
555
|
filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
|
480
|
-
filter_name = f"pseudo-class :{name}"
|
556
|
+
filter_name = f"pseudo-class :{name}" # Set name for predefined ones
|
481
557
|
else:
|
482
558
|
raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
|
483
559
|
|
484
|
-
|
485
560
|
if filter_lambda:
|
486
561
|
# Use the potentially updated filter_name
|
487
562
|
filters.append({"name": filter_name, "func": filter_lambda})
|
@@ -500,15 +575,17 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
|
|
500
575
|
A single function that takes an element and returns True only if
|
501
576
|
it passes ALL filters in the list.
|
502
577
|
"""
|
578
|
+
|
503
579
|
def combined_filter(element):
|
504
580
|
for f in filters:
|
505
581
|
try:
|
506
|
-
if not f[
|
582
|
+
if not f["func"](element):
|
507
583
|
return False
|
508
584
|
except Exception as e:
|
509
|
-
|
510
|
-
|
585
|
+
logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
|
586
|
+
return False # Treat errors as filter failures
|
511
587
|
return True
|
588
|
+
|
512
589
|
return combined_filter
|
513
590
|
|
514
591
|
|
@@ -529,8 +606,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
529
606
|
filter_list = _build_filter_list(selector, **kwargs)
|
530
607
|
|
531
608
|
if logger.isEnabledFor(logging.DEBUG):
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
return _assemble_filter_func(filter_list)
|
609
|
+
filter_names = [f["name"] for f in filter_list]
|
610
|
+
logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
|
536
611
|
|
612
|
+
return _assemble_filter_func(filter_list)
|
natural_pdf/utils/debug.py
CHANGED
@@ -3,13 +3,13 @@ OCR debug utilities for natural-pdf.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import base64
|
6
|
+
import importlib.resources
|
7
|
+
import importlib.util
|
6
8
|
import io
|
7
9
|
import json
|
8
10
|
import os
|
9
|
-
import importlib.util
|
10
|
-
import importlib.resources
|
11
11
|
import webbrowser
|
12
|
-
from typing import Dict, List,
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
13
13
|
|
14
14
|
from PIL import Image
|
15
15
|
|
natural_pdf/utils/identifiers.py
CHANGED
natural_pdf/utils/locks.py
CHANGED
natural_pdf/utils/packaging.py
CHANGED
@@ -2,23 +2,25 @@
|
|
2
2
|
Utilities for packaging data for external processes, like correction tasks.
|
3
3
|
"""
|
4
4
|
|
5
|
-
import os
|
6
5
|
import base64
|
7
6
|
import io
|
8
7
|
import json
|
9
|
-
import zipfile
|
10
|
-
import tempfile
|
11
8
|
import logging
|
9
|
+
import os
|
12
10
|
import shutil
|
13
|
-
|
11
|
+
import tempfile
|
12
|
+
import zipfile
|
13
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Union
|
14
|
+
|
14
15
|
from tqdm import tqdm
|
16
|
+
|
15
17
|
from natural_pdf.elements.text import TextElement
|
16
18
|
|
17
19
|
# Import the specific PDF/Page types if possible, otherwise use Any
|
18
20
|
if TYPE_CHECKING:
|
19
|
-
from natural_pdf.core.pdf import PDF
|
20
|
-
from natural_pdf.core.page import Page
|
21
21
|
from natural_pdf.collections.pdf_collection import PDFCollection
|
22
|
+
from natural_pdf.core.page import Page
|
23
|
+
from natural_pdf.core.pdf import PDF
|
22
24
|
else:
|
23
25
|
PDF = Any
|
24
26
|
Page = Any
|