natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -3,11 +3,14 @@ CSS-like selector parser for natural-pdf.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import ast
|
6
|
+
import logging
|
6
7
|
import re
|
7
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
8
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
8
9
|
|
9
10
|
from colour import Color
|
10
11
|
|
12
|
+
logger = logging.getLogger(__name__)
|
13
|
+
|
11
14
|
|
12
15
|
def safe_parse_value(value_str: str) -> Any:
|
13
16
|
"""
|
@@ -72,10 +75,11 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
72
75
|
"""
|
73
76
|
Parse a CSS-like selector string into a structured selector object.
|
74
77
|
|
75
|
-
|
76
|
-
- 'text
|
77
|
-
-
|
78
|
-
- '
|
78
|
+
Handles:
|
79
|
+
- Element types (e.g., 'text', 'rect')
|
80
|
+
- Attribute presence (e.g., '[data-id]')
|
81
|
+
- Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]')
|
82
|
+
- Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
|
79
83
|
|
80
84
|
Args:
|
81
85
|
selector: CSS-like selector string
|
@@ -83,51 +87,154 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
83
87
|
Returns:
|
84
88
|
Dict representing the parsed selector
|
85
89
|
"""
|
86
|
-
# Basic structure for result
|
87
90
|
result = {
|
88
|
-
"type": "any",
|
89
|
-
"
|
90
|
-
"attributes": {},
|
91
|
+
"type": "any",
|
92
|
+
"attributes": [],
|
91
93
|
"pseudo_classes": [],
|
94
|
+
"filters": [], # Keep this for potential future use
|
92
95
|
}
|
93
96
|
|
94
|
-
#
|
97
|
+
original_selector_for_error = selector # Keep for error messages
|
95
98
|
if not selector or not isinstance(selector, str):
|
96
99
|
return result
|
97
100
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
for
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
101
|
+
selector = selector.strip()
|
102
|
+
|
103
|
+
# --- Handle wildcard selector explicitly ---
|
104
|
+
if selector == "*":
|
105
|
+
# Wildcard matches any type, already the default.
|
106
|
+
# Clear selector so the loop doesn't run and error out.
|
107
|
+
selector = ""
|
108
|
+
# --- End wildcard handling ---
|
109
|
+
|
110
|
+
# 1. Extract type (optional, at the beginning)
|
111
|
+
# Only run if selector wasn't '*'
|
112
|
+
if selector:
|
113
|
+
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
114
|
+
if type_match:
|
115
|
+
result["type"] = type_match.group(1).lower()
|
116
|
+
selector = selector[len(type_match.group(0)) :].strip()
|
117
|
+
# Only run if selector wasn't '*'
|
118
|
+
if selector:
|
119
|
+
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
120
|
+
if type_match:
|
121
|
+
result["type"] = type_match.group(1).lower()
|
122
|
+
selector = selector[len(type_match.group(0)) :].strip()
|
123
|
+
|
124
|
+
# Regexes for parts at the START of the remaining string
|
125
|
+
# Attribute: Starts with [, ends with ], content is non-greedy non-] chars
|
126
|
+
attr_pattern = re.compile(r"^\[\s*([^\s\]]+.*?)\s*\]")
|
127
|
+
# Pseudo: Starts with :, name is letters/hyphen/underscore, optionally followed by (...)
|
128
|
+
pseudo_pattern = re.compile(r"^:([a-zA-Z_\-]+)(?:\((.*?)\))?")
|
129
|
+
# :not() specifically requires careful parenthesis matching later
|
130
|
+
not_pseudo_prefix = ":not("
|
131
|
+
|
132
|
+
# 2. Iteratively parse attributes and pseudo-classes
|
133
|
+
while selector:
|
134
|
+
processed_chunk = False
|
135
|
+
|
136
|
+
# Check for attribute block `[...]`
|
137
|
+
attr_match = attr_pattern.match(selector)
|
138
|
+
if attr_match:
|
139
|
+
block_content = attr_match.group(1).strip()
|
140
|
+
# Parse the content inside the block
|
141
|
+
# Pattern: name, optional op, optional value
|
142
|
+
detail_match = re.match(
|
143
|
+
r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
|
144
|
+
)
|
145
|
+
if not detail_match:
|
146
|
+
raise ValueError(
|
147
|
+
f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
|
148
|
+
)
|
149
|
+
|
150
|
+
name, op, value_str = detail_match.groups()
|
151
|
+
|
152
|
+
if op is None:
|
153
|
+
# Presence selector [attr]
|
154
|
+
result["attributes"].append({"name": name, "op": "exists", "value": None})
|
127
155
|
else:
|
128
|
-
|
156
|
+
# Operator exists, value must also exist (even if empty via quotes)
|
157
|
+
if value_str is None: # Catches invalid [attr=]
|
158
|
+
raise ValueError(
|
159
|
+
f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
|
160
|
+
)
|
161
|
+
# Parse value
|
162
|
+
parsed_value: Any
|
163
|
+
if name in [
|
164
|
+
"color",
|
165
|
+
"non_stroking_color",
|
166
|
+
"fill",
|
167
|
+
"stroke",
|
168
|
+
"strokeColor",
|
169
|
+
"fillColor",
|
170
|
+
]:
|
171
|
+
parsed_value = safe_parse_color(value_str)
|
172
|
+
else:
|
173
|
+
parsed_value = safe_parse_value(value_str) # Handles quotes
|
174
|
+
result["attributes"].append({"name": name, "op": op, "value": parsed_value})
|
175
|
+
|
176
|
+
selector = selector[attr_match.end() :].strip()
|
177
|
+
processed_chunk = True
|
178
|
+
continue
|
179
|
+
|
180
|
+
# Check for :not(...) block
|
181
|
+
if selector.lower().startswith(not_pseudo_prefix):
|
182
|
+
start_index = len(not_pseudo_prefix) - 1 # Index of '('
|
183
|
+
nesting = 1
|
184
|
+
end_index = -1
|
185
|
+
for i in range(start_index + 1, len(selector)):
|
186
|
+
if selector[i] == "(":
|
187
|
+
nesting += 1
|
188
|
+
elif selector[i] == ")":
|
189
|
+
nesting -= 1
|
190
|
+
if nesting == 0:
|
191
|
+
end_index = i
|
192
|
+
break
|
193
|
+
|
194
|
+
if end_index == -1:
|
195
|
+
raise ValueError(
|
196
|
+
f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
|
197
|
+
)
|
198
|
+
|
199
|
+
inner_selector_str = selector[start_index + 1 : end_index].strip()
|
200
|
+
if not inner_selector_str:
|
201
|
+
raise ValueError(
|
202
|
+
f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
|
203
|
+
)
|
204
|
+
|
205
|
+
# Recursively parse the inner selector
|
206
|
+
parsed_inner_selector = parse_selector(inner_selector_str)
|
207
|
+
result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
|
208
|
+
|
209
|
+
selector = selector[end_index + 1 :].strip()
|
210
|
+
processed_chunk = True
|
211
|
+
continue
|
212
|
+
|
213
|
+
# Check for other pseudo-class blocks `:name` or `:name(...)`
|
214
|
+
pseudo_match = pseudo_pattern.match(selector)
|
215
|
+
if pseudo_match:
|
216
|
+
name, args_str = pseudo_match.groups()
|
217
|
+
name = name.lower() # Normalize pseudo-class name
|
218
|
+
processed_args = args_str # Keep as string initially, or None
|
219
|
+
|
220
|
+
if args_str is not None:
|
221
|
+
# Only parse args if they exist and based on the pseudo-class type
|
222
|
+
if name in ["color", "background"]:
|
223
|
+
processed_args = safe_parse_color(args_str)
|
224
|
+
else:
|
225
|
+
processed_args = safe_parse_value(args_str)
|
226
|
+
# else: args remain None
|
129
227
|
|
130
|
-
|
228
|
+
result["pseudo_classes"].append({"name": name, "args": processed_args})
|
229
|
+
selector = selector[pseudo_match.end() :].strip()
|
230
|
+
processed_chunk = True
|
231
|
+
continue
|
232
|
+
|
233
|
+
# If we reach here and the selector string is not empty, something is wrong
|
234
|
+
if not processed_chunk and selector:
|
235
|
+
raise ValueError(
|
236
|
+
f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
|
237
|
+
)
|
131
238
|
|
132
239
|
return result
|
133
240
|
|
@@ -180,21 +287,18 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
|
|
180
287
|
PSEUDO_CLASS_FUNCTIONS = {
|
181
288
|
"bold": lambda el: hasattr(el, "bold") and el.bold,
|
182
289
|
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
183
|
-
"first-child": lambda el: hasattr(el, "parent")
|
184
|
-
and el.parent
|
185
|
-
|
186
|
-
"
|
187
|
-
and el.parent
|
188
|
-
and el.parent.children[-1] == el, # Example placeholder
|
189
|
-
# Add the new pseudo-classes for negation
|
290
|
+
"first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
|
291
|
+
"last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
|
292
|
+
"empty": lambda el: not el.text,
|
293
|
+
"not-empty": lambda el: el.text,
|
190
294
|
"not-bold": lambda el: hasattr(el, "bold") and not el.bold,
|
191
295
|
"not-italic": lambda el: hasattr(el, "italic") and not el.italic,
|
192
296
|
}
|
193
297
|
|
194
298
|
|
195
|
-
def
|
299
|
+
def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
|
196
300
|
"""
|
197
|
-
Convert a parsed selector to a filter
|
301
|
+
Convert a parsed selector to a list of named filter functions.
|
198
302
|
|
199
303
|
Args:
|
200
304
|
selector: Parsed selector dictionary
|
@@ -203,209 +307,306 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
|
203
307
|
- case: Whether to do case-sensitive text search
|
204
308
|
|
205
309
|
Returns:
|
206
|
-
|
310
|
+
List of dictionaries, each with 'name' (str) and 'func' (callable).
|
311
|
+
The callable takes an element and returns True if it matches the specific filter.
|
207
312
|
"""
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
313
|
+
filters: List[Dict[str, Any]] = []
|
314
|
+
selector_type = selector["type"]
|
315
|
+
|
316
|
+
# Filter by element type
|
317
|
+
if selector_type != "any":
|
318
|
+
filter_name = f"type is '{selector_type}'"
|
319
|
+
if selector_type == "text":
|
320
|
+
filter_name = "type is 'text', 'char', or 'word'"
|
321
|
+
func = lambda el: hasattr(el, "type") and el.type in ["text", "char", "word"]
|
322
|
+
elif selector_type == "region":
|
323
|
+
filter_name = "type is 'region' (has region_type)"
|
324
|
+
# Note: Specific region type attribute (e.g., [type=table]) is checked below
|
325
|
+
func = lambda el: hasattr(el, "region_type")
|
326
|
+
else:
|
327
|
+
# Check against normalized_type first, then element.type
|
328
|
+
func = lambda el: (
|
329
|
+
hasattr(el, "normalized_type") and el.normalized_type == selector_type
|
330
|
+
) or (
|
331
|
+
not hasattr(
|
332
|
+
el, "normalized_type"
|
333
|
+
) # Fall back to element.type only when normalized_type does not exist
|
334
|
+
and hasattr(el, "type")
|
335
|
+
and el.type == selector_type
|
336
|
+
)
|
337
|
+
filters.append({"name": filter_name, "func": func})
|
338
|
+
|
339
|
+
# Filter by attributes
|
340
|
+
for attr_filter in selector["attributes"]:
|
341
|
+
name = attr_filter["name"]
|
342
|
+
op = attr_filter["op"]
|
343
|
+
value = attr_filter["value"]
|
344
|
+
python_name = name.replace("-", "_") # Convert CSS-style names
|
345
|
+
|
346
|
+
# --- Define the core value retrieval logic ---
|
347
|
+
def get_element_value(
|
348
|
+
element, name=name, python_name=python_name, selector_type=selector_type
|
349
|
+
):
|
350
|
+
bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
|
351
|
+
if name in bbox_mapping:
|
352
|
+
bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
|
353
|
+
return bbox[bbox_mapping[name]]
|
247
354
|
|
248
355
|
# Special case for region attributes
|
249
|
-
if
|
356
|
+
if selector_type == "region":
|
250
357
|
if name == "type":
|
251
|
-
# Use normalized_type for comparison if available
|
252
358
|
if hasattr(element, "normalized_type") and element.normalized_type:
|
253
|
-
|
359
|
+
return element.normalized_type
|
254
360
|
else:
|
255
|
-
|
256
|
-
element_value = (
|
257
|
-
getattr(element, "region_type", "").lower().replace(" ", "_")
|
258
|
-
)
|
361
|
+
return getattr(element, "region_type", "").lower().replace(" ", "_")
|
259
362
|
elif name == "model":
|
260
|
-
|
261
|
-
element_value = getattr(element, "model", None)
|
363
|
+
return getattr(element, "model", None)
|
262
364
|
else:
|
263
|
-
|
264
|
-
element_value = getattr(element, python_name, None)
|
365
|
+
return getattr(element, python_name, None)
|
265
366
|
else:
|
266
|
-
#
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
367
|
+
# General case for non-region elements
|
368
|
+
return getattr(element, python_name, None)
|
369
|
+
|
370
|
+
# --- Define the comparison function or direct check ---
|
371
|
+
filter_lambda: Callable[[Any], bool]
|
372
|
+
filter_name: str
|
373
|
+
|
374
|
+
if op == "exists":
|
375
|
+
# Special handling for attribute presence check [attr]
|
376
|
+
filter_name = f"attribute [{name} exists]"
|
377
|
+
# Lambda checks that the retrieved value is not None
|
378
|
+
filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
|
379
|
+
else:
|
380
|
+
# Handle operators with values (e.g., =, !=, *=, etc.)
|
381
|
+
compare_func: Callable[[Any, Any], bool]
|
382
|
+
op_desc = f"{op} {value!r}" # Default description
|
271
383
|
|
272
|
-
#
|
384
|
+
# Determine compare_func based on op (reuse existing logic)
|
273
385
|
if op == "=":
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
386
|
+
compare_func = lambda el_val, sel_val: el_val == sel_val
|
387
|
+
elif op == "!=":
|
388
|
+
compare_func = lambda el_val, sel_val: el_val != sel_val
|
389
|
+
elif op == "~":
|
390
|
+
op_desc = f"~= {value!r} (approx)"
|
391
|
+
compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
|
392
|
+
elif op == "^=":
|
393
|
+
compare_func = (
|
394
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
395
|
+
and isinstance(sel_val, str)
|
396
|
+
and el_val.startswith(sel_val)
|
397
|
+
)
|
398
|
+
elif op == "$=":
|
399
|
+
compare_func = (
|
400
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
401
|
+
and isinstance(sel_val, str)
|
402
|
+
and el_val.endswith(sel_val)
|
403
|
+
)
|
404
|
+
elif op == "*=":
|
405
|
+
if name == "fontname":
|
406
|
+
op_desc = f"*= {value!r} (contains, case-insensitive)"
|
407
|
+
compare_func = (
|
408
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
409
|
+
and isinstance(sel_val, str)
|
410
|
+
and sel_val.lower() in el_val.lower()
|
411
|
+
)
|
412
|
+
else:
|
413
|
+
op_desc = f"*= {value!r} (contains)"
|
414
|
+
compare_func = (
|
415
|
+
lambda el_val, sel_val: isinstance(el_val, str)
|
416
|
+
and isinstance(sel_val, str)
|
417
|
+
and sel_val in el_val
|
418
|
+
)
|
280
419
|
elif op == ">=":
|
281
|
-
|
282
|
-
|
283
|
-
isinstance(
|
284
|
-
and
|
285
|
-
|
286
|
-
):
|
287
|
-
return False
|
420
|
+
compare_func = (
|
421
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
422
|
+
and isinstance(sel_val, (int, float))
|
423
|
+
and el_val >= sel_val
|
424
|
+
)
|
288
425
|
elif op == "<=":
|
289
|
-
|
290
|
-
|
291
|
-
isinstance(
|
292
|
-
and
|
293
|
-
|
294
|
-
):
|
295
|
-
return False
|
426
|
+
compare_func = (
|
427
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
428
|
+
and isinstance(sel_val, (int, float))
|
429
|
+
and el_val <= sel_val
|
430
|
+
)
|
296
431
|
elif op == ">":
|
297
|
-
|
298
|
-
|
299
|
-
isinstance(
|
300
|
-
and
|
301
|
-
|
302
|
-
):
|
303
|
-
return False
|
432
|
+
compare_func = (
|
433
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
434
|
+
and isinstance(sel_val, (int, float))
|
435
|
+
and el_val > sel_val
|
436
|
+
)
|
304
437
|
elif op == "<":
|
305
|
-
|
306
|
-
|
307
|
-
isinstance(
|
308
|
-
and
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
438
|
+
compare_func = (
|
439
|
+
lambda el_val, sel_val: isinstance(el_val, (int, float))
|
440
|
+
and isinstance(sel_val, (int, float))
|
441
|
+
and el_val < sel_val
|
442
|
+
)
|
443
|
+
else:
|
444
|
+
# Should not happen with current parsing logic
|
445
|
+
logger.warning(
|
446
|
+
f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
|
447
|
+
)
|
448
|
+
continue # Skip this attribute filter
|
449
|
+
|
450
|
+
# --- Create the final filter function for operators with values ---
|
451
|
+
filter_name = f"attribute [{name}{op_desc}]"
|
452
|
+
# Capture loop variables correctly in the lambda
|
453
|
+
filter_lambda = (
|
454
|
+
lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
|
455
|
+
element_value := get_val(el)
|
456
|
+
)
|
457
|
+
is not None
|
458
|
+
and compare(element_value, expected_val)
|
459
|
+
)
|
460
|
+
|
461
|
+
filters.append({"name": filter_name, "func": filter_lambda})
|
462
|
+
|
463
|
+
# Filter by pseudo-classes
|
464
|
+
for pseudo in selector["pseudo_classes"]:
|
465
|
+
name = pseudo["name"]
|
466
|
+
args = pseudo["args"]
|
467
|
+
filter_lambda = None
|
468
|
+
# Start with a base name, modify for specifics like :not
|
469
|
+
filter_name = f"pseudo-class :{name}"
|
470
|
+
|
471
|
+
# Relational pseudo-classes are handled separately by the caller
|
472
|
+
if name in ("above", "below", "near", "left-of", "right-of"):
|
473
|
+
continue
|
474
|
+
|
475
|
+
# --- Handle :not() ---
|
476
|
+
elif name == "not":
|
477
|
+
if not isinstance(args, dict): # args should be the parsed inner selector
|
478
|
+
logger.error(f"Invalid arguments for :not pseudo-class: {args}")
|
479
|
+
raise TypeError(
|
480
|
+
"Internal error: :not pseudo-class requires a parsed selector dictionary as args."
|
481
|
+
)
|
482
|
+
|
483
|
+
# Recursively get the filter function for the inner selector
|
484
|
+
# Pass kwargs down in case regex/case flags affect the inner selector
|
485
|
+
inner_filter_func = selector_to_filter_func(args, **kwargs)
|
486
|
+
|
487
|
+
# The filter lambda applies the inner function and inverts the result
|
488
|
+
filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
|
489
|
+
|
490
|
+
# Try to create a descriptive name (can be long)
|
491
|
+
# Maybe simplify this later if needed
|
492
|
+
inner_filter_list = _build_filter_list(args, **kwargs)
|
493
|
+
inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
|
494
|
+
filter_name = f"pseudo-class :not({inner_filter_names})"
|
495
|
+
|
496
|
+
# --- Handle text-based pseudo-classes ---
|
497
|
+
elif name == "contains" and args is not None:
|
498
|
+
use_regex = kwargs.get("regex", False)
|
499
|
+
ignore_case = not kwargs.get("case", True) # Default case sensitive
|
500
|
+
filter_name = (
|
501
|
+
f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
|
502
|
+
)
|
503
|
+
|
504
|
+
def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
|
505
|
+
if not hasattr(element, "text") or not element.text:
|
506
|
+
return False # Element must have non-empty text
|
507
|
+
|
508
|
+
element_text = element.text
|
509
|
+
search_term = str(args) # Ensure args is string
|
322
510
|
|
323
511
|
if use_regex:
|
324
|
-
import re
|
325
|
-
|
326
|
-
if not element.text:
|
327
|
-
return False
|
328
512
|
try:
|
329
|
-
pattern = re.compile(
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
513
|
+
pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
|
514
|
+
return bool(pattern.search(element_text))
|
515
|
+
except re.error as e:
|
516
|
+
logger.warning(
|
517
|
+
f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
|
518
|
+
)
|
519
|
+
# Fallback to literal search on regex error
|
337
520
|
if ignore_case:
|
338
|
-
|
339
|
-
|
521
|
+
return search_term.lower() in element_text.lower()
|
522
|
+
else:
|
523
|
+
return search_term in element_text
|
524
|
+
else: # Literal search
|
525
|
+
if ignore_case:
|
526
|
+
return search_term.lower() in element_text.lower()
|
527
|
+
else:
|
528
|
+
return search_term in element_text
|
529
|
+
|
530
|
+
filter_lambda = contains_check
|
531
|
+
|
532
|
+
elif name == "starts-with" and args is not None:
|
533
|
+
filter_lambda = (
|
534
|
+
lambda el, arg=args: hasattr(el, "text")
|
535
|
+
and el.text
|
536
|
+
and el.text.startswith(str(arg))
|
537
|
+
)
|
538
|
+
elif name == "ends-with" and args is not None:
|
539
|
+
filter_lambda = (
|
540
|
+
lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
|
541
|
+
)
|
542
|
+
|
543
|
+
# Boolean attribute pseudo-classes
|
544
|
+
elif name == "bold":
|
545
|
+
filter_lambda = lambda el: hasattr(el, "bold") and el.bold
|
546
|
+
elif name == "italic":
|
547
|
+
filter_lambda = lambda el: hasattr(el, "italic") and el.italic
|
548
|
+
elif name == "horizontal":
|
549
|
+
filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
|
550
|
+
elif name == "vertical":
|
551
|
+
filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
|
552
|
+
|
553
|
+
# Check predefined lambda functions (e.g., :first-child, :empty)
|
554
|
+
elif name in PSEUDO_CLASS_FUNCTIONS:
|
555
|
+
filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
|
556
|
+
filter_name = f"pseudo-class :{name}" # Set name for predefined ones
|
557
|
+
else:
|
558
|
+
raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
|
340
559
|
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
# String comparison with case sensitivity option
|
345
|
-
if not element.text:
|
346
|
-
return False
|
560
|
+
if filter_lambda:
|
561
|
+
# Use the potentially updated filter_name
|
562
|
+
filters.append({"name": filter_name, "func": filter_lambda})
|
347
563
|
|
348
|
-
|
349
|
-
search_text = args
|
564
|
+
return filters
|
350
565
|
|
351
|
-
if ignore_case:
|
352
|
-
element_text = element_text.lower()
|
353
|
-
search_text = search_text.lower()
|
354
566
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
if not
|
371
|
-
return False
|
372
|
-
elif name == "vertical":
|
373
|
-
if not (hasattr(element, "is_vertical") and element.is_vertical):
|
567
|
+
def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool]:
|
568
|
+
"""
|
569
|
+
Combine a list of named filter functions into a single callable.
|
570
|
+
|
571
|
+
Args:
|
572
|
+
filters: List of dictionaries, each with 'name' and 'func'.
|
573
|
+
|
574
|
+
Returns:
|
575
|
+
A single function that takes an element and returns True only if
|
576
|
+
it passes ALL filters in the list.
|
577
|
+
"""
|
578
|
+
|
579
|
+
def combined_filter(element):
|
580
|
+
for f in filters:
|
581
|
+
try:
|
582
|
+
if not f["func"](element):
|
374
583
|
return False
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
else:
|
399
|
-
if search_term not in text_to_check:
|
400
|
-
return False
|
401
|
-
# Skip complex pseudo-classes like :near, :above here, handled later
|
402
|
-
elif name in ("above", "below", "near", "left-of", "right-of"):
|
403
|
-
pass # Handled separately after initial filtering
|
404
|
-
else:
|
405
|
-
# Optionally log unknown pseudo-classes
|
406
|
-
# logger.warning(f"Unknown pseudo-class: {name}")
|
407
|
-
pass
|
584
|
+
except Exception as e:
|
585
|
+
logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
|
586
|
+
return False # Treat errors as filter failures
|
587
|
+
return True
|
588
|
+
|
589
|
+
return combined_filter
|
590
|
+
|
591
|
+
|
592
|
+
def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
|
593
|
+
"""
|
594
|
+
Convert a parsed selector to a single filter function.
|
595
|
+
|
596
|
+
Internally, this builds a list of individual filters and then combines them.
|
597
|
+
To inspect the individual filters, call `_build_filter_list` directly.
|
598
|
+
|
599
|
+
Args:
|
600
|
+
selector: Parsed selector dictionary
|
601
|
+
**kwargs: Additional filter parameters (e.g., regex, case).
|
602
|
+
|
603
|
+
Returns:
|
604
|
+
Function that takes an element and returns True if it matches the selector.
|
605
|
+
"""
|
606
|
+
filter_list = _build_filter_list(selector, **kwargs)
|
408
607
|
|
409
|
-
|
608
|
+
if logger.isEnabledFor(logging.DEBUG):
|
609
|
+
filter_names = [f["name"] for f in filter_list]
|
610
|
+
logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
|
410
611
|
|
411
|
-
return
|
612
|
+
return _assemble_filter_func(filter_list)
|