natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -4,9 +4,12 @@ CSS-like selector parser for natural-pdf.
|
|
4
4
|
|
5
5
|
import ast
|
6
6
|
import re
|
7
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
7
|
+
from typing import Any, Dict, List, Optional, Tuple, Union, Callable
|
8
8
|
|
9
9
|
from colour import Color
|
10
|
+
import logging
|
11
|
+
|
12
|
+
logger = logging.getLogger(__name__)
|
10
13
|
|
11
14
|
|
12
15
|
def safe_parse_value(value_str: str) -> Any:
|
@@ -72,10 +75,11 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
72
75
|
"""
|
73
76
|
Parse a CSS-like selector string into a structured selector object.
|
74
77
|
|
75
|
-
|
76
|
-
- 'text
|
77
|
-
-
|
78
|
-
- '
|
78
|
+
Handles:
|
79
|
+
- Element types (e.g., 'text', 'rect')
|
80
|
+
- Attribute presence (e.g., '[data-id]')
|
81
|
+
- Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]')
|
82
|
+
- Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
|
79
83
|
|
80
84
|
Args:
|
81
85
|
selector: CSS-like selector string
|
@@ -83,51 +87,130 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
83
87
|
Returns:
|
84
88
|
Dict representing the parsed selector
|
85
89
|
"""
|
86
|
-
# Basic structure for result
|
87
90
|
result = {
|
88
|
-
"type": "any",
|
89
|
-
"filters": [],
|
91
|
+
"type": "any",
|
90
92
|
"attributes": {},
|
91
93
|
"pseudo_classes": [],
|
94
|
+
"filters": [], # Keep this for potential future use
|
92
95
|
}
|
93
96
|
|
94
|
-
#
|
97
|
+
original_selector_for_error = selector # Keep for error messages
|
95
98
|
if not selector or not isinstance(selector, str):
|
96
99
|
return result
|
97
100
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
#
|
119
|
-
pseudo_pattern = r"
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
101
|
+
selector = selector.strip()
|
102
|
+
|
103
|
+
# --- NEW: Handle wildcard selector explicitly ---
|
104
|
+
if selector == "*":
|
105
|
+
# Wildcard matches any type, already the default.
|
106
|
+
# Clear selector so the loop doesn't run and error out.
|
107
|
+
selector = ""
|
108
|
+
# --- END NEW ---
|
109
|
+
|
110
|
+
# 1. Extract type (optional, at the beginning)
|
111
|
+
# Only run if selector wasn't '*'
|
112
|
+
if selector:
|
113
|
+
type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
|
114
|
+
if type_match:
|
115
|
+
result["type"] = type_match.group(1).lower()
|
116
|
+
selector = selector[len(type_match.group(0)):].strip()
|
117
|
+
|
118
|
+
# Regexes for parts at the START of the remaining string
|
119
|
+
# Attribute: Starts with [, ends with ], content is non-greedy non-] chars
|
120
|
+
attr_pattern = re.compile(r"^\[\s*([^\s\]]+.*?)\s*\]")
|
121
|
+
# Pseudo: Starts with :, name is letters/hyphen/underscore, optionally followed by (...)
|
122
|
+
pseudo_pattern = re.compile(r"^:([a-zA-Z_\-]+)(?:\((.*?)\))?")
|
123
|
+
# :not() specifically requires careful parenthesis matching later
|
124
|
+
not_pseudo_prefix = ":not("
|
125
|
+
|
126
|
+
# 2. Iteratively parse attributes and pseudo-classes
|
127
|
+
while selector:
|
128
|
+
processed_chunk = False
|
129
|
+
|
130
|
+
# Check for attribute block `[...]`
|
131
|
+
attr_match = attr_pattern.match(selector)
|
132
|
+
if attr_match:
|
133
|
+
block_content = attr_match.group(1).strip()
|
134
|
+
# Parse the content inside the block
|
135
|
+
# Pattern: name, optional op, optional value
|
136
|
+
detail_match = re.match(r"^([a-zA-Z_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content)
|
137
|
+
if not detail_match:
|
138
|
+
raise ValueError(f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'")
|
139
|
+
|
140
|
+
name, op, value_str = detail_match.groups()
|
141
|
+
|
142
|
+
if op is None:
|
143
|
+
# Presence selector [attr]
|
144
|
+
result["attributes"][name] = {"op": "exists", "value": None}
|
127
145
|
else:
|
128
|
-
|
129
|
-
|
130
|
-
|
146
|
+
# Operator exists, value must also exist (even if empty via quotes)
|
147
|
+
if value_str is None: # Catches invalid [attr=]
|
148
|
+
raise ValueError(
|
149
|
+
f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
|
150
|
+
)
|
151
|
+
# Parse value
|
152
|
+
parsed_value: Any
|
153
|
+
if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
|
154
|
+
parsed_value = safe_parse_color(value_str)
|
155
|
+
else:
|
156
|
+
parsed_value = safe_parse_value(value_str) # Handles quotes
|
157
|
+
result["attributes"][name] = {"op": op, "value": parsed_value}
|
158
|
+
|
159
|
+
selector = selector[attr_match.end():].strip()
|
160
|
+
processed_chunk = True
|
161
|
+
continue
|
162
|
+
|
163
|
+
# Check for :not(...) block
|
164
|
+
if selector.lower().startswith(not_pseudo_prefix):
|
165
|
+
start_index = len(not_pseudo_prefix) - 1 # Index of '('
|
166
|
+
nesting = 1
|
167
|
+
end_index = -1
|
168
|
+
for i in range(start_index + 1, len(selector)):
|
169
|
+
if selector[i] == '(': nesting += 1
|
170
|
+
elif selector[i] == ')':
|
171
|
+
nesting -= 1
|
172
|
+
if nesting == 0:
|
173
|
+
end_index = i
|
174
|
+
break
|
175
|
+
|
176
|
+
if end_index == -1:
|
177
|
+
raise ValueError(f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'")
|
178
|
+
|
179
|
+
inner_selector_str = selector[start_index + 1 : end_index].strip()
|
180
|
+
if not inner_selector_str:
|
181
|
+
raise ValueError(f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'")
|
182
|
+
|
183
|
+
# Recursively parse the inner selector
|
184
|
+
parsed_inner_selector = parse_selector(inner_selector_str)
|
185
|
+
result["pseudo_classes"].append({'name': 'not', 'args': parsed_inner_selector})
|
186
|
+
|
187
|
+
selector = selector[end_index + 1:].strip()
|
188
|
+
processed_chunk = True
|
189
|
+
continue
|
190
|
+
|
191
|
+
# Check for other pseudo-class blocks `:name` or `:name(...)`
|
192
|
+
pseudo_match = pseudo_pattern.match(selector)
|
193
|
+
if pseudo_match:
|
194
|
+
name, args_str = pseudo_match.groups()
|
195
|
+
name = name.lower() # Normalize pseudo-class name
|
196
|
+
processed_args = args_str # Keep as string initially, or None
|
197
|
+
|
198
|
+
if args_str is not None:
|
199
|
+
# Only parse args if they exist and based on the pseudo-class type
|
200
|
+
if name in ["color", "background"]:
|
201
|
+
processed_args = safe_parse_color(args_str)
|
202
|
+
else:
|
203
|
+
processed_args = safe_parse_value(args_str)
|
204
|
+
# else: args remain None
|
205
|
+
|
206
|
+
result["pseudo_classes"].append({"name": name, "args": processed_args})
|
207
|
+
selector = selector[pseudo_match.end():].strip()
|
208
|
+
processed_chunk = True
|
209
|
+
continue
|
210
|
+
|
211
|
+
# If we reach here and the selector string is not empty, something is wrong
|
212
|
+
if not processed_chunk and selector:
|
213
|
+
raise ValueError(f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'")
|
131
214
|
|
132
215
|
return result
|
133
216
|
|
@@ -182,19 +265,20 @@ PSEUDO_CLASS_FUNCTIONS = {
|
|
182
265
|
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
183
266
|
"first-child": lambda el: hasattr(el, "parent")
|
184
267
|
and el.parent
|
185
|
-
and el.parent.children[0] == el,
|
268
|
+
and el.parent.children[0] == el,
|
186
269
|
"last-child": lambda el: hasattr(el, "parent")
|
187
270
|
and el.parent
|
188
|
-
and el.parent.children[-1] == el,
|
189
|
-
|
271
|
+
and el.parent.children[-1] == el,
|
272
|
+
"empty": lambda el: not el.text,
|
273
|
+
"not-empty": lambda el: el.text,
|
190
274
|
"not-bold": lambda el: hasattr(el, "bold") and not el.bold,
|
191
275
|
"not-italic": lambda el: hasattr(el, "italic") and not el.italic,
|
192
276
|
}
|
193
277
|
|
194
278
|
|
195
|
-
def
|
279
|
+
def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
|
196
280
|
"""
|
197
|
-
Convert a parsed selector to a filter
|
281
|
+
Convert a parsed selector to a list of named filter functions.
|
198
282
|
|
199
283
|
Args:
|
200
284
|
selector: Parsed selector dictionary
|
@@ -203,209 +287,250 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
|
203
287
|
- case: Whether to do case-sensitive text search
|
204
288
|
|
205
289
|
Returns:
|
206
|
-
|
290
|
+
List of dictionaries, each with 'name' (str) and 'func' (callable).
|
291
|
+
The callable takes an element and returns True if it matches the specific filter.
|
207
292
|
"""
|
293
|
+
filters: List[Dict[str, Any]] = []
|
294
|
+
selector_type = selector["type"]
|
295
|
+
|
296
|
+
# Filter by element type
|
297
|
+
if selector_type != "any":
|
298
|
+
filter_name = f"type is '{selector_type}'"
|
299
|
+
if selector_type == "text":
|
300
|
+
filter_name = "type is 'text', 'char', or 'word'"
|
301
|
+
func = lambda el: hasattr(el, "type") and el.type in ["text", "char", "word"]
|
302
|
+
elif selector_type == "region":
|
303
|
+
filter_name = "type is 'region' (has region_type)"
|
304
|
+
# Note: Specific region type attribute (e.g., [type=table]) is checked below
|
305
|
+
func = lambda el: hasattr(el, "region_type")
|
306
|
+
else:
|
307
|
+
# Check against normalized_type first, then element.type
|
308
|
+
func = lambda el: (
|
309
|
+
hasattr(el, "normalized_type") and el.normalized_type == selector_type
|
310
|
+
) or (
|
311
|
+
not hasattr(el, "normalized_type") # Only check element.type if normalized_type doesn't exist
|
312
|
+
and hasattr(el, "type") and el.type == selector_type
|
313
|
+
)
|
314
|
+
filters.append({"name": filter_name, "func": func})
|
315
|
+
|
316
|
+
|
317
|
+
# Filter by attributes
|
318
|
+
for name, attr_info in selector["attributes"].items():
|
319
|
+
op = attr_info["op"]
|
320
|
+
value = attr_info["value"]
|
321
|
+
python_name = name.replace("-", "_") # Convert CSS-style names
|
322
|
+
|
323
|
+
# --- Define the core value retrieval logic ---
|
324
|
+
def get_element_value(element, name=name, python_name=python_name, selector_type=selector_type):
|
325
|
+
# Special case for region attributes
|
326
|
+
if selector_type == "region":
|
327
|
+
if name == "type":
|
328
|
+
if hasattr(element, "normalized_type") and element.normalized_type:
|
329
|
+
return element.normalized_type
|
330
|
+
else:
|
331
|
+
return getattr(element, "region_type", "").lower().replace(" ", "_")
|
332
|
+
elif name == "model":
|
333
|
+
return getattr(element, "model", None)
|
334
|
+
else:
|
335
|
+
return getattr(element, python_name, None)
|
336
|
+
else:
|
337
|
+
# General case for non-region elements
|
338
|
+
return getattr(element, python_name, None)
|
339
|
+
|
340
|
+
# --- Define the comparison function or direct check ---
|
341
|
+
filter_lambda: Callable[[Any], bool]
|
342
|
+
filter_name: str
|
343
|
+
|
344
|
+
if op == "exists":
|
345
|
+
# Special handling for attribute presence check [attr]
|
346
|
+
filter_name = f"attribute [{name} exists]"
|
347
|
+
# Lambda checks that the retrieved value is not None
|
348
|
+
filter_lambda = (
|
349
|
+
lambda el, get_val=get_element_value:
|
350
|
+
get_val(el) is not None
|
351
|
+
)
|
352
|
+
else:
|
353
|
+
# Handle operators with values (e.g., =, !=, *=, etc.)
|
354
|
+
compare_func: Callable[[Any, Any], bool]
|
355
|
+
op_desc = f"{op} {value!r}" # Default description
|
208
356
|
|
209
|
-
|
210
|
-
# Check element type
|
211
|
-
if selector["type"] != "any":
|
212
|
-
# Special handling for 'text' type to match both 'text', 'char', and 'word'
|
213
|
-
if selector["type"] == "text":
|
214
|
-
if element.type not in ["text", "char", "word"]:
|
215
|
-
return False
|
216
|
-
# Special handling for 'region' type to check for detected layout regions
|
217
|
-
elif selector["type"] == "region":
|
218
|
-
# Check if this is a Region with region_type property
|
219
|
-
if not hasattr(element, "region_type"):
|
220
|
-
return False
|
221
|
-
|
222
|
-
# If 'type' attribute specified, it will be checked in the attributes section
|
223
|
-
# Check for Docling-specific types (section-header, etc.)
|
224
|
-
elif (
|
225
|
-
hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
|
226
|
-
):
|
227
|
-
# This is a direct match with a Docling region type
|
228
|
-
pass
|
229
|
-
# Otherwise, require exact match with the element's type attribute
|
230
|
-
elif not hasattr(element, "type") or element.type != selector["type"]:
|
231
|
-
return False
|
232
|
-
|
233
|
-
# Check attributes
|
234
|
-
for name, attr_info in selector["attributes"].items():
|
235
|
-
op = attr_info["op"]
|
236
|
-
value = attr_info["value"]
|
237
|
-
|
238
|
-
# Special case for fontname attribute - allow matching part of the name
|
239
|
-
if name == "fontname" and op == "*=":
|
240
|
-
element_value = getattr(element, name, None)
|
241
|
-
if element_value is None or value.lower() not in element_value.lower():
|
242
|
-
return False
|
243
|
-
continue
|
244
|
-
|
245
|
-
# Convert hyphenated attribute names to underscore for Python properties
|
246
|
-
python_name = name.replace("-", "_")
|
247
|
-
|
248
|
-
# Special case for region attributes
|
249
|
-
if selector["type"] == "region":
|
250
|
-
if name == "type":
|
251
|
-
# Use normalized_type for comparison if available
|
252
|
-
if hasattr(element, "normalized_type") and element.normalized_type:
|
253
|
-
element_value = element.normalized_type
|
254
|
-
else:
|
255
|
-
# Convert spaces to hyphens for consistency with the normalized format
|
256
|
-
element_value = (
|
257
|
-
getattr(element, "region_type", "").lower().replace(" ", "_")
|
258
|
-
)
|
259
|
-
elif name == "model":
|
260
|
-
# Special handling for model attribute in regions
|
261
|
-
element_value = getattr(element, "model", None)
|
262
|
-
else:
|
263
|
-
# Get the attribute value from the element normally
|
264
|
-
element_value = getattr(element, python_name, None)
|
265
|
-
else:
|
266
|
-
# Get the attribute value from the element normally for non-region elements
|
267
|
-
element_value = getattr(element, python_name, None)
|
268
|
-
|
269
|
-
if element_value is None:
|
270
|
-
return False
|
271
|
-
|
272
|
-
# Apply operator
|
357
|
+
# Determine compare_func based on op (reuse existing logic)
|
273
358
|
if op == "=":
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
359
|
+
compare_func = lambda el_val, sel_val: el_val == sel_val
|
360
|
+
elif op == "!=":
|
361
|
+
compare_func = lambda el_val, sel_val: el_val != sel_val
|
362
|
+
elif op == "~":
|
363
|
+
op_desc = f"~= {value!r} (approx)"
|
364
|
+
compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
|
365
|
+
elif op == "^=":
|
366
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.startswith(sel_val)
|
367
|
+
elif op == "$=":
|
368
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.endswith(sel_val)
|
369
|
+
elif op == "*=":
|
370
|
+
if name == "fontname":
|
371
|
+
op_desc = f"*= {value!r} (contains, case-insensitive)"
|
372
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val.lower() in el_val.lower()
|
373
|
+
else:
|
374
|
+
op_desc = f"*= {value!r} (contains)"
|
375
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val in el_val
|
280
376
|
elif op == ">=":
|
281
|
-
|
282
|
-
if not (
|
283
|
-
isinstance(element_value, (int, float))
|
284
|
-
and isinstance(value, (int, float))
|
285
|
-
and element_value >= value
|
286
|
-
):
|
287
|
-
return False
|
377
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val >= sel_val
|
288
378
|
elif op == "<=":
|
289
|
-
|
290
|
-
if not (
|
291
|
-
isinstance(element_value, (int, float))
|
292
|
-
and isinstance(value, (int, float))
|
293
|
-
and element_value <= value
|
294
|
-
):
|
295
|
-
return False
|
379
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val <= sel_val
|
296
380
|
elif op == ">":
|
297
|
-
|
298
|
-
if not (
|
299
|
-
isinstance(element_value, (int, float))
|
300
|
-
and isinstance(value, (int, float))
|
301
|
-
and element_value > value
|
302
|
-
):
|
303
|
-
return False
|
381
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val > sel_val
|
304
382
|
elif op == "<":
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
383
|
+
compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val < sel_val
|
384
|
+
else:
|
385
|
+
# Should not happen with current parsing logic
|
386
|
+
logger.warning(f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'")
|
387
|
+
continue # Skip this attribute filter
|
388
|
+
|
389
|
+
# --- Create the final filter function for operators with values ---
|
390
|
+
filter_name = f"attribute [{name}{op_desc}]"
|
391
|
+
# Capture loop variables correctly in the lambda
|
392
|
+
filter_lambda = (
|
393
|
+
lambda el, get_val=get_element_value, compare=compare_func, expected_val=value:
|
394
|
+
(element_value := get_val(el)) is not None and compare(element_value, expected_val)
|
395
|
+
)
|
396
|
+
|
397
|
+
filters.append({"name": filter_name, "func": filter_lambda})
|
398
|
+
|
399
|
+
|
400
|
+
# Filter by pseudo-classes
|
401
|
+
for pseudo in selector["pseudo_classes"]:
|
402
|
+
name = pseudo["name"]
|
403
|
+
args = pseudo["args"]
|
404
|
+
filter_lambda = None
|
405
|
+
# Start with a base name, modify for specifics like :not
|
406
|
+
filter_name = f"pseudo-class :{name}"
|
407
|
+
|
408
|
+
# Relational pseudo-classes are handled separately by the caller
|
409
|
+
if name in ("above", "below", "near", "left-of", "right-of"):
|
410
|
+
continue
|
411
|
+
|
412
|
+
# --- Handle :not() ---
|
413
|
+
elif name == "not":
|
414
|
+
if not isinstance(args, dict): # args should be the parsed inner selector
|
415
|
+
logger.error(f"Invalid arguments for :not pseudo-class: {args}")
|
416
|
+
raise TypeError("Internal error: :not pseudo-class requires a parsed selector dictionary as args.")
|
417
|
+
|
418
|
+
# Recursively get the filter function for the inner selector
|
419
|
+
# Pass kwargs down in case regex/case flags affect the inner selector
|
420
|
+
inner_filter_func = selector_to_filter_func(args, **kwargs)
|
421
|
+
|
422
|
+
# The filter lambda applies the inner function and inverts the result
|
423
|
+
filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
|
424
|
+
|
425
|
+
# Try to create a descriptive name (can be long)
|
426
|
+
# Maybe simplify this later if needed
|
427
|
+
inner_filter_list = _build_filter_list(args, **kwargs)
|
428
|
+
inner_filter_names = ", ".join([f['name'] for f in inner_filter_list])
|
429
|
+
filter_name = f"pseudo-class :not({inner_filter_names})"
|
430
|
+
|
431
|
+
# --- Handle text-based pseudo-classes ---
|
432
|
+
elif name == "contains" and args is not None:
|
433
|
+
use_regex = kwargs.get("regex", False)
|
434
|
+
ignore_case = not kwargs.get("case", True) # Default case sensitive
|
435
|
+
filter_name = f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
|
436
|
+
|
437
|
+
def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
|
438
|
+
if not hasattr(element, "text") or not element.text:
|
439
|
+
return False # Element must have non-empty text
|
440
|
+
|
441
|
+
element_text = element.text
|
442
|
+
search_term = str(args) # Ensure args is string
|
443
|
+
|
444
|
+
if use_regex:
|
445
|
+
try:
|
446
|
+
pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
|
447
|
+
return bool(pattern.search(element_text))
|
448
|
+
except re.error as e:
|
449
|
+
logger.warning(f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search.")
|
450
|
+
# Fallback to literal search on regex error
|
451
|
+
if ignore_case:
|
452
|
+
return search_term.lower() in element_text.lower()
|
453
|
+
else:
|
454
|
+
return search_term in element_text
|
455
|
+
else: # Literal search
|
456
|
+
if ignore_case:
|
457
|
+
return search_term.lower() in element_text.lower()
|
458
|
+
else:
|
459
|
+
return search_term in element_text
|
460
|
+
filter_lambda = contains_check
|
461
|
+
|
462
|
+
elif name == "starts-with" and args is not None:
|
463
|
+
filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.startswith(str(arg))
|
464
|
+
elif name == "ends-with" and args is not None:
|
465
|
+
filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
|
466
|
+
|
467
|
+
# Boolean attribute pseudo-classes
|
468
|
+
elif name == "bold":
|
469
|
+
filter_lambda = lambda el: hasattr(el, "bold") and el.bold
|
470
|
+
elif name == "italic":
|
471
|
+
filter_lambda = lambda el: hasattr(el, "italic") and el.italic
|
472
|
+
elif name == "horizontal":
|
473
|
+
filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
|
474
|
+
elif name == "vertical":
|
475
|
+
filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
|
476
|
+
|
477
|
+
# Check predefined lambda functions (e.g., :first-child, :empty)
|
478
|
+
elif name in PSEUDO_CLASS_FUNCTIONS:
|
479
|
+
filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
|
480
|
+
filter_name = f"pseudo-class :{name}" # Set name for predefined ones
|
481
|
+
else:
|
482
|
+
raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
|
312
483
|
|
313
|
-
# Check pseudo-classes
|
314
|
-
for pseudo in selector["pseudo_classes"]:
|
315
|
-
name = pseudo["name"]
|
316
|
-
args = pseudo["args"]
|
317
|
-
|
318
|
-
# Handle various pseudo-classes
|
319
|
-
if name == "contains" and hasattr(element, "text"):
|
320
|
-
use_regex = kwargs.get("regex", False)
|
321
|
-
ignore_case = not kwargs.get("case", True)
|
322
|
-
|
323
|
-
if use_regex:
|
324
|
-
import re
|
325
|
-
|
326
|
-
if not element.text:
|
327
|
-
return False
|
328
|
-
try:
|
329
|
-
pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
|
330
|
-
if not pattern.search(element.text):
|
331
|
-
return False
|
332
|
-
except re.error:
|
333
|
-
# If regex is invalid, fall back to literal text search
|
334
|
-
element_text = element.text
|
335
|
-
search_text = args
|
336
|
-
|
337
|
-
if ignore_case:
|
338
|
-
element_text = element_text.lower()
|
339
|
-
search_text = search_text.lower()
|
340
|
-
|
341
|
-
if search_text not in element_text:
|
342
|
-
return False
|
343
|
-
else:
|
344
|
-
# String comparison with case sensitivity option
|
345
|
-
if not element.text:
|
346
|
-
return False
|
347
484
|
|
348
|
-
|
349
|
-
|
485
|
+
if filter_lambda:
|
486
|
+
# Use the potentially updated filter_name
|
487
|
+
filters.append({"name": filter_name, "func": filter_lambda})
|
350
488
|
|
351
|
-
|
352
|
-
element_text = element_text.lower()
|
353
|
-
search_text = search_text.lower()
|
489
|
+
return filters
|
354
490
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
if not (
|
371
|
-
return False
|
372
|
-
elif name == "vertical":
|
373
|
-
if not (hasattr(element, "is_vertical") and element.is_vertical):
|
491
|
+
|
492
|
+
def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool]:
|
493
|
+
"""
|
494
|
+
Combine a list of named filter functions into a single callable.
|
495
|
+
|
496
|
+
Args:
|
497
|
+
filters: List of dictionaries, each with 'name' and 'func'.
|
498
|
+
|
499
|
+
Returns:
|
500
|
+
A single function that takes an element and returns True only if
|
501
|
+
it passes ALL filters in the list.
|
502
|
+
"""
|
503
|
+
def combined_filter(element):
|
504
|
+
for f in filters:
|
505
|
+
try:
|
506
|
+
if not f['func'](element):
|
374
507
|
return False
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
return False # Invalid regex cannot match
|
398
|
-
else:
|
399
|
-
if search_term not in text_to_check:
|
400
|
-
return False
|
401
|
-
# Skip complex pseudo-classes like :near, :above here, handled later
|
402
|
-
elif name in ("above", "below", "near", "left-of", "right-of"):
|
403
|
-
pass # Handled separately after initial filtering
|
404
|
-
else:
|
405
|
-
# Optionally log unknown pseudo-classes
|
406
|
-
# logger.warning(f"Unknown pseudo-class: {name}")
|
407
|
-
pass
|
508
|
+
except Exception as e:
|
509
|
+
logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
|
510
|
+
return False # Treat errors as filter failures
|
511
|
+
return True
|
512
|
+
return combined_filter
|
513
|
+
|
514
|
+
|
515
|
+
def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
|
516
|
+
"""
|
517
|
+
Convert a parsed selector to a single filter function.
|
518
|
+
|
519
|
+
Internally, this builds a list of individual filters and then combines them.
|
520
|
+
To inspect the individual filters, call `_build_filter_list` directly.
|
521
|
+
|
522
|
+
Args:
|
523
|
+
selector: Parsed selector dictionary
|
524
|
+
**kwargs: Additional filter parameters (e.g., regex, case).
|
525
|
+
|
526
|
+
Returns:
|
527
|
+
Function that takes an element and returns True if it matches the selector.
|
528
|
+
"""
|
529
|
+
filter_list = _build_filter_list(selector, **kwargs)
|
408
530
|
|
409
|
-
|
531
|
+
if logger.isEnabledFor(logging.DEBUG):
|
532
|
+
filter_names = [f['name'] for f in filter_list]
|
533
|
+
logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
|
534
|
+
|
535
|
+
return _assemble_filter_func(filter_list)
|
410
536
|
|
411
|
-
return filter_func
|