natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -4,9 +4,12 @@ CSS-like selector parser for natural-pdf.
4
4
 
5
5
  import ast
6
6
  import re
7
- from typing import Any, Dict, List, Optional, Tuple, Union
7
+ from typing import Any, Dict, List, Optional, Tuple, Union, Callable
8
8
 
9
9
  from colour import Color
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
10
13
 
11
14
 
12
15
  def safe_parse_value(value_str: str) -> Any:
@@ -72,10 +75,11 @@ def parse_selector(selector: str) -> Dict[str, Any]:
72
75
  """
73
76
  Parse a CSS-like selector string into a structured selector object.
74
77
 
75
- Examples:
76
- - 'text:contains("Revenue")'
77
- - 'table:below("Financial Data")'
78
- - 'rect[fill=(1,0,0)]'
78
+ Handles:
79
+ - Element types (e.g., 'text', 'rect')
80
+ - Attribute presence (e.g., '[data-id]')
81
+ - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]')
82
+ - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
79
83
 
80
84
  Args:
81
85
  selector: CSS-like selector string
@@ -83,51 +87,130 @@ def parse_selector(selector: str) -> Dict[str, Any]:
83
87
  Returns:
84
88
  Dict representing the parsed selector
85
89
  """
86
- # Basic structure for result
87
90
  result = {
88
- "type": "any", # Default to any element type
89
- "filters": [],
91
+ "type": "any",
90
92
  "attributes": {},
91
93
  "pseudo_classes": [],
94
+ "filters": [], # Keep this for potential future use
92
95
  }
93
96
 
94
- # Check if empty or None
97
+ original_selector_for_error = selector # Keep for error messages
95
98
  if not selector or not isinstance(selector, str):
96
99
  return result
97
100
 
98
- # Parse element type
99
- type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
100
- if type_match:
101
- result["type"] = type_match.group(1).lower()
102
- selector = selector[len(type_match.group(0)) :]
103
-
104
- # Parse attributes (e.g., [color=(1,0,0)])
105
- attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
106
- attr_matches = re.findall(attr_pattern, selector)
107
- for name, op, value in attr_matches:
108
- # Handle special parsing for color attributes
109
- if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
110
- value = safe_parse_color(value)
111
- else:
112
- # Safe parsing for other attributes
113
- value = safe_parse_value(value)
114
-
115
- # Store attribute with operator
116
- result["attributes"][name] = {"op": op, "value": value}
117
-
118
- # Parse pseudo-classes (e.g., :contains("text"))
119
- pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
120
- pseudo_matches = re.findall(pseudo_pattern, selector)
121
- for name, args in pseudo_matches:
122
- # Process arguments
123
- processed_args = args
124
- if args:
125
- if name in ["color", "background"]:
126
- processed_args = safe_parse_color(args)
101
+ selector = selector.strip()
102
+
103
+ # --- NEW: Handle wildcard selector explicitly ---
104
+ if selector == "*":
105
+ # Wildcard matches any type, already the default.
106
+ # Clear selector so the loop doesn't run and error out.
107
+ selector = ""
108
+ # --- END NEW ---
109
+
110
+ # 1. Extract type (optional, at the beginning)
111
+ # Only run if selector wasn't '*'
112
+ if selector:
113
+ type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
114
+ if type_match:
115
+ result["type"] = type_match.group(1).lower()
116
+ selector = selector[len(type_match.group(0)):].strip()
117
+
118
+ # Regexes for parts at the START of the remaining string
119
+ # Attribute: Starts with [, ends with ], content is non-greedy non-] chars
120
+ attr_pattern = re.compile(r"^\[\s*([^\s\]]+.*?)\s*\]")
121
+ # Pseudo: Starts with :, name is letters/hyphen/underscore, optionally followed by (...)
122
+ pseudo_pattern = re.compile(r"^:([a-zA-Z_\-]+)(?:\((.*?)\))?")
123
+ # :not() specifically requires careful parenthesis matching later
124
+ not_pseudo_prefix = ":not("
125
+
126
+ # 2. Iteratively parse attributes and pseudo-classes
127
+ while selector:
128
+ processed_chunk = False
129
+
130
+ # Check for attribute block `[...]`
131
+ attr_match = attr_pattern.match(selector)
132
+ if attr_match:
133
+ block_content = attr_match.group(1).strip()
134
+ # Parse the content inside the block
135
+ # Pattern: name, optional op, optional value
136
+ detail_match = re.match(r"^([a-zA-Z_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content)
137
+ if not detail_match:
138
+ raise ValueError(f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'")
139
+
140
+ name, op, value_str = detail_match.groups()
141
+
142
+ if op is None:
143
+ # Presence selector [attr]
144
+ result["attributes"][name] = {"op": "exists", "value": None}
127
145
  else:
128
- processed_args = safe_parse_value(args)
129
-
130
- result["pseudo_classes"].append({"name": name, "args": processed_args})
146
+ # Operator exists, value must also exist (even if empty via quotes)
147
+ if value_str is None: # Catches invalid [attr=]
148
+ raise ValueError(
149
+ f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
150
+ )
151
+ # Parse value
152
+ parsed_value: Any
153
+ if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
154
+ parsed_value = safe_parse_color(value_str)
155
+ else:
156
+ parsed_value = safe_parse_value(value_str) # Handles quotes
157
+ result["attributes"][name] = {"op": op, "value": parsed_value}
158
+
159
+ selector = selector[attr_match.end():].strip()
160
+ processed_chunk = True
161
+ continue
162
+
163
+ # Check for :not(...) block
164
+ if selector.lower().startswith(not_pseudo_prefix):
165
+ start_index = len(not_pseudo_prefix) - 1 # Index of '('
166
+ nesting = 1
167
+ end_index = -1
168
+ for i in range(start_index + 1, len(selector)):
169
+ if selector[i] == '(': nesting += 1
170
+ elif selector[i] == ')':
171
+ nesting -= 1
172
+ if nesting == 0:
173
+ end_index = i
174
+ break
175
+
176
+ if end_index == -1:
177
+ raise ValueError(f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'")
178
+
179
+ inner_selector_str = selector[start_index + 1 : end_index].strip()
180
+ if not inner_selector_str:
181
+ raise ValueError(f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'")
182
+
183
+ # Recursively parse the inner selector
184
+ parsed_inner_selector = parse_selector(inner_selector_str)
185
+ result["pseudo_classes"].append({'name': 'not', 'args': parsed_inner_selector})
186
+
187
+ selector = selector[end_index + 1:].strip()
188
+ processed_chunk = True
189
+ continue
190
+
191
+ # Check for other pseudo-class blocks `:name` or `:name(...)`
192
+ pseudo_match = pseudo_pattern.match(selector)
193
+ if pseudo_match:
194
+ name, args_str = pseudo_match.groups()
195
+ name = name.lower() # Normalize pseudo-class name
196
+ processed_args = args_str # Keep as string initially, or None
197
+
198
+ if args_str is not None:
199
+ # Only parse args if they exist and based on the pseudo-class type
200
+ if name in ["color", "background"]:
201
+ processed_args = safe_parse_color(args_str)
202
+ else:
203
+ processed_args = safe_parse_value(args_str)
204
+ # else: args remain None
205
+
206
+ result["pseudo_classes"].append({"name": name, "args": processed_args})
207
+ selector = selector[pseudo_match.end():].strip()
208
+ processed_chunk = True
209
+ continue
210
+
211
+ # If we reach here and the selector string is not empty, something is wrong
212
+ if not processed_chunk and selector:
213
+ raise ValueError(f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'")
131
214
 
132
215
  return result
133
216
 
@@ -182,19 +265,20 @@ PSEUDO_CLASS_FUNCTIONS = {
182
265
  "italic": lambda el: hasattr(el, "italic") and el.italic,
183
266
  "first-child": lambda el: hasattr(el, "parent")
184
267
  and el.parent
185
- and el.parent.children[0] == el, # Example placeholder
268
+ and el.parent.children[0] == el,
186
269
  "last-child": lambda el: hasattr(el, "parent")
187
270
  and el.parent
188
- and el.parent.children[-1] == el, # Example placeholder
189
- # Add the new pseudo-classes for negation
271
+ and el.parent.children[-1] == el,
272
+ "empty": lambda el: not el.text,
273
+ "not-empty": lambda el: el.text,
190
274
  "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
191
275
  "not-italic": lambda el: hasattr(el, "italic") and not el.italic,
192
276
  }
193
277
 
194
278
 
195
- def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
279
+ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
196
280
  """
197
- Convert a parsed selector to a filter function.
281
+ Convert a parsed selector to a list of named filter functions.
198
282
 
199
283
  Args:
200
284
  selector: Parsed selector dictionary
@@ -203,209 +287,250 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
203
287
  - case: Whether to do case-sensitive text search
204
288
 
205
289
  Returns:
206
- Function that takes an element and returns True if it matches
290
+ List of dictionaries, each with 'name' (str) and 'func' (callable).
291
+ The callable takes an element and returns True if it matches the specific filter.
207
292
  """
293
+ filters: List[Dict[str, Any]] = []
294
+ selector_type = selector["type"]
295
+
296
+ # Filter by element type
297
+ if selector_type != "any":
298
+ filter_name = f"type is '{selector_type}'"
299
+ if selector_type == "text":
300
+ filter_name = "type is 'text', 'char', or 'word'"
301
+ func = lambda el: hasattr(el, "type") and el.type in ["text", "char", "word"]
302
+ elif selector_type == "region":
303
+ filter_name = "type is 'region' (has region_type)"
304
+ # Note: Specific region type attribute (e.g., [type=table]) is checked below
305
+ func = lambda el: hasattr(el, "region_type")
306
+ else:
307
+ # Check against normalized_type first, then element.type
308
+ func = lambda el: (
309
+ hasattr(el, "normalized_type") and el.normalized_type == selector_type
310
+ ) or (
311
+ not hasattr(el, "normalized_type") # Only check element.type if normalized_type doesn't exist/match
312
+ and hasattr(el, "type") and el.type == selector_type
313
+ )
314
+ filters.append({"name": filter_name, "func": func})
315
+
316
+
317
+ # Filter by attributes
318
+ for name, attr_info in selector["attributes"].items():
319
+ op = attr_info["op"]
320
+ value = attr_info["value"]
321
+ python_name = name.replace("-", "_") # Convert CSS-style names
322
+
323
+ # --- Define the core value retrieval logic ---
324
+ def get_element_value(element, name=name, python_name=python_name, selector_type=selector_type):
325
+ # Special case for region attributes
326
+ if selector_type == "region":
327
+ if name == "type":
328
+ if hasattr(element, "normalized_type") and element.normalized_type:
329
+ return element.normalized_type
330
+ else:
331
+ return getattr(element, "region_type", "").lower().replace(" ", "_")
332
+ elif name == "model":
333
+ return getattr(element, "model", None)
334
+ else:
335
+ return getattr(element, python_name, None)
336
+ else:
337
+ # General case for non-region elements
338
+ return getattr(element, python_name, None)
339
+
340
+ # --- Define the comparison function or direct check ---
341
+ filter_lambda: Callable[[Any], bool]
342
+ filter_name: str
343
+
344
+ if op == "exists":
345
+ # Special handling for attribute presence check [attr]
346
+ filter_name = f"attribute [{name} exists]"
347
+ # Lambda checks that the retrieved value is not None
348
+ filter_lambda = (
349
+ lambda el, get_val=get_element_value:
350
+ get_val(el) is not None
351
+ )
352
+ else:
353
+ # Handle operators with values (e.g., =, !=, *=, etc.)
354
+ compare_func: Callable[[Any, Any], bool]
355
+ op_desc = f"{op} {value!r}" # Default description
208
356
 
209
- def filter_func(element):
210
- # Check element type
211
- if selector["type"] != "any":
212
- # Special handling for 'text' type to match both 'text', 'char', and 'word'
213
- if selector["type"] == "text":
214
- if element.type not in ["text", "char", "word"]:
215
- return False
216
- # Special handling for 'region' type to check for detected layout regions
217
- elif selector["type"] == "region":
218
- # Check if this is a Region with region_type property
219
- if not hasattr(element, "region_type"):
220
- return False
221
-
222
- # If 'type' attribute specified, it will be checked in the attributes section
223
- # Check for Docling-specific types (section-header, etc.)
224
- elif (
225
- hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
226
- ):
227
- # This is a direct match with a Docling region type
228
- pass
229
- # Otherwise, require exact match with the element's type attribute
230
- elif not hasattr(element, "type") or element.type != selector["type"]:
231
- return False
232
-
233
- # Check attributes
234
- for name, attr_info in selector["attributes"].items():
235
- op = attr_info["op"]
236
- value = attr_info["value"]
237
-
238
- # Special case for fontname attribute - allow matching part of the name
239
- if name == "fontname" and op == "*=":
240
- element_value = getattr(element, name, None)
241
- if element_value is None or value.lower() not in element_value.lower():
242
- return False
243
- continue
244
-
245
- # Convert hyphenated attribute names to underscore for Python properties
246
- python_name = name.replace("-", "_")
247
-
248
- # Special case for region attributes
249
- if selector["type"] == "region":
250
- if name == "type":
251
- # Use normalized_type for comparison if available
252
- if hasattr(element, "normalized_type") and element.normalized_type:
253
- element_value = element.normalized_type
254
- else:
255
- # Convert spaces to hyphens for consistency with the normalized format
256
- element_value = (
257
- getattr(element, "region_type", "").lower().replace(" ", "_")
258
- )
259
- elif name == "model":
260
- # Special handling for model attribute in regions
261
- element_value = getattr(element, "model", None)
262
- else:
263
- # Get the attribute value from the element normally
264
- element_value = getattr(element, python_name, None)
265
- else:
266
- # Get the attribute value from the element normally for non-region elements
267
- element_value = getattr(element, python_name, None)
268
-
269
- if element_value is None:
270
- return False
271
-
272
- # Apply operator
357
+ # Determine compare_func based on op (reuse existing logic)
273
358
  if op == "=":
274
- if element_value != value:
275
- return False
276
- elif op == "~=":
277
- # Approximate match (e.g., for colors)
278
- if not _is_approximate_match(element_value, value):
279
- return False
359
+ compare_func = lambda el_val, sel_val: el_val == sel_val
360
+ elif op == "!=":
361
+ compare_func = lambda el_val, sel_val: el_val != sel_val
362
+ elif op == "~":
363
+ op_desc = f"~= {value!r} (approx)"
364
+ compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
365
+ elif op == "^=":
366
+ compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.startswith(sel_val)
367
+ elif op == "$=":
368
+ compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.endswith(sel_val)
369
+ elif op == "*=":
370
+ if name == "fontname":
371
+ op_desc = f"*= {value!r} (contains, case-insensitive)"
372
+ compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val.lower() in el_val.lower()
373
+ else:
374
+ op_desc = f"*= {value!r} (contains)"
375
+ compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val in el_val
280
376
  elif op == ">=":
281
- # Greater than or equal (element value must be >= specified value)
282
- if not (
283
- isinstance(element_value, (int, float))
284
- and isinstance(value, (int, float))
285
- and element_value >= value
286
- ):
287
- return False
377
+ compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val >= sel_val
288
378
  elif op == "<=":
289
- # Less than or equal (element value must be <= specified value)
290
- if not (
291
- isinstance(element_value, (int, float))
292
- and isinstance(value, (int, float))
293
- and element_value <= value
294
- ):
295
- return False
379
+ compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val <= sel_val
296
380
  elif op == ">":
297
- # Greater than (element value must be > specified value)
298
- if not (
299
- isinstance(element_value, (int, float))
300
- and isinstance(value, (int, float))
301
- and element_value > value
302
- ):
303
- return False
381
+ compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val > sel_val
304
382
  elif op == "<":
305
- # Less than (element value must be < specified value)
306
- if not (
307
- isinstance(element_value, (int, float))
308
- and isinstance(value, (int, float))
309
- and element_value < value
310
- ):
311
- return False
383
+ compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val < sel_val
384
+ else:
385
+ # Should not happen with current parsing logic
386
+ logger.warning(f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'")
387
+ continue # Skip this attribute filter
388
+
389
+ # --- Create the final filter function for operators with values ---
390
+ filter_name = f"attribute [{name}{op_desc}]"
391
+ # Capture loop variables correctly in the lambda
392
+ filter_lambda = (
393
+ lambda el, get_val=get_element_value, compare=compare_func, expected_val=value:
394
+ (element_value := get_val(el)) is not None and compare(element_value, expected_val)
395
+ )
396
+
397
+ filters.append({"name": filter_name, "func": filter_lambda})
398
+
399
+
400
+ # Filter by pseudo-classes
401
+ for pseudo in selector["pseudo_classes"]:
402
+ name = pseudo["name"]
403
+ args = pseudo["args"]
404
+ filter_lambda = None
405
+ # Start with a base name, modify for specifics like :not
406
+ filter_name = f"pseudo-class :{name}"
407
+
408
+ # Relational pseudo-classes are handled separately by the caller
409
+ if name in ("above", "below", "near", "left-of", "right-of"):
410
+ continue
411
+
412
+ # --- Handle :not() ---
413
+ elif name == "not":
414
+ if not isinstance(args, dict): # args should be the parsed inner selector
415
+ logger.error(f"Invalid arguments for :not pseudo-class: {args}")
416
+ raise TypeError("Internal error: :not pseudo-class requires a parsed selector dictionary as args.")
417
+
418
+ # Recursively get the filter function for the inner selector
419
+ # Pass kwargs down in case regex/case flags affect the inner selector
420
+ inner_filter_func = selector_to_filter_func(args, **kwargs)
421
+
422
+ # The filter lambda applies the inner function and inverts the result
423
+ filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
424
+
425
+ # Try to create a descriptive name (can be long)
426
+ # Maybe simplify this later if needed
427
+ inner_filter_list = _build_filter_list(args, **kwargs)
428
+ inner_filter_names = ", ".join([f['name'] for f in inner_filter_list])
429
+ filter_name = f"pseudo-class :not({inner_filter_names})"
430
+
431
+ # --- Handle text-based pseudo-classes ---
432
+ elif name == "contains" and args is not None:
433
+ use_regex = kwargs.get("regex", False)
434
+ ignore_case = not kwargs.get("case", True) # Default case sensitive
435
+ filter_name = f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
436
+
437
+ def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
438
+ if not hasattr(element, "text") or not element.text:
439
+ return False # Element must have non-empty text
440
+
441
+ element_text = element.text
442
+ search_term = str(args) # Ensure args is string
443
+
444
+ if use_regex:
445
+ try:
446
+ pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
447
+ return bool(pattern.search(element_text))
448
+ except re.error as e:
449
+ logger.warning(f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search.")
450
+ # Fallback to literal search on regex error
451
+ if ignore_case:
452
+ return search_term.lower() in element_text.lower()
453
+ else:
454
+ return search_term in element_text
455
+ else: # Literal search
456
+ if ignore_case:
457
+ return search_term.lower() in element_text.lower()
458
+ else:
459
+ return search_term in element_text
460
+ filter_lambda = contains_check
461
+
462
+ elif name == "starts-with" and args is not None:
463
+ filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.startswith(str(arg))
464
+ elif name == "ends-with" and args is not None:
465
+ filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
466
+
467
+ # Boolean attribute pseudo-classes
468
+ elif name == "bold":
469
+ filter_lambda = lambda el: hasattr(el, "bold") and el.bold
470
+ elif name == "italic":
471
+ filter_lambda = lambda el: hasattr(el, "italic") and el.italic
472
+ elif name == "horizontal":
473
+ filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
474
+ elif name == "vertical":
475
+ filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
476
+
477
+ # Check predefined lambda functions (e.g., :first-child, :empty)
478
+ elif name in PSEUDO_CLASS_FUNCTIONS:
479
+ filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
480
+ filter_name = f"pseudo-class :{name}" # Set name for predefined ones
481
+ else:
482
+ raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
312
483
 
313
- # Check pseudo-classes
314
- for pseudo in selector["pseudo_classes"]:
315
- name = pseudo["name"]
316
- args = pseudo["args"]
317
-
318
- # Handle various pseudo-classes
319
- if name == "contains" and hasattr(element, "text"):
320
- use_regex = kwargs.get("regex", False)
321
- ignore_case = not kwargs.get("case", True)
322
-
323
- if use_regex:
324
- import re
325
-
326
- if not element.text:
327
- return False
328
- try:
329
- pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
330
- if not pattern.search(element.text):
331
- return False
332
- except re.error:
333
- # If regex is invalid, fall back to literal text search
334
- element_text = element.text
335
- search_text = args
336
-
337
- if ignore_case:
338
- element_text = element_text.lower()
339
- search_text = search_text.lower()
340
-
341
- if search_text not in element_text:
342
- return False
343
- else:
344
- # String comparison with case sensitivity option
345
- if not element.text:
346
- return False
347
484
 
348
- element_text = element.text
349
- search_text = args
485
+ if filter_lambda:
486
+ # Use the potentially updated filter_name
487
+ filters.append({"name": filter_name, "func": filter_lambda})
350
488
 
351
- if ignore_case:
352
- element_text = element_text.lower()
353
- search_text = search_text.lower()
489
+ return filters
354
490
 
355
- if search_text not in element_text:
356
- return False
357
- elif name == "starts-with" and hasattr(element, "text"):
358
- if not element.text or not element.text.startswith(args):
359
- return False
360
- elif name == "ends-with" and hasattr(element, "text"):
361
- if not element.text or not element.text.endswith(args):
362
- return False
363
- elif name == "bold":
364
- if not (hasattr(element, "bold") and element.bold):
365
- return False
366
- elif name == "italic":
367
- if not (hasattr(element, "italic") and element.italic):
368
- return False
369
- elif name == "horizontal":
370
- if not (hasattr(element, "is_horizontal") and element.is_horizontal):
371
- return False
372
- elif name == "vertical":
373
- if not (hasattr(element, "is_vertical") and element.is_vertical):
491
+
492
+ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool]:
493
+ """
494
+ Combine a list of named filter functions into a single callable.
495
+
496
+ Args:
497
+ filters: List of dictionaries, each with 'name' and 'func'.
498
+
499
+ Returns:
500
+ A single function that takes an element and returns True only if
501
+ it passes ALL filters in the list.
502
+ """
503
+ def combined_filter(element):
504
+ for f in filters:
505
+ try:
506
+ if not f['func'](element):
374
507
  return False
375
- else:
376
- # Check pseudo-classes (basic ones like :bold, :italic)
377
- if name in PSEUDO_CLASS_FUNCTIONS:
378
- if not PSEUDO_CLASS_FUNCTIONS[name](element):
379
- return False
380
- elif name == "contains":
381
- if not hasattr(element, "text") or not element.text:
382
- return False
383
- text_to_check = element.text
384
- search_term = args
385
- if not kwargs.get("case", True): # Check case flag from kwargs
386
- text_to_check = text_to_check.lower()
387
- search_term = search_term.lower()
388
-
389
- if kwargs.get("regex", False): # Check regex flag from kwargs
390
- try:
391
- if not re.search(search_term, text_to_check):
392
- return False
393
- except re.error as e:
394
- logger.warning(
395
- f"Invalid regex in :contains selector '{search_term}': {e}"
396
- )
397
- return False # Invalid regex cannot match
398
- else:
399
- if search_term not in text_to_check:
400
- return False
401
- # Skip complex pseudo-classes like :near, :above here, handled later
402
- elif name in ("above", "below", "near", "left-of", "right-of"):
403
- pass # Handled separately after initial filtering
404
- else:
405
- # Optionally log unknown pseudo-classes
406
- # logger.warning(f"Unknown pseudo-class: {name}")
407
- pass
508
+ except Exception as e:
509
+ logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
510
+ return False # Treat errors as filter failures
511
+ return True
512
+ return combined_filter
513
+
514
+
515
+ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
516
+ """
517
+ Convert a parsed selector to a single filter function.
518
+
519
+ Internally, this builds a list of individual filters and then combines them.
520
+ To inspect the individual filters, call `_build_filter_list` directly.
521
+
522
+ Args:
523
+ selector: Parsed selector dictionary
524
+ **kwargs: Additional filter parameters (e.g., regex, case).
525
+
526
+ Returns:
527
+ Function that takes an element and returns True if it matches the selector.
528
+ """
529
+ filter_list = _build_filter_list(selector, **kwargs)
408
530
 
409
- return True # Element passes all attribute and simple pseudo-class filters
531
+ if logger.isEnabledFor(logging.DEBUG):
532
+ filter_names = [f['name'] for f in filter_list]
533
+ logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
534
+
535
+ return _assemble_filter_func(filter_list)
410
536
 
411
- return filter_func