natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,14 @@ CSS-like selector parser for natural-pdf.
3
3
  """
4
4
 
5
5
  import ast
6
+ import logging
6
7
  import re
7
- from typing import Any, Dict, List, Optional, Tuple, Union
8
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
9
 
9
10
  from colour import Color
10
11
 
12
+ logger = logging.getLogger(__name__)
13
+
11
14
 
12
15
  def safe_parse_value(value_str: str) -> Any:
13
16
  """
@@ -72,10 +75,11 @@ def parse_selector(selector: str) -> Dict[str, Any]:
72
75
  """
73
76
  Parse a CSS-like selector string into a structured selector object.
74
77
 
75
- Examples:
76
- - 'text:contains("Revenue")'
77
- - 'table:below("Financial Data")'
78
- - 'rect[fill=(1,0,0)]'
78
+ Handles:
79
+ - Element types (e.g., 'text', 'rect')
80
+ - Attribute presence (e.g., '[data-id]')
81
+ - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]')
82
+ - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
79
83
 
80
84
  Args:
81
85
  selector: CSS-like selector string
@@ -83,51 +87,154 @@ def parse_selector(selector: str) -> Dict[str, Any]:
83
87
  Returns:
84
88
  Dict representing the parsed selector
85
89
  """
86
- # Basic structure for result
87
90
  result = {
88
- "type": "any", # Default to any element type
89
- "filters": [],
90
- "attributes": {},
91
+ "type": "any",
92
+ "attributes": [],
91
93
  "pseudo_classes": [],
94
+ "filters": [], # Keep this for potential future use
92
95
  }
93
96
 
94
- # Check if empty or None
97
+ original_selector_for_error = selector # Keep for error messages
95
98
  if not selector or not isinstance(selector, str):
96
99
  return result
97
100
 
98
- # Parse element type
99
- type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
100
- if type_match:
101
- result["type"] = type_match.group(1).lower()
102
- selector = selector[len(type_match.group(0)) :]
103
-
104
- # Parse attributes (e.g., [color=(1,0,0)])
105
- attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
106
- attr_matches = re.findall(attr_pattern, selector)
107
- for name, op, value in attr_matches:
108
- # Handle special parsing for color attributes
109
- if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
110
- value = safe_parse_color(value)
111
- else:
112
- # Safe parsing for other attributes
113
- value = safe_parse_value(value)
114
-
115
- # Store attribute with operator
116
- result["attributes"][name] = {"op": op, "value": value}
117
-
118
- # Parse pseudo-classes (e.g., :contains("text"))
119
- pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
120
- pseudo_matches = re.findall(pseudo_pattern, selector)
121
- for name, args in pseudo_matches:
122
- # Process arguments
123
- processed_args = args
124
- if args:
125
- if name in ["color", "background"]:
126
- processed_args = safe_parse_color(args)
101
+ selector = selector.strip()
102
+
103
+ # --- Handle wildcard selector explicitly ---
104
+ if selector == "*":
105
+ # Wildcard matches any type, already the default.
106
+ # Clear selector so the loop doesn't run and error out.
107
+ selector = ""
108
+ # --- END NEW ---
109
+
110
+ # 1. Extract type (optional, at the beginning)
111
+ # Only run if selector wasn't '*'
112
+ if selector:
113
+ type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
114
+ if type_match:
115
+ result["type"] = type_match.group(1).lower()
116
+ selector = selector[len(type_match.group(0)) :].strip()
117
+ # Only run if selector wasn't '*'
118
+ if selector:
119
+ type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
120
+ if type_match:
121
+ result["type"] = type_match.group(1).lower()
122
+ selector = selector[len(type_match.group(0)) :].strip()
123
+
124
+ # Regexes for parts at the START of the remaining string
125
+ # Attribute: Starts with [, ends with ], content is non-greedy non-] chars
126
+ attr_pattern = re.compile(r"^\[\s*([^\s\]]+.*?)\s*\]")
127
+ # Pseudo: Starts with :, name is letters/hyphen/underscore, optionally followed by (...)
128
+ pseudo_pattern = re.compile(r"^:([a-zA-Z_\-]+)(?:\((.*?)\))?")
129
+ # :not() specifically requires careful parenthesis matching later
130
+ not_pseudo_prefix = ":not("
131
+
132
+ # 2. Iteratively parse attributes and pseudo-classes
133
+ while selector:
134
+ processed_chunk = False
135
+
136
+ # Check for attribute block `[...]`
137
+ attr_match = attr_pattern.match(selector)
138
+ if attr_match:
139
+ block_content = attr_match.group(1).strip()
140
+ # Parse the content inside the block
141
+ # Pattern: name, optional op, optional value
142
+ detail_match = re.match(
143
+ r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
144
+ )
145
+ if not detail_match:
146
+ raise ValueError(
147
+ f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
148
+ )
149
+
150
+ name, op, value_str = detail_match.groups()
151
+
152
+ if op is None:
153
+ # Presence selector [attr]
154
+ result["attributes"].append({"name": name, "op": "exists", "value": None})
127
155
  else:
128
- processed_args = safe_parse_value(args)
156
+ # Operator exists, value must also exist (even if empty via quotes)
157
+ if value_str is None: # Catches invalid [attr=]
158
+ raise ValueError(
159
+ f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
160
+ )
161
+ # Parse value
162
+ parsed_value: Any
163
+ if name in [
164
+ "color",
165
+ "non_stroking_color",
166
+ "fill",
167
+ "stroke",
168
+ "strokeColor",
169
+ "fillColor",
170
+ ]:
171
+ parsed_value = safe_parse_color(value_str)
172
+ else:
173
+ parsed_value = safe_parse_value(value_str) # Handles quotes
174
+ result["attributes"].append({"name": name, "op": op, "value": parsed_value})
175
+
176
+ selector = selector[attr_match.end() :].strip()
177
+ processed_chunk = True
178
+ continue
179
+
180
+ # Check for :not(...) block
181
+ if selector.lower().startswith(not_pseudo_prefix):
182
+ start_index = len(not_pseudo_prefix) - 1 # Index of '('
183
+ nesting = 1
184
+ end_index = -1
185
+ for i in range(start_index + 1, len(selector)):
186
+ if selector[i] == "(":
187
+ nesting += 1
188
+ elif selector[i] == ")":
189
+ nesting -= 1
190
+ if nesting == 0:
191
+ end_index = i
192
+ break
193
+
194
+ if end_index == -1:
195
+ raise ValueError(
196
+ f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
197
+ )
198
+
199
+ inner_selector_str = selector[start_index + 1 : end_index].strip()
200
+ if not inner_selector_str:
201
+ raise ValueError(
202
+ f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
203
+ )
204
+
205
+ # Recursively parse the inner selector
206
+ parsed_inner_selector = parse_selector(inner_selector_str)
207
+ result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
208
+
209
+ selector = selector[end_index + 1 :].strip()
210
+ processed_chunk = True
211
+ continue
212
+
213
+ # Check for other pseudo-class blocks `:name` or `:name(...)`
214
+ pseudo_match = pseudo_pattern.match(selector)
215
+ if pseudo_match:
216
+ name, args_str = pseudo_match.groups()
217
+ name = name.lower() # Normalize pseudo-class name
218
+ processed_args = args_str # Keep as string initially, or None
219
+
220
+ if args_str is not None:
221
+ # Only parse args if they exist and based on the pseudo-class type
222
+ if name in ["color", "background"]:
223
+ processed_args = safe_parse_color(args_str)
224
+ else:
225
+ processed_args = safe_parse_value(args_str)
226
+ # else: args remain None
129
227
 
130
- result["pseudo_classes"].append({"name": name, "args": processed_args})
228
+ result["pseudo_classes"].append({"name": name, "args": processed_args})
229
+ selector = selector[pseudo_match.end() :].strip()
230
+ processed_chunk = True
231
+ continue
232
+
233
+ # If we reach here and the selector string is not empty, something is wrong
234
+ if not processed_chunk and selector:
235
+ raise ValueError(
236
+ f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
237
+ )
131
238
 
132
239
  return result
133
240
 
@@ -180,21 +287,18 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
180
287
  PSEUDO_CLASS_FUNCTIONS = {
181
288
  "bold": lambda el: hasattr(el, "bold") and el.bold,
182
289
  "italic": lambda el: hasattr(el, "italic") and el.italic,
183
- "first-child": lambda el: hasattr(el, "parent")
184
- and el.parent
185
- and el.parent.children[0] == el, # Example placeholder
186
- "last-child": lambda el: hasattr(el, "parent")
187
- and el.parent
188
- and el.parent.children[-1] == el, # Example placeholder
189
- # Add the new pseudo-classes for negation
290
+ "first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
291
+ "last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
292
+ "empty": lambda el: not el.text,
293
+ "not-empty": lambda el: el.text,
190
294
  "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
191
295
  "not-italic": lambda el: hasattr(el, "italic") and not el.italic,
192
296
  }
193
297
 
194
298
 
195
- def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
299
+ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
196
300
  """
197
- Convert a parsed selector to a filter function.
301
+ Convert a parsed selector to a list of named filter functions.
198
302
 
199
303
  Args:
200
304
  selector: Parsed selector dictionary
@@ -203,209 +307,306 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
203
307
  - case: Whether to do case-sensitive text search
204
308
 
205
309
  Returns:
206
- Function that takes an element and returns True if it matches
310
+ List of dictionaries, each with 'name' (str) and 'func' (callable).
311
+ The callable takes an element and returns True if it matches the specific filter.
207
312
  """
208
-
209
- def filter_func(element):
210
- # Check element type
211
- if selector["type"] != "any":
212
- # Special handling for 'text' type to match both 'text', 'char', and 'word'
213
- if selector["type"] == "text":
214
- if element.type not in ["text", "char", "word"]:
215
- return False
216
- # Special handling for 'region' type to check for detected layout regions
217
- elif selector["type"] == "region":
218
- # Check if this is a Region with region_type property
219
- if not hasattr(element, "region_type"):
220
- return False
221
-
222
- # If 'type' attribute specified, it will be checked in the attributes section
223
- # Check for Docling-specific types (section-header, etc.)
224
- elif (
225
- hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
226
- ):
227
- # This is a direct match with a Docling region type
228
- pass
229
- # Otherwise, require exact match with the element's type attribute
230
- elif not hasattr(element, "type") or element.type != selector["type"]:
231
- return False
232
-
233
- # Check attributes
234
- for name, attr_info in selector["attributes"].items():
235
- op = attr_info["op"]
236
- value = attr_info["value"]
237
-
238
- # Special case for fontname attribute - allow matching part of the name
239
- if name == "fontname" and op == "*=":
240
- element_value = getattr(element, name, None)
241
- if element_value is None or value.lower() not in element_value.lower():
242
- return False
243
- continue
244
-
245
- # Convert hyphenated attribute names to underscore for Python properties
246
- python_name = name.replace("-", "_")
313
+ filters: List[Dict[str, Any]] = []
314
+ selector_type = selector["type"]
315
+
316
+ # Filter by element type
317
+ if selector_type != "any":
318
+ filter_name = f"type is '{selector_type}'"
319
+ if selector_type == "text":
320
+ filter_name = "type is 'text', 'char', or 'word'"
321
+ func = lambda el: hasattr(el, "type") and el.type in ["text", "char", "word"]
322
+ elif selector_type == "region":
323
+ filter_name = "type is 'region' (has region_type)"
324
+ # Note: Specific region type attribute (e.g., [type=table]) is checked below
325
+ func = lambda el: hasattr(el, "region_type")
326
+ else:
327
+ # Check against normalized_type first, then element.type
328
+ func = lambda el: (
329
+ hasattr(el, "normalized_type") and el.normalized_type == selector_type
330
+ ) or (
331
+ not hasattr(
332
+ el, "normalized_type"
333
+ ) # Only check element.type if normalized_type doesn't exist/match
334
+ and hasattr(el, "type")
335
+ and el.type == selector_type
336
+ )
337
+ filters.append({"name": filter_name, "func": func})
338
+
339
+ # Filter by attributes
340
+ for attr_filter in selector["attributes"]:
341
+ name = attr_filter["name"]
342
+ op = attr_filter["op"]
343
+ value = attr_filter["value"]
344
+ python_name = name.replace("-", "_") # Convert CSS-style names
345
+
346
+ # --- Define the core value retrieval logic ---
347
+ def get_element_value(
348
+ element, name=name, python_name=python_name, selector_type=selector_type
349
+ ):
350
+ bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
351
+ if name in bbox_mapping:
352
+ bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
353
+ return bbox[bbox_mapping[name]]
247
354
 
248
355
  # Special case for region attributes
249
- if selector["type"] == "region":
356
+ if selector_type == "region":
250
357
  if name == "type":
251
- # Use normalized_type for comparison if available
252
358
  if hasattr(element, "normalized_type") and element.normalized_type:
253
- element_value = element.normalized_type
359
+ return element.normalized_type
254
360
  else:
255
- # Convert spaces to hyphens for consistency with the normalized format
256
- element_value = (
257
- getattr(element, "region_type", "").lower().replace(" ", "_")
258
- )
361
+ return getattr(element, "region_type", "").lower().replace(" ", "_")
259
362
  elif name == "model":
260
- # Special handling for model attribute in regions
261
- element_value = getattr(element, "model", None)
363
+ return getattr(element, "model", None)
262
364
  else:
263
- # Get the attribute value from the element normally
264
- element_value = getattr(element, python_name, None)
365
+ return getattr(element, python_name, None)
265
366
  else:
266
- # Get the attribute value from the element normally for non-region elements
267
- element_value = getattr(element, python_name, None)
268
-
269
- if element_value is None:
270
- return False
367
+ # General case for non-region elements
368
+ return getattr(element, python_name, None)
369
+
370
+ # --- Define the comparison function or direct check ---
371
+ filter_lambda: Callable[[Any], bool]
372
+ filter_name: str
373
+
374
+ if op == "exists":
375
+ # Special handling for attribute presence check [attr]
376
+ filter_name = f"attribute [{name} exists]"
377
+ # Lambda checks that the retrieved value is not None
378
+ filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
379
+ else:
380
+ # Handle operators with values (e.g., =, !=, *=, etc.)
381
+ compare_func: Callable[[Any, Any], bool]
382
+ op_desc = f"{op} {value!r}" # Default description
271
383
 
272
- # Apply operator
384
+ # Determine compare_func based on op (reuse existing logic)
273
385
  if op == "=":
274
- if element_value != value:
275
- return False
276
- elif op == "~=":
277
- # Approximate match (e.g., for colors)
278
- if not _is_approximate_match(element_value, value):
279
- return False
386
+ compare_func = lambda el_val, sel_val: el_val == sel_val
387
+ elif op == "!=":
388
+ compare_func = lambda el_val, sel_val: el_val != sel_val
389
+ elif op == "~":
390
+ op_desc = f"~= {value!r} (approx)"
391
+ compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
392
+ elif op == "^=":
393
+ compare_func = (
394
+ lambda el_val, sel_val: isinstance(el_val, str)
395
+ and isinstance(sel_val, str)
396
+ and el_val.startswith(sel_val)
397
+ )
398
+ elif op == "$=":
399
+ compare_func = (
400
+ lambda el_val, sel_val: isinstance(el_val, str)
401
+ and isinstance(sel_val, str)
402
+ and el_val.endswith(sel_val)
403
+ )
404
+ elif op == "*=":
405
+ if name == "fontname":
406
+ op_desc = f"*= {value!r} (contains, case-insensitive)"
407
+ compare_func = (
408
+ lambda el_val, sel_val: isinstance(el_val, str)
409
+ and isinstance(sel_val, str)
410
+ and sel_val.lower() in el_val.lower()
411
+ )
412
+ else:
413
+ op_desc = f"*= {value!r} (contains)"
414
+ compare_func = (
415
+ lambda el_val, sel_val: isinstance(el_val, str)
416
+ and isinstance(sel_val, str)
417
+ and sel_val in el_val
418
+ )
280
419
  elif op == ">=":
281
- # Greater than or equal (element value must be >= specified value)
282
- if not (
283
- isinstance(element_value, (int, float))
284
- and isinstance(value, (int, float))
285
- and element_value >= value
286
- ):
287
- return False
420
+ compare_func = (
421
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
422
+ and isinstance(sel_val, (int, float))
423
+ and el_val >= sel_val
424
+ )
288
425
  elif op == "<=":
289
- # Less than or equal (element value must be <= specified value)
290
- if not (
291
- isinstance(element_value, (int, float))
292
- and isinstance(value, (int, float))
293
- and element_value <= value
294
- ):
295
- return False
426
+ compare_func = (
427
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
428
+ and isinstance(sel_val, (int, float))
429
+ and el_val <= sel_val
430
+ )
296
431
  elif op == ">":
297
- # Greater than (element value must be > specified value)
298
- if not (
299
- isinstance(element_value, (int, float))
300
- and isinstance(value, (int, float))
301
- and element_value > value
302
- ):
303
- return False
432
+ compare_func = (
433
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
434
+ and isinstance(sel_val, (int, float))
435
+ and el_val > sel_val
436
+ )
304
437
  elif op == "<":
305
- # Less than (element value must be < specified value)
306
- if not (
307
- isinstance(element_value, (int, float))
308
- and isinstance(value, (int, float))
309
- and element_value < value
310
- ):
311
- return False
312
-
313
- # Check pseudo-classes
314
- for pseudo in selector["pseudo_classes"]:
315
- name = pseudo["name"]
316
- args = pseudo["args"]
317
-
318
- # Handle various pseudo-classes
319
- if name == "contains" and hasattr(element, "text"):
320
- use_regex = kwargs.get("regex", False)
321
- ignore_case = not kwargs.get("case", True)
438
+ compare_func = (
439
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
440
+ and isinstance(sel_val, (int, float))
441
+ and el_val < sel_val
442
+ )
443
+ else:
444
+ # Should not happen with current parsing logic
445
+ logger.warning(
446
+ f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
447
+ )
448
+ continue # Skip this attribute filter
449
+
450
+ # --- Create the final filter function for operators with values ---
451
+ filter_name = f"attribute [{name}{op_desc}]"
452
+ # Capture loop variables correctly in the lambda
453
+ filter_lambda = (
454
+ lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
455
+ element_value := get_val(el)
456
+ )
457
+ is not None
458
+ and compare(element_value, expected_val)
459
+ )
460
+
461
+ filters.append({"name": filter_name, "func": filter_lambda})
462
+
463
+ # Filter by pseudo-classes
464
+ for pseudo in selector["pseudo_classes"]:
465
+ name = pseudo["name"]
466
+ args = pseudo["args"]
467
+ filter_lambda = None
468
+ # Start with a base name, modify for specifics like :not
469
+ filter_name = f"pseudo-class :{name}"
470
+
471
+ # Relational pseudo-classes are handled separately by the caller
472
+ if name in ("above", "below", "near", "left-of", "right-of"):
473
+ continue
474
+
475
+ # --- Handle :not() ---
476
+ elif name == "not":
477
+ if not isinstance(args, dict): # args should be the parsed inner selector
478
+ logger.error(f"Invalid arguments for :not pseudo-class: {args}")
479
+ raise TypeError(
480
+ "Internal error: :not pseudo-class requires a parsed selector dictionary as args."
481
+ )
482
+
483
+ # Recursively get the filter function for the inner selector
484
+ # Pass kwargs down in case regex/case flags affect the inner selector
485
+ inner_filter_func = selector_to_filter_func(args, **kwargs)
486
+
487
+ # The filter lambda applies the inner function and inverts the result
488
+ filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
489
+
490
+ # Try to create a descriptive name (can be long)
491
+ # Maybe simplify this later if needed
492
+ inner_filter_list = _build_filter_list(args, **kwargs)
493
+ inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
494
+ filter_name = f"pseudo-class :not({inner_filter_names})"
495
+
496
+ # --- Handle text-based pseudo-classes ---
497
+ elif name == "contains" and args is not None:
498
+ use_regex = kwargs.get("regex", False)
499
+ ignore_case = not kwargs.get("case", True) # Default case sensitive
500
+ filter_name = (
501
+ f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
502
+ )
503
+
504
+ def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
505
+ if not hasattr(element, "text") or not element.text:
506
+ return False # Element must have non-empty text
507
+
508
+ element_text = element.text
509
+ search_term = str(args) # Ensure args is string
322
510
 
323
511
  if use_regex:
324
- import re
325
-
326
- if not element.text:
327
- return False
328
512
  try:
329
- pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
330
- if not pattern.search(element.text):
331
- return False
332
- except re.error:
333
- # If regex is invalid, fall back to literal text search
334
- element_text = element.text
335
- search_text = args
336
-
513
+ pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
514
+ return bool(pattern.search(element_text))
515
+ except re.error as e:
516
+ logger.warning(
517
+ f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
518
+ )
519
+ # Fallback to literal search on regex error
337
520
  if ignore_case:
338
- element_text = element_text.lower()
339
- search_text = search_text.lower()
521
+ return search_term.lower() in element_text.lower()
522
+ else:
523
+ return search_term in element_text
524
+ else: # Literal search
525
+ if ignore_case:
526
+ return search_term.lower() in element_text.lower()
527
+ else:
528
+ return search_term in element_text
529
+
530
+ filter_lambda = contains_check
531
+
532
+ elif name == "starts-with" and args is not None:
533
+ filter_lambda = (
534
+ lambda el, arg=args: hasattr(el, "text")
535
+ and el.text
536
+ and el.text.startswith(str(arg))
537
+ )
538
+ elif name == "ends-with" and args is not None:
539
+ filter_lambda = (
540
+ lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
541
+ )
542
+
543
+ # Boolean attribute pseudo-classes
544
+ elif name == "bold":
545
+ filter_lambda = lambda el: hasattr(el, "bold") and el.bold
546
+ elif name == "italic":
547
+ filter_lambda = lambda el: hasattr(el, "italic") and el.italic
548
+ elif name == "horizontal":
549
+ filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
550
+ elif name == "vertical":
551
+ filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
552
+
553
+ # Check predefined lambda functions (e.g., :first-child, :empty)
554
+ elif name in PSEUDO_CLASS_FUNCTIONS:
555
+ filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
556
+ filter_name = f"pseudo-class :{name}" # Set name for predefined ones
557
+ else:
558
+ raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
340
559
 
341
- if search_text not in element_text:
342
- return False
343
- else:
344
- # String comparison with case sensitivity option
345
- if not element.text:
346
- return False
560
+ if filter_lambda:
561
+ # Use the potentially updated filter_name
562
+ filters.append({"name": filter_name, "func": filter_lambda})
347
563
 
348
- element_text = element.text
349
- search_text = args
564
+ return filters
350
565
 
351
- if ignore_case:
352
- element_text = element_text.lower()
353
- search_text = search_text.lower()
354
566
 
355
- if search_text not in element_text:
356
- return False
357
- elif name == "starts-with" and hasattr(element, "text"):
358
- if not element.text or not element.text.startswith(args):
359
- return False
360
- elif name == "ends-with" and hasattr(element, "text"):
361
- if not element.text or not element.text.endswith(args):
362
- return False
363
- elif name == "bold":
364
- if not (hasattr(element, "bold") and element.bold):
365
- return False
366
- elif name == "italic":
367
- if not (hasattr(element, "italic") and element.italic):
368
- return False
369
- elif name == "horizontal":
370
- if not (hasattr(element, "is_horizontal") and element.is_horizontal):
371
- return False
372
- elif name == "vertical":
373
- if not (hasattr(element, "is_vertical") and element.is_vertical):
567
+ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool]:
568
+ """
569
+ Combine a list of named filter functions into a single callable.
570
+
571
+ Args:
572
+ filters: List of dictionaries, each with 'name' and 'func'.
573
+
574
+ Returns:
575
+ A single function that takes an element and returns True only if
576
+ it passes ALL filters in the list.
577
+ """
578
+
579
+ def combined_filter(element):
580
+ for f in filters:
581
+ try:
582
+ if not f["func"](element):
374
583
  return False
375
- else:
376
- # Check pseudo-classes (basic ones like :bold, :italic)
377
- if name in PSEUDO_CLASS_FUNCTIONS:
378
- if not PSEUDO_CLASS_FUNCTIONS[name](element):
379
- return False
380
- elif name == "contains":
381
- if not hasattr(element, "text") or not element.text:
382
- return False
383
- text_to_check = element.text
384
- search_term = args
385
- if not kwargs.get("case", True): # Check case flag from kwargs
386
- text_to_check = text_to_check.lower()
387
- search_term = search_term.lower()
388
-
389
- if kwargs.get("regex", False): # Check regex flag from kwargs
390
- try:
391
- if not re.search(search_term, text_to_check):
392
- return False
393
- except re.error as e:
394
- logger.warning(
395
- f"Invalid regex in :contains selector '{search_term}': {e}"
396
- )
397
- return False # Invalid regex cannot match
398
- else:
399
- if search_term not in text_to_check:
400
- return False
401
- # Skip complex pseudo-classes like :near, :above here, handled later
402
- elif name in ("above", "below", "near", "left-of", "right-of"):
403
- pass # Handled separately after initial filtering
404
- else:
405
- # Optionally log unknown pseudo-classes
406
- # logger.warning(f"Unknown pseudo-class: {name}")
407
- pass
584
+ except Exception as e:
585
+ logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
586
+ return False # Treat errors as filter failures
587
+ return True
588
+
589
+ return combined_filter
590
+
591
+
592
+ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
593
+ """
594
+ Convert a parsed selector to a single filter function.
595
+
596
+ Internally, this builds a list of individual filters and then combines them.
597
+ To inspect the individual filters, call `_build_filter_list` directly.
598
+
599
+ Args:
600
+ selector: Parsed selector dictionary
601
+ **kwargs: Additional filter parameters (e.g., regex, case).
602
+
603
+ Returns:
604
+ Function that takes an element and returns True if it matches the selector.
605
+ """
606
+ filter_list = _build_filter_list(selector, **kwargs)
408
607
 
409
- return True # Element passes all attribute and simple pseudo-class filters
608
+ if logger.isEnabledFor(logging.DEBUG):
609
+ filter_names = [f["name"] for f in filter_list]
610
+ logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
410
611
 
411
- return filter_func
612
+ return _assemble_filter_func(filter_list)