natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,11 @@ CSS-like selector parser for natural-pdf.
3
3
  """
4
4
 
5
5
  import ast
6
+ import logging
6
7
  import re
7
- from typing import Any, Dict, List, Optional, Tuple, Union, Callable
8
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
9
 
9
10
  from colour import Color
10
- import logging
11
11
 
12
12
  logger = logging.getLogger(__name__)
13
13
 
@@ -89,31 +89,37 @@ def parse_selector(selector: str) -> Dict[str, Any]:
89
89
  """
90
90
  result = {
91
91
  "type": "any",
92
- "attributes": {},
92
+ "attributes": [],
93
93
  "pseudo_classes": [],
94
- "filters": [], # Keep this for potential future use
94
+ "filters": [], # Keep this for potential future use
95
95
  }
96
96
 
97
- original_selector_for_error = selector # Keep for error messages
97
+ original_selector_for_error = selector # Keep for error messages
98
98
  if not selector or not isinstance(selector, str):
99
99
  return result
100
100
 
101
101
  selector = selector.strip()
102
102
 
103
- # --- NEW: Handle wildcard selector explicitly ---
103
+ # --- Handle wildcard selector explicitly ---
104
104
  if selector == "*":
105
105
  # Wildcard matches any type, already the default.
106
106
  # Clear selector so the loop doesn't run and error out.
107
- selector = ""
107
+ selector = ""
108
108
  # --- END NEW ---
109
109
 
110
110
  # 1. Extract type (optional, at the beginning)
111
111
  # Only run if selector wasn't '*'
112
- if selector:
112
+ if selector:
113
+ type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
114
+ if type_match:
115
+ result["type"] = type_match.group(1).lower()
116
+ selector = selector[len(type_match.group(0)) :].strip()
117
+ # Only run if selector wasn't '*'
118
+ if selector:
113
119
  type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
114
120
  if type_match:
115
121
  result["type"] = type_match.group(1).lower()
116
- selector = selector[len(type_match.group(0)):].strip()
122
+ selector = selector[len(type_match.group(0)) :].strip()
117
123
 
118
124
  # Regexes for parts at the START of the remaining string
119
125
  # Attribute: Starts with [, ends with ], content is non-greedy non-] chars
@@ -133,58 +139,74 @@ def parse_selector(selector: str) -> Dict[str, Any]:
133
139
  block_content = attr_match.group(1).strip()
134
140
  # Parse the content inside the block
135
141
  # Pattern: name, optional op, optional value
136
- detail_match = re.match(r"^([a-zA-Z_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content)
142
+ detail_match = re.match(
143
+ r"^([a-zA-Z0-9_\-]+)\s*(?:(>=|<=|>|<|!=|[\*\~\^\$]?=)\s*(.*?))?$", block_content
144
+ )
137
145
  if not detail_match:
138
- raise ValueError(f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'")
146
+ raise ValueError(
147
+ f"Invalid attribute syntax inside block: '[{block_content}]'. Full selector: '{original_selector_for_error}'"
148
+ )
139
149
 
140
150
  name, op, value_str = detail_match.groups()
141
151
 
142
152
  if op is None:
143
- # Presence selector [attr]
144
- result["attributes"][name] = {"op": "exists", "value": None}
153
+ # Presence selector [attr]
154
+ result["attributes"].append({"name": name, "op": "exists", "value": None})
145
155
  else:
146
- # Operator exists, value must also exist (even if empty via quotes)
147
- if value_str is None: # Catches invalid [attr=]
148
- raise ValueError(
149
- f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
150
- )
151
- # Parse value
152
- parsed_value: Any
153
- if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
154
- parsed_value = safe_parse_color(value_str)
155
- else:
156
- parsed_value = safe_parse_value(value_str) # Handles quotes
157
- result["attributes"][name] = {"op": op, "value": parsed_value}
158
-
159
- selector = selector[attr_match.end():].strip()
156
+ # Operator exists, value must also exist (even if empty via quotes)
157
+ if value_str is None: # Catches invalid [attr=]
158
+ raise ValueError(
159
+ f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
160
+ )
161
+ # Parse value
162
+ parsed_value: Any
163
+ if name in [
164
+ "color",
165
+ "non_stroking_color",
166
+ "fill",
167
+ "stroke",
168
+ "strokeColor",
169
+ "fillColor",
170
+ ]:
171
+ parsed_value = safe_parse_color(value_str)
172
+ else:
173
+ parsed_value = safe_parse_value(value_str) # Handles quotes
174
+ result["attributes"].append({"name": name, "op": op, "value": parsed_value})
175
+
176
+ selector = selector[attr_match.end() :].strip()
160
177
  processed_chunk = True
161
178
  continue
162
179
 
163
180
  # Check for :not(...) block
164
181
  if selector.lower().startswith(not_pseudo_prefix):
165
- start_index = len(not_pseudo_prefix) - 1 # Index of '('
182
+ start_index = len(not_pseudo_prefix) - 1 # Index of '('
166
183
  nesting = 1
167
184
  end_index = -1
168
185
  for i in range(start_index + 1, len(selector)):
169
- if selector[i] == '(': nesting += 1
170
- elif selector[i] == ')':
186
+ if selector[i] == "(":
187
+ nesting += 1
188
+ elif selector[i] == ")":
171
189
  nesting -= 1
172
190
  if nesting == 0:
173
191
  end_index = i
174
192
  break
175
193
 
176
194
  if end_index == -1:
177
- raise ValueError(f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'")
195
+ raise ValueError(
196
+ f"Mismatched parenthesis in :not() selector near '{selector}'. Full selector: '{original_selector_for_error}'"
197
+ )
178
198
 
179
199
  inner_selector_str = selector[start_index + 1 : end_index].strip()
180
200
  if not inner_selector_str:
181
- raise ValueError(f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'")
201
+ raise ValueError(
202
+ f"Empty selector inside :not(). Full selector: '{original_selector_for_error}'"
203
+ )
182
204
 
183
205
  # Recursively parse the inner selector
184
206
  parsed_inner_selector = parse_selector(inner_selector_str)
185
- result["pseudo_classes"].append({'name': 'not', 'args': parsed_inner_selector})
207
+ result["pseudo_classes"].append({"name": "not", "args": parsed_inner_selector})
186
208
 
187
- selector = selector[end_index + 1:].strip()
209
+ selector = selector[end_index + 1 :].strip()
188
210
  processed_chunk = True
189
211
  continue
190
212
 
@@ -192,25 +214,27 @@ def parse_selector(selector: str) -> Dict[str, Any]:
192
214
  pseudo_match = pseudo_pattern.match(selector)
193
215
  if pseudo_match:
194
216
  name, args_str = pseudo_match.groups()
195
- name = name.lower() # Normalize pseudo-class name
196
- processed_args = args_str # Keep as string initially, or None
217
+ name = name.lower() # Normalize pseudo-class name
218
+ processed_args = args_str # Keep as string initially, or None
197
219
 
198
220
  if args_str is not None:
199
221
  # Only parse args if they exist and based on the pseudo-class type
200
- if name in ["color", "background"]:
222
+ if name in ["color", "background"]:
201
223
  processed_args = safe_parse_color(args_str)
202
- else:
224
+ else:
203
225
  processed_args = safe_parse_value(args_str)
204
226
  # else: args remain None
205
227
 
206
228
  result["pseudo_classes"].append({"name": name, "args": processed_args})
207
- selector = selector[pseudo_match.end():].strip()
229
+ selector = selector[pseudo_match.end() :].strip()
208
230
  processed_chunk = True
209
231
  continue
210
232
 
211
233
  # If we reach here and the selector string is not empty, something is wrong
212
234
  if not processed_chunk and selector:
213
- raise ValueError(f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'")
235
+ raise ValueError(
236
+ f"Invalid or unexpected syntax near '{selector[:30]}...'. Full selector: '{original_selector_for_error}'"
237
+ )
214
238
 
215
239
  return result
216
240
 
@@ -263,12 +287,8 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
263
287
  PSEUDO_CLASS_FUNCTIONS = {
264
288
  "bold": lambda el: hasattr(el, "bold") and el.bold,
265
289
  "italic": lambda el: hasattr(el, "italic") and el.italic,
266
- "first-child": lambda el: hasattr(el, "parent")
267
- and el.parent
268
- and el.parent.children[0] == el,
269
- "last-child": lambda el: hasattr(el, "parent")
270
- and el.parent
271
- and el.parent.children[-1] == el,
290
+ "first-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[0] == el,
291
+ "last-child": lambda el: hasattr(el, "parent") and el.parent and el.parent.children[-1] == el,
272
292
  "empty": lambda el: not el.text,
273
293
  "not-empty": lambda el: el.text,
274
294
  "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
@@ -308,34 +328,44 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
308
328
  func = lambda el: (
309
329
  hasattr(el, "normalized_type") and el.normalized_type == selector_type
310
330
  ) or (
311
- not hasattr(el, "normalized_type") # Only check element.type if normalized_type doesn't exist/match
312
- and hasattr(el, "type") and el.type == selector_type
331
+ not hasattr(
332
+ el, "normalized_type"
333
+ ) # Only check element.type if normalized_type doesn't exist/match
334
+ and hasattr(el, "type")
335
+ and el.type == selector_type
313
336
  )
314
337
  filters.append({"name": filter_name, "func": func})
315
338
 
316
-
317
339
  # Filter by attributes
318
- for name, attr_info in selector["attributes"].items():
319
- op = attr_info["op"]
320
- value = attr_info["value"]
321
- python_name = name.replace("-", "_") # Convert CSS-style names
340
+ for attr_filter in selector["attributes"]:
341
+ name = attr_filter["name"]
342
+ op = attr_filter["op"]
343
+ value = attr_filter["value"]
344
+ python_name = name.replace("-", "_") # Convert CSS-style names
322
345
 
323
346
  # --- Define the core value retrieval logic ---
324
- def get_element_value(element, name=name, python_name=python_name, selector_type=selector_type):
325
- # Special case for region attributes
326
- if selector_type == "region":
327
- if name == "type":
328
- if hasattr(element, "normalized_type") and element.normalized_type:
329
- return element.normalized_type
330
- else:
331
- return getattr(element, "region_type", "").lower().replace(" ", "_")
332
- elif name == "model":
333
- return getattr(element, "model", None)
334
- else:
335
- return getattr(element, python_name, None)
336
- else:
337
- # General case for non-region elements
338
- return getattr(element, python_name, None)
347
+ def get_element_value(
348
+ element, name=name, python_name=python_name, selector_type=selector_type
349
+ ):
350
+ bbox_mapping = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}
351
+ if name in bbox_mapping:
352
+ bbox = getattr(element, "_bbox", None) or getattr(element, "bbox", None)
353
+ return bbox[bbox_mapping[name]]
354
+
355
+ # Special case for region attributes
356
+ if selector_type == "region":
357
+ if name == "type":
358
+ if hasattr(element, "normalized_type") and element.normalized_type:
359
+ return element.normalized_type
360
+ else:
361
+ return getattr(element, "region_type", "").lower().replace(" ", "_")
362
+ elif name == "model":
363
+ return getattr(element, "model", None)
364
+ else:
365
+ return getattr(element, python_name, None)
366
+ else:
367
+ # General case for non-region elements
368
+ return getattr(element, python_name, None)
339
369
 
340
370
  # --- Define the comparison function or direct check ---
341
371
  filter_lambda: Callable[[Any], bool]
@@ -345,14 +375,11 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
345
375
  # Special handling for attribute presence check [attr]
346
376
  filter_name = f"attribute [{name} exists]"
347
377
  # Lambda checks that the retrieved value is not None
348
- filter_lambda = (
349
- lambda el, get_val=get_element_value:
350
- get_val(el) is not None
351
- )
378
+ filter_lambda = lambda el, get_val=get_element_value: get_val(el) is not None
352
379
  else:
353
380
  # Handle operators with values (e.g., =, !=, *=, etc.)
354
381
  compare_func: Callable[[Any, Any], bool]
355
- op_desc = f"{op} {value!r}" # Default description
382
+ op_desc = f"{op} {value!r}" # Default description
356
383
 
357
384
  # Determine compare_func based on op (reuse existing logic)
358
385
  if op == "=":
@@ -363,40 +390,76 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
363
390
  op_desc = f"~= {value!r} (approx)"
364
391
  compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
365
392
  elif op == "^=":
366
- compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.startswith(sel_val)
393
+ compare_func = (
394
+ lambda el_val, sel_val: isinstance(el_val, str)
395
+ and isinstance(sel_val, str)
396
+ and el_val.startswith(sel_val)
397
+ )
367
398
  elif op == "$=":
368
- compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and el_val.endswith(sel_val)
399
+ compare_func = (
400
+ lambda el_val, sel_val: isinstance(el_val, str)
401
+ and isinstance(sel_val, str)
402
+ and el_val.endswith(sel_val)
403
+ )
369
404
  elif op == "*=":
370
405
  if name == "fontname":
371
- op_desc = f"*= {value!r} (contains, case-insensitive)"
372
- compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val.lower() in el_val.lower()
406
+ op_desc = f"*= {value!r} (contains, case-insensitive)"
407
+ compare_func = (
408
+ lambda el_val, sel_val: isinstance(el_val, str)
409
+ and isinstance(sel_val, str)
410
+ and sel_val.lower() in el_val.lower()
411
+ )
373
412
  else:
374
- op_desc = f"*= {value!r} (contains)"
375
- compare_func = lambda el_val, sel_val: isinstance(el_val, str) and isinstance(sel_val, str) and sel_val in el_val
413
+ op_desc = f"*= {value!r} (contains)"
414
+ compare_func = (
415
+ lambda el_val, sel_val: isinstance(el_val, str)
416
+ and isinstance(sel_val, str)
417
+ and sel_val in el_val
418
+ )
376
419
  elif op == ">=":
377
- compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val >= sel_val
420
+ compare_func = (
421
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
422
+ and isinstance(sel_val, (int, float))
423
+ and el_val >= sel_val
424
+ )
378
425
  elif op == "<=":
379
- compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val <= sel_val
426
+ compare_func = (
427
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
428
+ and isinstance(sel_val, (int, float))
429
+ and el_val <= sel_val
430
+ )
380
431
  elif op == ">":
381
- compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val > sel_val
432
+ compare_func = (
433
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
434
+ and isinstance(sel_val, (int, float))
435
+ and el_val > sel_val
436
+ )
382
437
  elif op == "<":
383
- compare_func = lambda el_val, sel_val: isinstance(el_val, (int, float)) and isinstance(sel_val, (int, float)) and el_val < sel_val
438
+ compare_func = (
439
+ lambda el_val, sel_val: isinstance(el_val, (int, float))
440
+ and isinstance(sel_val, (int, float))
441
+ and el_val < sel_val
442
+ )
384
443
  else:
385
444
  # Should not happen with current parsing logic
386
- logger.warning(f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'")
387
- continue # Skip this attribute filter
445
+ logger.warning(
446
+ f"Unsupported operator '{op}' encountered during filter building for attribute '{name}'"
447
+ )
448
+ continue # Skip this attribute filter
388
449
 
389
450
  # --- Create the final filter function for operators with values ---
390
451
  filter_name = f"attribute [{name}{op_desc}]"
391
452
  # Capture loop variables correctly in the lambda
392
453
  filter_lambda = (
393
- lambda el, get_val=get_element_value, compare=compare_func, expected_val=value:
394
- (element_value := get_val(el)) is not None and compare(element_value, expected_val)
454
+ lambda el, get_val=get_element_value, compare=compare_func, expected_val=value: (
455
+ element_value := get_val(el)
456
+ )
457
+ is not None
458
+ and compare(element_value, expected_val)
395
459
  )
396
460
 
397
461
  filters.append({"name": filter_name, "func": filter_lambda})
398
462
 
399
-
400
463
  # Filter by pseudo-classes
401
464
  for pseudo in selector["pseudo_classes"]:
402
465
  name = pseudo["name"]
@@ -407,62 +470,75 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
407
470
 
408
471
  # Relational pseudo-classes are handled separately by the caller
409
472
  if name in ("above", "below", "near", "left-of", "right-of"):
410
- continue
473
+ continue
411
474
 
412
- # --- Handle :not() ---
475
+ # --- Handle :not() ---
413
476
  elif name == "not":
414
- if not isinstance(args, dict): # args should be the parsed inner selector
415
- logger.error(f"Invalid arguments for :not pseudo-class: {args}")
416
- raise TypeError("Internal error: :not pseudo-class requires a parsed selector dictionary as args.")
417
-
418
- # Recursively get the filter function for the inner selector
419
- # Pass kwargs down in case regex/case flags affect the inner selector
420
- inner_filter_func = selector_to_filter_func(args, **kwargs)
421
-
422
- # The filter lambda applies the inner function and inverts the result
423
- filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
424
-
425
- # Try to create a descriptive name (can be long)
426
- # Maybe simplify this later if needed
427
- inner_filter_list = _build_filter_list(args, **kwargs)
428
- inner_filter_names = ", ".join([f['name'] for f in inner_filter_list])
429
- filter_name = f"pseudo-class :not({inner_filter_names})"
430
-
431
- # --- Handle text-based pseudo-classes ---
477
+ if not isinstance(args, dict): # args should be the parsed inner selector
478
+ logger.error(f"Invalid arguments for :not pseudo-class: {args}")
479
+ raise TypeError(
480
+ "Internal error: :not pseudo-class requires a parsed selector dictionary as args."
481
+ )
482
+
483
+ # Recursively get the filter function for the inner selector
484
+ # Pass kwargs down in case regex/case flags affect the inner selector
485
+ inner_filter_func = selector_to_filter_func(args, **kwargs)
486
+
487
+ # The filter lambda applies the inner function and inverts the result
488
+ filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
489
+
490
+ # Try to create a descriptive name (can be long)
491
+ # Maybe simplify this later if needed
492
+ inner_filter_list = _build_filter_list(args, **kwargs)
493
+ inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
494
+ filter_name = f"pseudo-class :not({inner_filter_names})"
495
+
496
+ # --- Handle text-based pseudo-classes ---
432
497
  elif name == "contains" and args is not None:
433
498
  use_regex = kwargs.get("regex", False)
434
- ignore_case = not kwargs.get("case", True) # Default case sensitive
435
- filter_name = f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
499
+ ignore_case = not kwargs.get("case", True) # Default case sensitive
500
+ filter_name = (
501
+ f"pseudo-class :contains({args!r}, regex={use_regex}, ignore_case={ignore_case})"
502
+ )
436
503
 
437
504
  def contains_check(element, args=args, use_regex=use_regex, ignore_case=ignore_case):
438
- if not hasattr(element, "text") or not element.text:
439
- return False # Element must have non-empty text
440
-
441
- element_text = element.text
442
- search_term = str(args) # Ensure args is string
443
-
444
- if use_regex:
445
- try:
446
- pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
447
- return bool(pattern.search(element_text))
448
- except re.error as e:
449
- logger.warning(f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search.")
450
- # Fallback to literal search on regex error
451
- if ignore_case:
452
- return search_term.lower() in element_text.lower()
453
- else:
454
- return search_term in element_text
455
- else: # Literal search
456
- if ignore_case:
457
- return search_term.lower() in element_text.lower()
458
- else:
459
- return search_term in element_text
505
+ if not hasattr(element, "text") or not element.text:
506
+ return False # Element must have non-empty text
507
+
508
+ element_text = element.text
509
+ search_term = str(args) # Ensure args is string
510
+
511
+ if use_regex:
512
+ try:
513
+ pattern = re.compile(search_term, re.IGNORECASE if ignore_case else 0)
514
+ return bool(pattern.search(element_text))
515
+ except re.error as e:
516
+ logger.warning(
517
+ f"Invalid regex '{search_term}' in :contains selector: {e}. Falling back to literal search."
518
+ )
519
+ # Fallback to literal search on regex error
520
+ if ignore_case:
521
+ return search_term.lower() in element_text.lower()
522
+ else:
523
+ return search_term in element_text
524
+ else: # Literal search
525
+ if ignore_case:
526
+ return search_term.lower() in element_text.lower()
527
+ else:
528
+ return search_term in element_text
529
+
460
530
  filter_lambda = contains_check
461
531
 
462
532
  elif name == "starts-with" and args is not None:
463
- filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.startswith(str(arg))
533
+ filter_lambda = (
534
+ lambda el, arg=args: hasattr(el, "text")
535
+ and el.text
536
+ and el.text.startswith(str(arg))
537
+ )
464
538
  elif name == "ends-with" and args is not None:
465
- filter_lambda = lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
539
+ filter_lambda = (
540
+ lambda el, arg=args: hasattr(el, "text") and el.text and el.text.endswith(str(arg))
541
+ )
466
542
 
467
543
  # Boolean attribute pseudo-classes
468
544
  elif name == "bold":
@@ -477,11 +553,10 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
477
553
  # Check predefined lambda functions (e.g., :first-child, :empty)
478
554
  elif name in PSEUDO_CLASS_FUNCTIONS:
479
555
  filter_lambda = PSEUDO_CLASS_FUNCTIONS[name]
480
- filter_name = f"pseudo-class :{name}" # Set name for predefined ones
556
+ filter_name = f"pseudo-class :{name}" # Set name for predefined ones
481
557
  else:
482
558
  raise ValueError(f"Unknown or unsupported pseudo-class: ':{name}'")
483
559
 
484
-
485
560
  if filter_lambda:
486
561
  # Use the potentially updated filter_name
487
562
  filters.append({"name": filter_name, "func": filter_lambda})
@@ -500,15 +575,17 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
500
575
  A single function that takes an element and returns True only if
501
576
  it passes ALL filters in the list.
502
577
  """
578
+
503
579
  def combined_filter(element):
504
580
  for f in filters:
505
581
  try:
506
- if not f['func'](element):
582
+ if not f["func"](element):
507
583
  return False
508
584
  except Exception as e:
509
- logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
510
- return False # Treat errors as filter failures
585
+ logger.error(f"Error applying filter '{f['name']}' to element: {e}", exc_info=True)
586
+ return False # Treat errors as filter failures
511
587
  return True
588
+
512
589
  return combined_filter
513
590
 
514
591
 
@@ -529,8 +606,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
529
606
  filter_list = _build_filter_list(selector, **kwargs)
530
607
 
531
608
  if logger.isEnabledFor(logging.DEBUG):
532
- filter_names = [f['name'] for f in filter_list]
533
- logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
534
-
535
- return _assemble_filter_func(filter_list)
609
+ filter_names = [f["name"] for f in filter_list]
610
+ logger.debug(f"Assembling filters for selector {selector}: {filter_names}")
536
611
 
612
+ return _assemble_filter_func(filter_list)
@@ -3,13 +3,13 @@ OCR debug utilities for natural-pdf.
3
3
  """
4
4
 
5
5
  import base64
6
+ import importlib.resources
7
+ import importlib.util
6
8
  import io
7
9
  import json
8
10
  import os
9
- import importlib.util
10
- import importlib.resources
11
11
  import webbrowser
12
- from typing import Dict, List, Any, Optional, Union, Tuple
12
+ from typing import Any, Dict, List, Optional, Tuple, Union
13
13
 
14
14
  from PIL import Image
15
15
 
@@ -2,8 +2,8 @@
2
2
  Utilities for generating consistent identifiers.
3
3
  """
4
4
 
5
- import hashlib
6
5
  import base64
6
+ import hashlib
7
7
  import os
8
8
 
9
9
 
@@ -5,4 +5,4 @@ Shared locks for thread synchronization across the natural-pdf library.
5
5
  import threading
6
6
 
7
7
  # Global lock for PDF rendering operations to prevent PDFium concurrency issues
8
- pdf_render_lock = threading.RLock()
8
+ pdf_render_lock = threading.RLock()
@@ -2,23 +2,25 @@
2
2
  Utilities for packaging data for external processes, like correction tasks.
3
3
  """
4
4
 
5
- import os
6
5
  import base64
7
6
  import io
8
7
  import json
9
- import zipfile
10
- import tempfile
11
8
  import logging
9
+ import os
12
10
  import shutil
13
- from typing import Any, List, Union, Iterable, TYPE_CHECKING, Dict
11
+ import tempfile
12
+ import zipfile
13
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Union
14
+
14
15
  from tqdm import tqdm
16
+
15
17
  from natural_pdf.elements.text import TextElement
16
18
 
17
19
  # Import the specific PDF/Page types if possible, otherwise use Any
18
20
  if TYPE_CHECKING:
19
- from natural_pdf.core.pdf import PDF
20
- from natural_pdf.core.page import Page
21
21
  from natural_pdf.collections.pdf_collection import PDFCollection
22
+ from natural_pdf.core.page import Page
23
+ from natural_pdf.core.pdf import PDF
22
24
  else:
23
25
  PDF = Any
24
26
  Page = Any