natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,28 +1,31 @@
1
1
  """
2
2
  CSS-like selector parser for natural-pdf.
3
3
  """
4
- import re
4
+
5
5
  import ast
6
- from typing import Dict, Any, List, Optional, Union, Tuple
6
+ import re
7
+ from typing import Any, Dict, List, Optional, Tuple, Union
8
+
7
9
  from colour import Color
8
10
 
9
11
 
10
12
  def safe_parse_value(value_str: str) -> Any:
11
13
  """
12
14
  Safely parse a value string without using eval().
13
-
15
+
14
16
  Args:
15
17
  value_str: String representation of a value (number, tuple, string, etc.)
16
-
18
+
17
19
  Returns:
18
20
  Parsed value
19
21
  """
20
22
  # Strip quotes first if it's a quoted string
21
23
  value_str = value_str.strip()
22
- if (value_str.startswith('"') and value_str.endswith('"')) or \
23
- (value_str.startswith("'") and value_str.endswith("'")):
24
+ if (value_str.startswith('"') and value_str.endswith('"')) or (
25
+ value_str.startswith("'") and value_str.endswith("'")
26
+ ):
24
27
  return value_str[1:-1]
25
-
28
+
26
29
  # Try parsing as a Python literal (numbers, tuples, lists)
27
30
  try:
28
31
  return ast.literal_eval(value_str)
@@ -34,15 +37,15 @@ def safe_parse_value(value_str: str) -> Any:
34
37
  def safe_parse_color(value_str: str) -> tuple:
35
38
  """
36
39
  Parse a color value which could be an RGB tuple, color name, or hex code.
37
-
40
+
38
41
  Args:
39
42
  value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")
40
-
43
+
41
44
  Returns:
42
45
  RGB tuple (r, g, b) with values from 0 to 1
43
46
  """
44
47
  value_str = value_str.strip()
45
-
48
+
46
49
  # Try parsing as a Python literal (for RGB tuples)
47
50
  try:
48
51
  # If it's already a valid tuple or list, parse it
@@ -60,7 +63,7 @@ def safe_parse_color(value_str: str) -> tuple:
60
63
  except (ValueError, AttributeError):
61
64
  # If color parsing fails, return a default (black)
62
65
  return (0, 0, 0)
63
-
66
+
64
67
  # If we got here with a non-tuple, return default
65
68
  return (0, 0, 0)
66
69
 
@@ -68,260 +71,78 @@ def safe_parse_color(value_str: str) -> tuple:
68
71
  def parse_selector(selector: str) -> Dict[str, Any]:
69
72
  """
70
73
  Parse a CSS-like selector string into a structured selector object.
71
-
74
+
72
75
  Examples:
73
76
  - 'text:contains("Revenue")'
74
77
  - 'table:below("Financial Data")'
75
78
  - 'rect[fill=(1,0,0)]'
76
-
79
+
77
80
  Args:
78
81
  selector: CSS-like selector string
79
-
82
+
80
83
  Returns:
81
84
  Dict representing the parsed selector
82
85
  """
83
86
  # Basic structure for result
84
87
  result = {
85
- 'type': 'any', # Default to any element type
86
- 'filters': [],
87
- 'attributes': {},
88
- 'pseudo_classes': [],
88
+ "type": "any", # Default to any element type
89
+ "filters": [],
90
+ "attributes": {},
91
+ "pseudo_classes": [],
89
92
  }
90
-
93
+
91
94
  # Check if empty or None
92
95
  if not selector or not isinstance(selector, str):
93
96
  return result
94
-
97
+
95
98
  # Parse element type
96
- type_match = re.match(r'^([a-zA-Z_]+)', selector)
99
+ type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
97
100
  if type_match:
98
- result['type'] = type_match.group(1).lower()
99
- selector = selector[len(type_match.group(0)):]
100
-
101
+ result["type"] = type_match.group(1).lower()
102
+ selector = selector[len(type_match.group(0)) :]
103
+
101
104
  # Parse attributes (e.g., [color=(1,0,0)])
102
- attr_pattern = r'\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]'
105
+ attr_pattern = r"\[([a-zA-Z_]+)(>=|<=|>|<|[*~]?=)([^\]]+)\]"
103
106
  attr_matches = re.findall(attr_pattern, selector)
104
107
  for name, op, value in attr_matches:
105
108
  # Handle special parsing for color attributes
106
- if name in ['color', 'non_stroking_color', 'fill', 'stroke', 'strokeColor', 'fillColor']:
109
+ if name in ["color", "non_stroking_color", "fill", "stroke", "strokeColor", "fillColor"]:
107
110
  value = safe_parse_color(value)
108
111
  else:
109
112
  # Safe parsing for other attributes
110
113
  value = safe_parse_value(value)
111
-
114
+
112
115
  # Store attribute with operator
113
- result['attributes'][name] = {
114
- 'op': op,
115
- 'value': value
116
- }
117
-
116
+ result["attributes"][name] = {"op": op, "value": value}
117
+
118
118
  # Parse pseudo-classes (e.g., :contains("text"))
119
- pseudo_pattern = r':([a-zA-Z_]+)(?:\(([^)]+)\))?'
119
+ pseudo_pattern = r":([a-zA-Z_]+)(?:\(([^)]+)\))?"
120
120
  pseudo_matches = re.findall(pseudo_pattern, selector)
121
121
  for name, args in pseudo_matches:
122
122
  # Process arguments
123
123
  processed_args = args
124
124
  if args:
125
- if name in ['color', 'background']:
125
+ if name in ["color", "background"]:
126
126
  processed_args = safe_parse_color(args)
127
127
  else:
128
128
  processed_args = safe_parse_value(args)
129
-
130
- result['pseudo_classes'].append({
131
- 'name': name,
132
- 'args': processed_args
133
- })
134
-
135
- return result
136
129
 
130
+ result["pseudo_classes"].append({"name": name, "args": processed_args})
137
131
 
138
- def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
139
- """
140
- Convert a parsed selector to a filter function.
141
-
142
- Args:
143
- selector: Parsed selector dictionary
144
- **kwargs: Additional filter parameters including:
145
- - regex: Whether to use regex for text search
146
- - case: Whether to do case-sensitive text search
147
-
148
- Returns:
149
- Function that takes an element and returns True if it matches
150
- """
151
- def filter_func(element):
152
- # Check element type
153
- if selector['type'] != 'any':
154
- # Special handling for 'text' type to match both 'text', 'char', and 'word'
155
- if selector['type'] == 'text':
156
- if element.type not in ['text', 'char', 'word']:
157
- return False
158
- # Special handling for 'region' type to check for detected layout regions
159
- elif selector['type'] == 'region':
160
- # Check if this is a Region with region_type property
161
- if not hasattr(element, 'region_type'):
162
- return False
163
-
164
- # If 'type' attribute specified, it will be checked in the attributes section
165
- # Check for Docling-specific types (section-header, etc.)
166
- elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
167
- # This is a direct match with a Docling region type
168
- pass
169
- # Otherwise, require exact match with the element's type attribute
170
- elif not hasattr(element, 'type') or element.type != selector['type']:
171
- return False
172
-
173
- # Check attributes
174
- for name, attr_info in selector['attributes'].items():
175
- op = attr_info['op']
176
- value = attr_info['value']
177
-
178
- # Special case for fontname attribute - allow matching part of the name
179
- if name == 'fontname' and op == '*=':
180
- element_value = getattr(element, name, None)
181
- if element_value is None or value.lower() not in element_value.lower():
182
- return False
183
- continue
184
-
185
- # Convert hyphenated attribute names to underscore for Python properties
186
- python_name = name.replace('-', '_')
187
-
188
- # Special case for region attributes
189
- if selector['type'] == 'region':
190
- if name == 'type':
191
- # Use normalized_type for comparison if available
192
- if hasattr(element, 'normalized_type') and element.normalized_type:
193
- element_value = element.normalized_type
194
- else:
195
- # Convert spaces to hyphens for consistency with the normalized format
196
- element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
197
- elif name == 'model':
198
- # Special handling for model attribute in regions
199
- element_value = getattr(element, 'model', None)
200
- else:
201
- # Get the attribute value from the element normally
202
- element_value = getattr(element, python_name, None)
203
- else:
204
- # Get the attribute value from the element normally for non-region elements
205
- element_value = getattr(element, python_name, None)
206
-
207
- if element_value is None:
208
- return False
209
-
210
- # Apply operator
211
- if op == '=':
212
- if element_value != value:
213
- return False
214
- elif op == '~=':
215
- # Approximate match (e.g., for colors)
216
- if not _is_approximate_match(element_value, value):
217
- return False
218
- elif op == '>=':
219
- # Greater than or equal (element value must be >= specified value)
220
- if not (isinstance(element_value, (int, float)) and
221
- isinstance(value, (int, float)) and
222
- element_value >= value):
223
- return False
224
- elif op == '<=':
225
- # Less than or equal (element value must be <= specified value)
226
- if not (isinstance(element_value, (int, float)) and
227
- isinstance(value, (int, float)) and
228
- element_value <= value):
229
- return False
230
- elif op == '>':
231
- # Greater than (element value must be > specified value)
232
- if not (isinstance(element_value, (int, float)) and
233
- isinstance(value, (int, float)) and
234
- element_value > value):
235
- return False
236
- elif op == '<':
237
- # Less than (element value must be < specified value)
238
- if not (isinstance(element_value, (int, float)) and
239
- isinstance(value, (int, float)) and
240
- element_value < value):
241
- return False
242
-
243
- # Check pseudo-classes
244
- for pseudo in selector['pseudo_classes']:
245
- name = pseudo['name']
246
- args = pseudo['args']
247
-
248
- # Handle various pseudo-classes
249
- if name == 'contains' and hasattr(element, 'text'):
250
- use_regex = kwargs.get('regex', False)
251
- ignore_case = not kwargs.get('case', True)
252
-
253
- if use_regex:
254
- import re
255
- if not element.text:
256
- return False
257
- try:
258
- pattern = re.compile(args, re.IGNORECASE if ignore_case else 0)
259
- if not pattern.search(element.text):
260
- return False
261
- except re.error:
262
- # If regex is invalid, fall back to literal text search
263
- element_text = element.text
264
- search_text = args
265
-
266
- if ignore_case:
267
- element_text = element_text.lower()
268
- search_text = search_text.lower()
269
-
270
- if search_text not in element_text:
271
- return False
272
- else:
273
- # String comparison with case sensitivity option
274
- if not element.text:
275
- return False
276
-
277
- element_text = element.text
278
- search_text = args
279
-
280
- if ignore_case:
281
- element_text = element_text.lower()
282
- search_text = search_text.lower()
283
-
284
- if search_text not in element_text:
285
- return False
286
- elif name == 'starts-with' and hasattr(element, 'text'):
287
- if not element.text or not element.text.startswith(args):
288
- return False
289
- elif name == 'ends-with' and hasattr(element, 'text'):
290
- if not element.text or not element.text.endswith(args):
291
- return False
292
- elif name == 'bold':
293
- if not (hasattr(element, 'bold') and element.bold):
294
- return False
295
- elif name == 'italic':
296
- if not (hasattr(element, 'italic') and element.italic):
297
- return False
298
- elif name == 'horizontal':
299
- if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
300
- return False
301
- elif name == 'vertical':
302
- if not (hasattr(element, 'is_vertical') and element.is_vertical):
303
- return False
304
- else:
305
- # Potentially unsupported pseudo-class, or one handled elsewhere (like :not)
306
- pass
307
-
308
- # If we get here, all checks passed
309
- return True
310
-
311
- return filter_func
132
+ return result
312
133
 
313
134
 
314
135
  def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
315
136
  """
316
137
  Check if two values approximately match.
317
-
138
+
318
139
  This is mainly used for color comparisons with some tolerance.
319
-
140
+
320
141
  Args:
321
142
  value1: First value
322
143
  value2: Second value
323
144
  tolerance: Maximum difference allowed
324
-
145
+
325
146
  Returns:
326
147
  True if the values approximately match
327
148
  """
@@ -331,157 +152,177 @@ def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
331
152
  value1 = tuple(Color(value1).rgb)
332
153
  except:
333
154
  pass
334
-
155
+
335
156
  if isinstance(value2, str):
336
157
  try:
337
158
  value2 = tuple(Color(value2).rgb)
338
159
  except:
339
160
  pass
340
-
161
+
341
162
  # If both are tuples/lists with the same length (e.g., colors)
342
- if (isinstance(value1, (list, tuple)) and
343
- isinstance(value2, (list, tuple)) and
344
- len(value1) == len(value2)):
345
-
163
+ if (
164
+ isinstance(value1, (list, tuple))
165
+ and isinstance(value2, (list, tuple))
166
+ and len(value1) == len(value2)
167
+ ):
168
+
346
169
  # Check if all components are within tolerance
347
170
  return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
348
-
171
+
349
172
  # If both are numbers
350
173
  if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
351
174
  return abs(value1 - value2) <= tolerance
352
-
175
+
353
176
  # Default to exact match for other types
354
177
  return value1 == value2
355
178
 
356
179
 
357
180
  PSEUDO_CLASS_FUNCTIONS = {
358
- 'bold': lambda el: hasattr(el, 'bold') and el.bold,
359
- 'italic': lambda el: hasattr(el, 'italic') and el.italic,
360
- 'first-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[0] == el, # Example placeholder
361
- 'last-child': lambda el: hasattr(el, 'parent') and el.parent and el.parent.children[-1] == el, # Example placeholder
181
+ "bold": lambda el: hasattr(el, "bold") and el.bold,
182
+ "italic": lambda el: hasattr(el, "italic") and el.italic,
183
+ "first-child": lambda el: hasattr(el, "parent")
184
+ and el.parent
185
+ and el.parent.children[0] == el, # Example placeholder
186
+ "last-child": lambda el: hasattr(el, "parent")
187
+ and el.parent
188
+ and el.parent.children[-1] == el, # Example placeholder
362
189
  # Add the new pseudo-classes for negation
363
- 'not-bold': lambda el: hasattr(el, 'bold') and not el.bold,
364
- 'not-italic': lambda el: hasattr(el, 'italic') and not el.italic,
190
+ "not-bold": lambda el: hasattr(el, "bold") and not el.bold,
191
+ "not-italic": lambda el: hasattr(el, "italic") and not el.italic,
365
192
  }
366
193
 
367
194
 
368
195
  def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
369
196
  """
370
197
  Convert a parsed selector to a filter function.
371
-
198
+
372
199
  Args:
373
200
  selector: Parsed selector dictionary
374
201
  **kwargs: Additional filter parameters including:
375
202
  - regex: Whether to use regex for text search
376
203
  - case: Whether to do case-sensitive text search
377
-
204
+
378
205
  Returns:
379
206
  Function that takes an element and returns True if it matches
380
207
  """
208
+
381
209
  def filter_func(element):
382
210
  # Check element type
383
- if selector['type'] != 'any':
211
+ if selector["type"] != "any":
384
212
  # Special handling for 'text' type to match both 'text', 'char', and 'word'
385
- if selector['type'] == 'text':
386
- if element.type not in ['text', 'char', 'word']:
213
+ if selector["type"] == "text":
214
+ if element.type not in ["text", "char", "word"]:
387
215
  return False
388
216
  # Special handling for 'region' type to check for detected layout regions
389
- elif selector['type'] == 'region':
217
+ elif selector["type"] == "region":
390
218
  # Check if this is a Region with region_type property
391
- if not hasattr(element, 'region_type'):
219
+ if not hasattr(element, "region_type"):
392
220
  return False
393
-
221
+
394
222
  # If 'type' attribute specified, it will be checked in the attributes section
395
223
  # Check for Docling-specific types (section-header, etc.)
396
- elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
224
+ elif (
225
+ hasattr(element, "normalized_type") and element.normalized_type == selector["type"]
226
+ ):
397
227
  # This is a direct match with a Docling region type
398
228
  pass
399
229
  # Otherwise, require exact match with the element's type attribute
400
- elif not hasattr(element, 'type') or element.type != selector['type']:
230
+ elif not hasattr(element, "type") or element.type != selector["type"]:
401
231
  return False
402
-
232
+
403
233
  # Check attributes
404
- for name, attr_info in selector['attributes'].items():
405
- op = attr_info['op']
406
- value = attr_info['value']
407
-
234
+ for name, attr_info in selector["attributes"].items():
235
+ op = attr_info["op"]
236
+ value = attr_info["value"]
237
+
408
238
  # Special case for fontname attribute - allow matching part of the name
409
- if name == 'fontname' and op == '*=':
239
+ if name == "fontname" and op == "*=":
410
240
  element_value = getattr(element, name, None)
411
241
  if element_value is None or value.lower() not in element_value.lower():
412
242
  return False
413
243
  continue
414
-
244
+
415
245
  # Convert hyphenated attribute names to underscore for Python properties
416
- python_name = name.replace('-', '_')
417
-
246
+ python_name = name.replace("-", "_")
247
+
418
248
  # Special case for region attributes
419
- if selector['type'] == 'region':
420
- if name == 'type':
249
+ if selector["type"] == "region":
250
+ if name == "type":
421
251
  # Use normalized_type for comparison if available
422
- if hasattr(element, 'normalized_type') and element.normalized_type:
252
+ if hasattr(element, "normalized_type") and element.normalized_type:
423
253
  element_value = element.normalized_type
424
254
  else:
425
255
  # Convert spaces to hyphens for consistency with the normalized format
426
- element_value = getattr(element, 'region_type', '').lower().replace(' ', '-')
427
- elif name == 'model':
256
+ element_value = (
257
+ getattr(element, "region_type", "").lower().replace(" ", "_")
258
+ )
259
+ elif name == "model":
428
260
  # Special handling for model attribute in regions
429
- element_value = getattr(element, 'model', None)
261
+ element_value = getattr(element, "model", None)
430
262
  else:
431
263
  # Get the attribute value from the element normally
432
264
  element_value = getattr(element, python_name, None)
433
265
  else:
434
266
  # Get the attribute value from the element normally for non-region elements
435
267
  element_value = getattr(element, python_name, None)
436
-
268
+
437
269
  if element_value is None:
438
270
  return False
439
-
271
+
440
272
  # Apply operator
441
- if op == '=':
273
+ if op == "=":
442
274
  if element_value != value:
443
275
  return False
444
- elif op == '~=':
276
+ elif op == "~=":
445
277
  # Approximate match (e.g., for colors)
446
278
  if not _is_approximate_match(element_value, value):
447
279
  return False
448
- elif op == '>=':
280
+ elif op == ">=":
449
281
  # Greater than or equal (element value must be >= specified value)
450
- if not (isinstance(element_value, (int, float)) and
451
- isinstance(value, (int, float)) and
452
- element_value >= value):
282
+ if not (
283
+ isinstance(element_value, (int, float))
284
+ and isinstance(value, (int, float))
285
+ and element_value >= value
286
+ ):
453
287
  return False
454
- elif op == '<=':
288
+ elif op == "<=":
455
289
  # Less than or equal (element value must be <= specified value)
456
- if not (isinstance(element_value, (int, float)) and
457
- isinstance(value, (int, float)) and
458
- element_value <= value):
290
+ if not (
291
+ isinstance(element_value, (int, float))
292
+ and isinstance(value, (int, float))
293
+ and element_value <= value
294
+ ):
459
295
  return False
460
- elif op == '>':
296
+ elif op == ">":
461
297
  # Greater than (element value must be > specified value)
462
- if not (isinstance(element_value, (int, float)) and
463
- isinstance(value, (int, float)) and
464
- element_value > value):
298
+ if not (
299
+ isinstance(element_value, (int, float))
300
+ and isinstance(value, (int, float))
301
+ and element_value > value
302
+ ):
465
303
  return False
466
- elif op == '<':
304
+ elif op == "<":
467
305
  # Less than (element value must be < specified value)
468
- if not (isinstance(element_value, (int, float)) and
469
- isinstance(value, (int, float)) and
470
- element_value < value):
306
+ if not (
307
+ isinstance(element_value, (int, float))
308
+ and isinstance(value, (int, float))
309
+ and element_value < value
310
+ ):
471
311
  return False
472
-
312
+
473
313
  # Check pseudo-classes
474
- for pseudo in selector['pseudo_classes']:
475
- name = pseudo['name']
476
- args = pseudo['args']
477
-
314
+ for pseudo in selector["pseudo_classes"]:
315
+ name = pseudo["name"]
316
+ args = pseudo["args"]
317
+
478
318
  # Handle various pseudo-classes
479
- if name == 'contains' and hasattr(element, 'text'):
480
- use_regex = kwargs.get('regex', False)
481
- ignore_case = not kwargs.get('case', True)
482
-
319
+ if name == "contains" and hasattr(element, "text"):
320
+ use_regex = kwargs.get("regex", False)
321
+ ignore_case = not kwargs.get("case", True)
322
+
483
323
  if use_regex:
484
324
  import re
325
+
485
326
  if not element.text:
486
327
  return False
487
328
  try:
@@ -492,77 +333,79 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
492
333
  # If regex is invalid, fall back to literal text search
493
334
  element_text = element.text
494
335
  search_text = args
495
-
336
+
496
337
  if ignore_case:
497
338
  element_text = element_text.lower()
498
339
  search_text = search_text.lower()
499
-
340
+
500
341
  if search_text not in element_text:
501
342
  return False
502
343
  else:
503
344
  # String comparison with case sensitivity option
504
345
  if not element.text:
505
346
  return False
506
-
347
+
507
348
  element_text = element.text
508
349
  search_text = args
509
-
350
+
510
351
  if ignore_case:
511
352
  element_text = element_text.lower()
512
353
  search_text = search_text.lower()
513
-
354
+
514
355
  if search_text not in element_text:
515
356
  return False
516
- elif name == 'starts-with' and hasattr(element, 'text'):
357
+ elif name == "starts-with" and hasattr(element, "text"):
517
358
  if not element.text or not element.text.startswith(args):
518
359
  return False
519
- elif name == 'ends-with' and hasattr(element, 'text'):
360
+ elif name == "ends-with" and hasattr(element, "text"):
520
361
  if not element.text or not element.text.endswith(args):
521
362
  return False
522
- elif name == 'bold':
523
- if not (hasattr(element, 'bold') and element.bold):
363
+ elif name == "bold":
364
+ if not (hasattr(element, "bold") and element.bold):
524
365
  return False
525
- elif name == 'italic':
526
- if not (hasattr(element, 'italic') and element.italic):
366
+ elif name == "italic":
367
+ if not (hasattr(element, "italic") and element.italic):
527
368
  return False
528
- elif name == 'horizontal':
529
- if not (hasattr(element, 'is_horizontal') and element.is_horizontal):
369
+ elif name == "horizontal":
370
+ if not (hasattr(element, "is_horizontal") and element.is_horizontal):
530
371
  return False
531
- elif name == 'vertical':
532
- if not (hasattr(element, 'is_vertical') and element.is_vertical):
372
+ elif name == "vertical":
373
+ if not (hasattr(element, "is_vertical") and element.is_vertical):
533
374
  return False
534
375
  else:
535
376
  # Check pseudo-classes (basic ones like :bold, :italic)
536
377
  if name in PSEUDO_CLASS_FUNCTIONS:
537
378
  if not PSEUDO_CLASS_FUNCTIONS[name](element):
538
379
  return False
539
- elif name == 'contains':
540
- if not hasattr(element, 'text') or not element.text:
380
+ elif name == "contains":
381
+ if not hasattr(element, "text") or not element.text:
541
382
  return False
542
383
  text_to_check = element.text
543
384
  search_term = args
544
- if not kwargs.get('case', True): # Check case flag from kwargs
385
+ if not kwargs.get("case", True): # Check case flag from kwargs
545
386
  text_to_check = text_to_check.lower()
546
387
  search_term = search_term.lower()
547
-
548
- if kwargs.get('regex', False): # Check regex flag from kwargs
388
+
389
+ if kwargs.get("regex", False): # Check regex flag from kwargs
549
390
  try:
550
391
  if not re.search(search_term, text_to_check):
551
392
  return False
552
393
  except re.error as e:
553
- logger.warning(f"Invalid regex in :contains selector '{search_term}': {e}")
554
- return False # Invalid regex cannot match
394
+ logger.warning(
395
+ f"Invalid regex in :contains selector '{search_term}': {e}"
396
+ )
397
+ return False # Invalid regex cannot match
555
398
  else:
556
399
  if search_term not in text_to_check:
557
400
  return False
558
401
  # Skip complex pseudo-classes like :near, :above here, handled later
559
- elif name in ('above', 'below', 'near', 'left-of', 'right-of'):
560
- pass # Handled separately after initial filtering
402
+ elif name in ("above", "below", "near", "left-of", "right-of"):
403
+ pass # Handled separately after initial filtering
561
404
  else:
562
- # Optionally log unknown pseudo-classes
563
- # logger.warning(f"Unknown pseudo-class: {name}")
564
- pass
565
-
566
- return True # Element passes all attribute and simple pseudo-class filters
405
+ # Optionally log unknown pseudo-classes
406
+ # logger.warning(f"Unknown pseudo-class: {name}")
407
+ pass
567
408
 
568
- return filter_func
409
+ return True # Element passes all attribute and simple pseudo-class filters
410
+
411
+ return filter_func