natural-pdf 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -7,6 +7,9 @@ import logging
|
|
7
7
|
import re
|
8
8
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
9
9
|
|
10
|
+
from colormath2.color_conversions import convert_color
|
11
|
+
from colormath2.color_diff import delta_e_cie2000
|
12
|
+
from colormath2.color_objects import LabColor, sRGBColor
|
10
13
|
from colour import Color
|
11
14
|
|
12
15
|
logger = logging.getLogger(__name__)
|
@@ -39,13 +42,16 @@ def safe_parse_value(value_str: str) -> Any:
|
|
39
42
|
|
40
43
|
def safe_parse_color(value_str: str) -> tuple:
|
41
44
|
"""
|
42
|
-
Parse a color value which could be an RGB tuple, color name, or
|
45
|
+
Parse a color value which could be an RGB tuple, color name, hex code, or CSS-style rgb(...)/rgba(...).
|
43
46
|
|
44
47
|
Args:
|
45
|
-
value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")
|
48
|
+
value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)", "rgb(0,0,255)")
|
46
49
|
|
47
50
|
Returns:
|
48
51
|
RGB tuple (r, g, b) with values from 0 to 1
|
52
|
+
|
53
|
+
Raises:
|
54
|
+
ValueError: If the color cannot be parsed
|
49
55
|
"""
|
50
56
|
value_str = value_str.strip()
|
51
57
|
|
@@ -57,83 +63,100 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
57
63
|
# Return just the RGB components as a tuple
|
58
64
|
return tuple(color_tuple[:3])
|
59
65
|
except (SyntaxError, ValueError):
|
60
|
-
# Not a valid tuple/list, try
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
66
|
+
pass # Not a valid tuple/list, try other formats
|
67
|
+
|
68
|
+
# Try parsing CSS-style rgb(...) or rgba(...)
|
69
|
+
css_rgb_match = re.match(
|
70
|
+
r"rgb\s*\(\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)", value_str, re.IGNORECASE
|
71
|
+
)
|
72
|
+
css_rgba_match = re.match(
|
73
|
+
r"rgba\s*\(\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9\.]+)\s*\)",
|
74
|
+
value_str,
|
75
|
+
re.IGNORECASE,
|
76
|
+
)
|
77
|
+
if css_rgb_match:
|
78
|
+
r, g, b = map(int, css_rgb_match.groups())
|
79
|
+
return (r / 255.0, g / 255.0, b / 255.0)
|
80
|
+
elif css_rgba_match:
|
81
|
+
r, g, b, a = css_rgba_match.groups()
|
82
|
+
r, g, b = int(r), int(g), int(b)
|
83
|
+
# alpha is ignored for now, but could be used if needed
|
84
|
+
return (r / 255.0, g / 255.0, b / 255.0)
|
85
|
+
|
86
|
+
# Try as a color name or hex
|
87
|
+
try:
|
88
|
+
color = Color(value_str)
|
89
|
+
return (color.red, color.green, color.blue)
|
90
|
+
except (ValueError, AttributeError) as e:
|
91
|
+
raise ValueError(f"Could not parse color value: {value_str}") from e
|
69
92
|
|
70
|
-
# If we got here with a non-tuple,
|
71
|
-
|
93
|
+
# If we got here with a non-tuple, raise error
|
94
|
+
raise ValueError(f"Invalid color value: {value_str}")
|
72
95
|
|
73
96
|
|
74
97
|
def _split_top_level_or(selector: str) -> List[str]:
|
75
98
|
"""
|
76
99
|
Split a selector string on top-level OR operators (| or ,) only.
|
77
|
-
|
100
|
+
|
78
101
|
Respects parsing contexts and does not split when | or , appear inside:
|
79
102
|
- Quoted strings (both single and double quotes)
|
80
103
|
- Parentheses (for pseudo-class arguments like :not(...))
|
81
104
|
- Square brackets (for attribute selectors like [attr="value"])
|
82
|
-
|
105
|
+
|
83
106
|
Args:
|
84
107
|
selector: The selector string to split
|
85
|
-
|
108
|
+
|
86
109
|
Returns:
|
87
110
|
List of selector parts. If no top-level OR operators found, returns [selector].
|
88
|
-
|
111
|
+
|
89
112
|
Examples:
|
90
113
|
>>> _split_top_level_or('text:contains("a|b")|text:bold')
|
91
114
|
['text:contains("a|b")', 'text:bold']
|
92
|
-
|
115
|
+
|
93
116
|
>>> _split_top_level_or('text:contains("hello,world")')
|
94
117
|
['text:contains("hello,world")']
|
95
118
|
"""
|
96
119
|
if not selector or not isinstance(selector, str):
|
97
120
|
return [selector] if selector else []
|
98
|
-
|
121
|
+
|
99
122
|
parts = []
|
100
123
|
current_part = ""
|
101
124
|
i = 0
|
102
|
-
|
125
|
+
|
103
126
|
# Parsing state
|
104
127
|
in_double_quotes = False
|
105
128
|
in_single_quotes = False
|
106
129
|
paren_depth = 0
|
107
130
|
bracket_depth = 0
|
108
|
-
|
131
|
+
|
109
132
|
while i < len(selector):
|
110
133
|
char = selector[i]
|
111
|
-
|
134
|
+
|
112
135
|
# Handle escape sequences in quotes
|
113
|
-
if i > 0 and selector[i-1] ==
|
136
|
+
if i > 0 and selector[i - 1] == "\\":
|
114
137
|
current_part += char
|
115
138
|
i += 1
|
116
139
|
continue
|
117
|
-
|
140
|
+
|
118
141
|
# Handle quote state changes
|
119
142
|
if char == '"' and not in_single_quotes:
|
120
143
|
in_double_quotes = not in_double_quotes
|
121
144
|
elif char == "'" and not in_double_quotes:
|
122
145
|
in_single_quotes = not in_single_quotes
|
123
|
-
|
146
|
+
|
124
147
|
# Handle parentheses and brackets only when not in quotes
|
125
148
|
elif not in_double_quotes and not in_single_quotes:
|
126
|
-
if char ==
|
149
|
+
if char == "(":
|
127
150
|
paren_depth += 1
|
128
|
-
elif char ==
|
151
|
+
elif char == ")":
|
129
152
|
paren_depth -= 1
|
130
|
-
elif char ==
|
153
|
+
elif char == "[":
|
131
154
|
bracket_depth += 1
|
132
|
-
elif char ==
|
155
|
+
elif char == "]":
|
133
156
|
bracket_depth -= 1
|
134
|
-
|
157
|
+
|
135
158
|
# Check for top-level OR operators
|
136
|
-
elif (char ==
|
159
|
+
elif (char == "|" or char == ",") and paren_depth == 0 and bracket_depth == 0:
|
137
160
|
# Found a top-level OR operator
|
138
161
|
part = current_part.strip()
|
139
162
|
if part: # Only add non-empty parts
|
@@ -141,16 +164,16 @@ def _split_top_level_or(selector: str) -> List[str]:
|
|
141
164
|
current_part = ""
|
142
165
|
i += 1
|
143
166
|
continue
|
144
|
-
|
167
|
+
|
145
168
|
# Add character to current part
|
146
169
|
current_part += char
|
147
170
|
i += 1
|
148
|
-
|
171
|
+
|
149
172
|
# Add the final part
|
150
173
|
final_part = current_part.strip()
|
151
174
|
if final_part:
|
152
175
|
parts.append(final_part)
|
153
|
-
|
176
|
+
|
154
177
|
# If we only found one part, return it as a single-element list
|
155
178
|
# If we found multiple parts, those are the OR-separated parts
|
156
179
|
return parts if parts else [selector]
|
@@ -176,10 +199,10 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
176
199
|
Examples:
|
177
200
|
>>> parse_selector('text:contains("hello")') # Single selector
|
178
201
|
{'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
|
179
|
-
|
202
|
+
|
180
203
|
>>> parse_selector('text:contains("A")|text:bold') # OR with pipe
|
181
204
|
{'type': 'or', 'selectors': [...]}
|
182
|
-
|
205
|
+
|
183
206
|
>>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
|
184
207
|
{'type': 'or', 'selectors': [...]}
|
185
208
|
|
@@ -205,7 +228,7 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
205
228
|
# Check if selector contains OR operators at the top level only
|
206
229
|
# (not inside quotes, parentheses, or brackets)
|
207
230
|
or_parts = _split_top_level_or(selector)
|
208
|
-
|
231
|
+
|
209
232
|
# If we found OR parts, parse each one recursively and return compound selector
|
210
233
|
if len(or_parts) > 1:
|
211
234
|
parsed_selectors = []
|
@@ -215,22 +238,21 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
215
238
|
except (ValueError, TypeError) as e:
|
216
239
|
logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
|
217
240
|
continue
|
218
|
-
|
241
|
+
|
219
242
|
if len(parsed_selectors) > 1:
|
220
|
-
return {
|
221
|
-
"type": "or",
|
222
|
-
"selectors": parsed_selectors
|
223
|
-
}
|
243
|
+
return {"type": "or", "selectors": parsed_selectors}
|
224
244
|
elif len(parsed_selectors) == 1:
|
225
245
|
# Only one valid part, return it directly
|
226
246
|
return parsed_selectors[0]
|
227
247
|
else:
|
228
248
|
# No valid parts, return default
|
229
|
-
logger.warning(
|
249
|
+
logger.warning(
|
250
|
+
f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector"
|
251
|
+
)
|
230
252
|
return result
|
231
253
|
|
232
254
|
# --- Continue with single selector parsing (existing logic) ---
|
233
|
-
|
255
|
+
|
234
256
|
# --- Handle wildcard selector explicitly ---
|
235
257
|
if selector == "*":
|
236
258
|
# Wildcard matches any type, already the default.
|
@@ -296,6 +318,12 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
296
318
|
parsed_value = safe_parse_color(value_str)
|
297
319
|
else:
|
298
320
|
parsed_value = safe_parse_value(value_str) # Handles quotes
|
321
|
+
# If using ~= with a numeric value, warn once during parsing
|
322
|
+
if op == "~=" and isinstance(parsed_value, (int, float)):
|
323
|
+
logger.warning(
|
324
|
+
f"Using ~= with numeric values. This will match if the absolute difference is <= 2.0. "
|
325
|
+
f"Consider using explicit ranges (e.g., [width>1][width<4]) for more control."
|
326
|
+
)
|
299
327
|
result["attributes"].append({"name": name, "op": op, "value": parsed_value})
|
300
328
|
|
301
329
|
selector = selector[attr_match.end() :].strip()
|
@@ -364,46 +392,62 @@ def parse_selector(selector: str) -> Dict[str, Any]:
|
|
364
392
|
return result
|
365
393
|
|
366
394
|
|
367
|
-
def
|
395
|
+
def _is_color_value(value) -> bool:
|
368
396
|
"""
|
369
|
-
Check if
|
397
|
+
Check if a value represents a color by attempting to parse it with Color.
|
398
|
+
"""
|
399
|
+
try:
|
400
|
+
# If it's already a tuple/list, convert to tuple
|
401
|
+
if isinstance(value, (list, tuple)) and len(value) >= 3:
|
402
|
+
return True
|
403
|
+
# Otherwise try parsing as a color name/hex
|
404
|
+
Color(value)
|
405
|
+
return True
|
406
|
+
except:
|
407
|
+
return False
|
370
408
|
|
371
|
-
This is mainly used for color comparisons with some tolerance.
|
372
409
|
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
410
|
+
def _color_distance(color1, color2) -> float:
|
411
|
+
"""
|
412
|
+
Calculate Delta E color difference between two colors.
|
413
|
+
Colors can be strings (names/hex) or RGB tuples.
|
377
414
|
|
378
415
|
Returns:
|
379
|
-
|
416
|
+
Delta E value, or float('inf') if colors can't be compared
|
380
417
|
"""
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
pass
|
393
|
-
|
394
|
-
# If both are tuples/lists with the same length (e.g., colors)
|
395
|
-
if (
|
396
|
-
isinstance(value1, (list, tuple))
|
397
|
-
and isinstance(value2, (list, tuple))
|
398
|
-
and len(value1) == len(value2)
|
399
|
-
):
|
418
|
+
try:
|
419
|
+
# Convert to RGB tuples
|
420
|
+
if isinstance(color1, (list, tuple)) and len(color1) >= 3:
|
421
|
+
rgb1 = sRGBColor(*color1[:3])
|
422
|
+
else:
|
423
|
+
rgb1 = sRGBColor(*Color(color1).rgb)
|
424
|
+
|
425
|
+
if isinstance(color2, (list, tuple)) and len(color2) >= 3:
|
426
|
+
rgb2 = sRGBColor(*color2[:3])
|
427
|
+
else:
|
428
|
+
rgb2 = sRGBColor(*Color(color2).rgb)
|
400
429
|
|
401
|
-
|
402
|
-
|
430
|
+
lab1 = convert_color(rgb1, LabColor)
|
431
|
+
lab2 = convert_color(rgb2, LabColor)
|
432
|
+
return delta_e_cie2000(lab1, lab2)
|
433
|
+
except:
|
434
|
+
return float("inf")
|
403
435
|
|
404
|
-
|
436
|
+
|
437
|
+
def _is_approximate_match(value1, value2) -> bool:
|
438
|
+
"""
|
439
|
+
Check if two values approximately match.
|
440
|
+
|
441
|
+
For colors: Uses Delta E color difference with tolerance of 20.0
|
442
|
+
For numbers: Uses absolute difference with tolerance of 2.0
|
443
|
+
"""
|
444
|
+
# First check if both values are colors
|
445
|
+
if _is_color_value(value1) and _is_color_value(value2):
|
446
|
+
return _color_distance(value1, value2) <= 20.0
|
447
|
+
|
448
|
+
# Then check if both are numbers
|
405
449
|
if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
|
406
|
-
return abs(value1 - value2) <=
|
450
|
+
return abs(value1 - value2) <= 2.0
|
407
451
|
|
408
452
|
# Default to exact match for other types
|
409
453
|
return value1 == value2
|
@@ -511,7 +555,7 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
511
555
|
compare_func = lambda el_val, sel_val: el_val == sel_val
|
512
556
|
elif op == "!=":
|
513
557
|
compare_func = lambda el_val, sel_val: el_val != sel_val
|
514
|
-
elif op == "
|
558
|
+
elif op == "~=":
|
515
559
|
op_desc = f"~= {value!r} (approx)"
|
516
560
|
compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
|
517
561
|
elif op == "^=":
|
@@ -734,15 +778,15 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
734
778
|
if not sub_selectors:
|
735
779
|
# Empty OR selector, return a function that never matches
|
736
780
|
return lambda element: False
|
737
|
-
|
781
|
+
|
738
782
|
# Create filter functions for each sub-selector
|
739
783
|
sub_filter_funcs = []
|
740
784
|
for sub_selector in sub_selectors:
|
741
785
|
sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
|
742
|
-
|
786
|
+
|
743
787
|
if logger.isEnabledFor(logging.DEBUG):
|
744
788
|
logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
|
745
|
-
|
789
|
+
|
746
790
|
# Return OR combination - element matches if ANY sub-selector matches
|
747
791
|
def or_filter(element):
|
748
792
|
for func in sub_filter_funcs:
|
@@ -754,9 +798,9 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
|
|
754
798
|
# Continue to next sub-filter on error
|
755
799
|
continue
|
756
800
|
return False
|
757
|
-
|
801
|
+
|
758
802
|
return or_filter
|
759
|
-
|
803
|
+
|
760
804
|
# Handle single selectors (existing logic)
|
761
805
|
filter_list = _build_filter_list(selector, **kwargs)
|
762
806
|
|
natural_pdf/widgets/__init__.py
CHANGED