natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +226 -70
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/elements/base.py +9 -9
  14. natural_pdf/elements/collections.py +105 -50
  15. natural_pdf/elements/region.py +320 -113
  16. natural_pdf/exporters/paddleocr.py +38 -13
  17. natural_pdf/flows/__init__.py +3 -3
  18. natural_pdf/flows/collections.py +303 -132
  19. natural_pdf/flows/element.py +277 -132
  20. natural_pdf/flows/flow.py +33 -16
  21. natural_pdf/flows/region.py +142 -79
  22. natural_pdf/ocr/engine_doctr.py +37 -4
  23. natural_pdf/ocr/engine_easyocr.py +23 -3
  24. natural_pdf/ocr/engine_paddle.py +281 -30
  25. natural_pdf/ocr/engine_surya.py +8 -3
  26. natural_pdf/ocr/ocr_manager.py +75 -76
  27. natural_pdf/ocr/ocr_options.py +52 -87
  28. natural_pdf/search/__init__.py +25 -12
  29. natural_pdf/search/lancedb_search_service.py +91 -54
  30. natural_pdf/search/numpy_search_service.py +86 -65
  31. natural_pdf/search/searchable_mixin.py +2 -2
  32. natural_pdf/selectors/parser.py +125 -81
  33. natural_pdf/widgets/__init__.py +1 -1
  34. natural_pdf/widgets/viewer.py +205 -449
  35. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
  36. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
  37. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
  38. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
  39. {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,9 @@ import logging
7
7
  import re
8
8
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
9
9
 
10
+ from colormath2.color_conversions import convert_color
11
+ from colormath2.color_diff import delta_e_cie2000
12
+ from colormath2.color_objects import LabColor, sRGBColor
10
13
  from colour import Color
11
14
 
12
15
  logger = logging.getLogger(__name__)
@@ -39,13 +42,16 @@ def safe_parse_value(value_str: str) -> Any:
39
42
 
40
43
  def safe_parse_color(value_str: str) -> tuple:
41
44
  """
42
- Parse a color value which could be an RGB tuple, color name, or hex code.
45
+ Parse a color value which could be an RGB tuple, color name, hex code, or CSS-style rgb(...)/rgba(...).
43
46
 
44
47
  Args:
45
- value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)")
48
+ value_str: String representation of a color (e.g., "red", "#ff0000", "(1,0,0)", "rgb(0,0,255)")
46
49
 
47
50
  Returns:
48
51
  RGB tuple (r, g, b) with values from 0 to 1
52
+
53
+ Raises:
54
+ ValueError: If the color cannot be parsed
49
55
  """
50
56
  value_str = value_str.strip()
51
57
 
@@ -57,83 +63,100 @@ def safe_parse_color(value_str: str) -> tuple:
57
63
  # Return just the RGB components as a tuple
58
64
  return tuple(color_tuple[:3])
59
65
  except (SyntaxError, ValueError):
60
- # Not a valid tuple/list, try as a color name or hex
61
- try:
62
- # Use colour library to parse color names, hex values, etc.
63
- color = Color(value_str)
64
- # Convert to RGB tuple with values between 0 and 1
65
- return (color.red, color.green, color.blue)
66
- except (ValueError, AttributeError):
67
- # If color parsing fails, return a default (black)
68
- return (0, 0, 0)
66
+ pass # Not a valid tuple/list, try other formats
67
+
68
+ # Try parsing CSS-style rgb(...) or rgba(...)
69
+ css_rgb_match = re.match(
70
+ r"rgb\s*\(\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*\)", value_str, re.IGNORECASE
71
+ )
72
+ css_rgba_match = re.match(
73
+ r"rgba\s*\(\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9\.]+)\s*\)",
74
+ value_str,
75
+ re.IGNORECASE,
76
+ )
77
+ if css_rgb_match:
78
+ r, g, b = map(int, css_rgb_match.groups())
79
+ return (r / 255.0, g / 255.0, b / 255.0)
80
+ elif css_rgba_match:
81
+ r, g, b, a = css_rgba_match.groups()
82
+ r, g, b = int(r), int(g), int(b)
83
+ # alpha is ignored for now, but could be used if needed
84
+ return (r / 255.0, g / 255.0, b / 255.0)
85
+
86
+ # Try as a color name or hex
87
+ try:
88
+ color = Color(value_str)
89
+ return (color.red, color.green, color.blue)
90
+ except (ValueError, AttributeError) as e:
91
+ raise ValueError(f"Could not parse color value: {value_str}") from e
69
92
 
70
- # If we got here with a non-tuple, return default
71
- return (0, 0, 0)
93
+ # If we got here with a non-tuple, raise error
94
+ raise ValueError(f"Invalid color value: {value_str}")
72
95
 
73
96
 
74
97
  def _split_top_level_or(selector: str) -> List[str]:
75
98
  """
76
99
  Split a selector string on top-level OR operators (| or ,) only.
77
-
100
+
78
101
  Respects parsing contexts and does not split when | or , appear inside:
79
102
  - Quoted strings (both single and double quotes)
80
103
  - Parentheses (for pseudo-class arguments like :not(...))
81
104
  - Square brackets (for attribute selectors like [attr="value"])
82
-
105
+
83
106
  Args:
84
107
  selector: The selector string to split
85
-
108
+
86
109
  Returns:
87
110
  List of selector parts. If no top-level OR operators found, returns [selector].
88
-
111
+
89
112
  Examples:
90
113
  >>> _split_top_level_or('text:contains("a|b")|text:bold')
91
114
  ['text:contains("a|b")', 'text:bold']
92
-
115
+
93
116
  >>> _split_top_level_or('text:contains("hello,world")')
94
117
  ['text:contains("hello,world")']
95
118
  """
96
119
  if not selector or not isinstance(selector, str):
97
120
  return [selector] if selector else []
98
-
121
+
99
122
  parts = []
100
123
  current_part = ""
101
124
  i = 0
102
-
125
+
103
126
  # Parsing state
104
127
  in_double_quotes = False
105
128
  in_single_quotes = False
106
129
  paren_depth = 0
107
130
  bracket_depth = 0
108
-
131
+
109
132
  while i < len(selector):
110
133
  char = selector[i]
111
-
134
+
112
135
  # Handle escape sequences in quotes
113
- if i > 0 and selector[i-1] == '\\':
136
+ if i > 0 and selector[i - 1] == "\\":
114
137
  current_part += char
115
138
  i += 1
116
139
  continue
117
-
140
+
118
141
  # Handle quote state changes
119
142
  if char == '"' and not in_single_quotes:
120
143
  in_double_quotes = not in_double_quotes
121
144
  elif char == "'" and not in_double_quotes:
122
145
  in_single_quotes = not in_single_quotes
123
-
146
+
124
147
  # Handle parentheses and brackets only when not in quotes
125
148
  elif not in_double_quotes and not in_single_quotes:
126
- if char == '(':
149
+ if char == "(":
127
150
  paren_depth += 1
128
- elif char == ')':
151
+ elif char == ")":
129
152
  paren_depth -= 1
130
- elif char == '[':
153
+ elif char == "[":
131
154
  bracket_depth += 1
132
- elif char == ']':
155
+ elif char == "]":
133
156
  bracket_depth -= 1
134
-
157
+
135
158
  # Check for top-level OR operators
136
- elif (char == '|' or char == ',') and paren_depth == 0 and bracket_depth == 0:
159
+ elif (char == "|" or char == ",") and paren_depth == 0 and bracket_depth == 0:
137
160
  # Found a top-level OR operator
138
161
  part = current_part.strip()
139
162
  if part: # Only add non-empty parts
@@ -141,16 +164,16 @@ def _split_top_level_or(selector: str) -> List[str]:
141
164
  current_part = ""
142
165
  i += 1
143
166
  continue
144
-
167
+
145
168
  # Add character to current part
146
169
  current_part += char
147
170
  i += 1
148
-
171
+
149
172
  # Add the final part
150
173
  final_part = current_part.strip()
151
174
  if final_part:
152
175
  parts.append(final_part)
153
-
176
+
154
177
  # If we only found one part, return it as a single-element list
155
178
  # If we found multiple parts, those are the OR-separated parts
156
179
  return parts if parts else [selector]
@@ -176,10 +199,10 @@ def parse_selector(selector: str) -> Dict[str, Any]:
176
199
  Examples:
177
200
  >>> parse_selector('text:contains("hello")') # Single selector
178
201
  {'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
179
-
202
+
180
203
  >>> parse_selector('text:contains("A")|text:bold') # OR with pipe
181
204
  {'type': 'or', 'selectors': [...]}
182
-
205
+
183
206
  >>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
184
207
  {'type': 'or', 'selectors': [...]}
185
208
 
@@ -205,7 +228,7 @@ def parse_selector(selector: str) -> Dict[str, Any]:
205
228
  # Check if selector contains OR operators at the top level only
206
229
  # (not inside quotes, parentheses, or brackets)
207
230
  or_parts = _split_top_level_or(selector)
208
-
231
+
209
232
  # If we found OR parts, parse each one recursively and return compound selector
210
233
  if len(or_parts) > 1:
211
234
  parsed_selectors = []
@@ -215,22 +238,21 @@ def parse_selector(selector: str) -> Dict[str, Any]:
215
238
  except (ValueError, TypeError) as e:
216
239
  logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
217
240
  continue
218
-
241
+
219
242
  if len(parsed_selectors) > 1:
220
- return {
221
- "type": "or",
222
- "selectors": parsed_selectors
223
- }
243
+ return {"type": "or", "selectors": parsed_selectors}
224
244
  elif len(parsed_selectors) == 1:
225
245
  # Only one valid part, return it directly
226
246
  return parsed_selectors[0]
227
247
  else:
228
248
  # No valid parts, return default
229
- logger.warning(f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector")
249
+ logger.warning(
250
+ f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector"
251
+ )
230
252
  return result
231
253
 
232
254
  # --- Continue with single selector parsing (existing logic) ---
233
-
255
+
234
256
  # --- Handle wildcard selector explicitly ---
235
257
  if selector == "*":
236
258
  # Wildcard matches any type, already the default.
@@ -296,6 +318,12 @@ def parse_selector(selector: str) -> Dict[str, Any]:
296
318
  parsed_value = safe_parse_color(value_str)
297
319
  else:
298
320
  parsed_value = safe_parse_value(value_str) # Handles quotes
321
+ # If using ~= with a numeric value, warn once during parsing
322
+ if op == "~=" and isinstance(parsed_value, (int, float)):
323
+ logger.warning(
324
+ f"Using ~= with numeric values. This will match if the absolute difference is <= 2.0. "
325
+ f"Consider using explicit ranges (e.g., [width>1][width<4]) for more control."
326
+ )
299
327
  result["attributes"].append({"name": name, "op": op, "value": parsed_value})
300
328
 
301
329
  selector = selector[attr_match.end() :].strip()
@@ -364,46 +392,62 @@ def parse_selector(selector: str) -> Dict[str, Any]:
364
392
  return result
365
393
 
366
394
 
367
- def _is_approximate_match(value1, value2, tolerance: float = 0.1) -> bool:
395
+ def _is_color_value(value) -> bool:
368
396
  """
369
- Check if two values approximately match.
397
+ Check if a value represents a color by attempting to parse it with Color.
398
+ """
399
+ try:
400
+ # If it's already a tuple/list, convert to tuple
401
+ if isinstance(value, (list, tuple)) and len(value) >= 3:
402
+ return True
403
+ # Otherwise try parsing as a color name/hex
404
+ Color(value)
405
+ return True
406
+ except:
407
+ return False
370
408
 
371
- This is mainly used for color comparisons with some tolerance.
372
409
 
373
- Args:
374
- value1: First value
375
- value2: Second value
376
- tolerance: Maximum difference allowed
410
+ def _color_distance(color1, color2) -> float:
411
+ """
412
+ Calculate Delta E color difference between two colors.
413
+ Colors can be strings (names/hex) or RGB tuples.
377
414
 
378
415
  Returns:
379
- True if the values approximately match
416
+ Delta E value, or float('inf') if colors can't be compared
380
417
  """
381
- # Handle string colors by converting them to RGB tuples
382
- if isinstance(value1, str):
383
- try:
384
- value1 = tuple(Color(value1).rgb)
385
- except:
386
- pass
387
-
388
- if isinstance(value2, str):
389
- try:
390
- value2 = tuple(Color(value2).rgb)
391
- except:
392
- pass
393
-
394
- # If both are tuples/lists with the same length (e.g., colors)
395
- if (
396
- isinstance(value1, (list, tuple))
397
- and isinstance(value2, (list, tuple))
398
- and len(value1) == len(value2)
399
- ):
418
+ try:
419
+ # Convert to RGB tuples
420
+ if isinstance(color1, (list, tuple)) and len(color1) >= 3:
421
+ rgb1 = sRGBColor(*color1[:3])
422
+ else:
423
+ rgb1 = sRGBColor(*Color(color1).rgb)
424
+
425
+ if isinstance(color2, (list, tuple)) and len(color2) >= 3:
426
+ rgb2 = sRGBColor(*color2[:3])
427
+ else:
428
+ rgb2 = sRGBColor(*Color(color2).rgb)
400
429
 
401
- # Check if all components are within tolerance
402
- return all(abs(a - b) <= tolerance for a, b in zip(value1, value2))
430
+ lab1 = convert_color(rgb1, LabColor)
431
+ lab2 = convert_color(rgb2, LabColor)
432
+ return delta_e_cie2000(lab1, lab2)
433
+ except:
434
+ return float("inf")
403
435
 
404
- # If both are numbers
436
+
437
+ def _is_approximate_match(value1, value2) -> bool:
438
+ """
439
+ Check if two values approximately match.
440
+
441
+ For colors: Uses Delta E color difference with tolerance of 20.0
442
+ For numbers: Uses absolute difference with tolerance of 2.0
443
+ """
444
+ # First check if both values are colors
445
+ if _is_color_value(value1) and _is_color_value(value2):
446
+ return _color_distance(value1, value2) <= 20.0
447
+
448
+ # Then check if both are numbers
405
449
  if isinstance(value1, (int, float)) and isinstance(value2, (int, float)):
406
- return abs(value1 - value2) <= tolerance
450
+ return abs(value1 - value2) <= 2.0
407
451
 
408
452
  # Default to exact match for other types
409
453
  return value1 == value2
@@ -511,7 +555,7 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
511
555
  compare_func = lambda el_val, sel_val: el_val == sel_val
512
556
  elif op == "!=":
513
557
  compare_func = lambda el_val, sel_val: el_val != sel_val
514
- elif op == "~":
558
+ elif op == "~=":
515
559
  op_desc = f"~= {value!r} (approx)"
516
560
  compare_func = lambda el_val, sel_val: _is_approximate_match(el_val, sel_val)
517
561
  elif op == "^=":
@@ -734,15 +778,15 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
734
778
  if not sub_selectors:
735
779
  # Empty OR selector, return a function that never matches
736
780
  return lambda element: False
737
-
781
+
738
782
  # Create filter functions for each sub-selector
739
783
  sub_filter_funcs = []
740
784
  for sub_selector in sub_selectors:
741
785
  sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
742
-
786
+
743
787
  if logger.isEnabledFor(logging.DEBUG):
744
788
  logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
745
-
789
+
746
790
  # Return OR combination - element matches if ANY sub-selector matches
747
791
  def or_filter(element):
748
792
  for func in sub_filter_funcs:
@@ -754,9 +798,9 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
754
798
  # Continue to next sub-filter on error
755
799
  continue
756
800
  return False
757
-
801
+
758
802
  return or_filter
759
-
803
+
760
804
  # Handle single selectors (existing logic)
761
805
  filter_list = _build_filter_list(selector, **kwargs)
762
806
 
@@ -1,3 +1,3 @@
1
1
  # Also provide the original implementation for reference
2
+ from .viewer import InteractiveViewerWidget
2
3
  from .viewer import InteractiveViewerWidget as _OriginalInteractiveViewerWidget
3
- from .viewer import SimpleInteractiveViewerWidget as InteractiveViewerWidget