natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 
-
 # --- Base Options ---
 @dataclass
 class BaseOCROptions:
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
     output_format: str = "standard"
 
 
-
 # --- PaddleOCR Specific Options ---
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
natural_pdf/ocr/utils.py CHANGED
@@ -90,7 +90,8 @@ def direct_ocr_llm(
     buffered = io.BytesIO()
     # Use the global PDF render lock when rendering images
     with pdf_render_lock:
-        region_img = region.to_image(resolution=resolution, include_highlights=False)
+        # Use render() for clean image without highlights
+        region_img = region.render(resolution=resolution)
 
     # Handle cases where image creation might fail (e.g., zero-dim region)
     if region_img is None:
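These call sites illustrate the 0.2.0 rendering split: render() now returns a clean raster, while highlight-aware output stays on the to_image() path. A minimal, hedged sketch of the new call from user code (the file name is illustrative):

    from natural_pdf import PDF

    pdf = PDF("invoice.pdf")  # illustrative path
    page = pdf.pages[0]

    # Clean page raster with no highlight overlays, rendered at 300 DPI
    img = page.render(resolution=300)
    img.save("page_clean.png")

The same render(resolution=...) call works on regions, as the direct_ocr_llm hunk above shows.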
natural_pdf/qa/document_qa.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image, ImageDraw
 
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.element_collection import ElementCollection
 
 from .qa_result import QAResult
 
@@ -63,8 +63,22 @@ class DocumentQA:
 
         logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
 
-        # Initialize the pipeline
-        self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+        # Try MPS, fallback to CPU if OOM
+        if device is None and torch.backends.mps.is_available():
+            try:
+                self.pipe = pipeline(
+                    "document-question-answering", model=model_name, device="mps"
+                )
+                self.device = "mps"
+            except RuntimeError as e:
+                logger.warning(f"MPS OOM: {e}, falling back to CPU")
+                self.pipe = pipeline(
+                    "document-question-answering", model=model_name, device="cpu"
+                )
+                self.device = "cpu"
+        else:
+            self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+            self.device = device
 
         self.model_name = model_name
         self.device = device
@@ -356,7 +370,8 @@ class DocumentQA:
            temp_path = temp_file.name
 
        # Save a high resolution image (300 DPI)
-        page_image = page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = page.render(resolution=300)
        page_image.save(temp_path)
 
        try:
@@ -470,7 +485,8 @@ class DocumentQA:
            temp_path = temp_file.name
 
        # Get page image at high resolution - this returns a PIL Image directly
-        page_image = region.page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = region.page.render(resolution=300)
 
        # Crop to region
        x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
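For orientation, a hedged sketch of how this QA machinery is typically reached from the high-level API. The page.ask() helper and the question text are assumptions, not taken from this diff; device selection (MPS with CPU fallback) happens inside DocumentQA.__init__ as shown above.

    from natural_pdf import PDF

    pdf = PDF("report.pdf")  # illustrative path
    page = pdf.pages[0]

    # page.ask() is assumed to delegate to DocumentQA under the hood
    result = page.ask("What is the invoice total?")
    print(result)  # QAResult carrying the answer and a confidence score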
natural_pdf/search/search_service_protocol.py CHANGED
@@ -49,7 +49,7 @@ class Indexable(Protocol):
        """
        Return the primary content of this item.
        The SearchService implementation will determine how to process this content
-        (e.g., call .extract_text(), .to_image(), or handle directly).
+        (e.g., call .extract_text(), .render(), or handle directly).
        """
        ...
 
natural_pdf/selectors/parser.py CHANGED
@@ -24,7 +24,7 @@ This enables powerful document navigation like:
 - page.find('text[size>12]:bold:contains("Summary")')
 - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
 - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
-- page.find('text:regex("[\u2500-\u257F]")') # Box drawing characters
+- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
 """
 
 import ast
@@ -101,6 +101,12 @@ def safe_parse_color(value_str: str) -> tuple:
     """
     value_str = value_str.strip()
 
+    # Strip quotes first if it's a quoted string (same logic as safe_parse_value)
+    if (value_str.startswith('"') and value_str.endswith('"')) or (
+        value_str.startswith("'") and value_str.endswith("'")
+    ):
+        value_str = value_str[1:-1]
+
     # Try parsing as a Python literal (for RGB tuples)
     try:
         # If it's already a valid tuple or list, parse it
@@ -504,6 +510,21 @@ def _is_approximate_match(value1, value2) -> bool:
     return value1 == value2
 
 
+def _is_exact_color_match(value1, value2) -> bool:
+    """
+    Check if two color values match exactly (with small tolerance for color variations).
+
+    For colors: Uses Delta E color difference with strict tolerance of 2.0
+    For non-colors: Falls back to exact equality
+    """
+    # First check if both values are colors
+    if _is_color_value(value1) and _is_color_value(value2):
+        return _color_distance(value1, value2) <= 2.0
+
+    # Default to exact match for non-colors
+    return value1 == value2
+
+
 PSEUDO_CLASS_FUNCTIONS = {
     "bold": lambda el: hasattr(el, "bold") and el.bold,
     "italic": lambda el: hasattr(el, "italic") and el.italic,
@@ -603,7 +624,19 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
         # Determine compare_func based on op (reuse existing logic)
         if op == "=":
-            compare_func = lambda el_val, sel_val: el_val == sel_val
+            # For color attributes, use exact color matching with small tolerance
+            if name in [
+                "color",
+                "non_stroking_color",
+                "fill",
+                "stroke",
+                "strokeColor",
+                "fillColor",
+            ]:
+                op_desc = f"= {value!r} (exact color)"
+                compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
+            else:
+                compare_func = lambda el_val, sel_val: el_val == sel_val
         elif op == "!=":
             compare_func = lambda el_val, sel_val: el_val != sel_val
         elif op == "~=":
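Taken together, these parser changes mean quoted attribute values are unquoted before color parsing, and the = operator compares colors with a strict Delta E tolerance (<= 2.0) instead of raw equality. A hedged sketch of how that surfaces in selectors; the attribute names come from the list in the hunk above, but whether a given element type exposes each of them is not confirmed here:

    # Approximate color match (~=), as in the module docstring example
    totals_boxes = page.find_all('rect[color~="red"]')

    # Exact color match (=) now tolerates tiny numeric differences (Delta E <= 2.0)
    pure_red_text = page.find_all('text[color=(1, 0, 0)]')

    # Quoted color values are stripped of their quotes before parsing
    red_fills = page.find_all('rect[fill="red"]')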
natural_pdf/tables/result.py CHANGED
@@ -39,7 +39,13 @@ class TableResult(Sequence):
         """Quick property alias → calls :py:meth:`to_df` with default args."""
         return self.to_df()
 
-    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
+    def to_df(
+        self,
+        header: Union[str, int, List[int], None] = "first",
+        index_col=None,
+        skip_repeating_headers=None,
+        **kwargs,
+    ):
         """Convert to *pandas* DataFrame.
 
         Parameters
@@ -47,6 +53,10 @@ class TableResult(Sequence):
         header : "first" | int | list[int] | None, default "first"
             • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
         index_col : same semantics as pandas, forwarded.
+        skip_repeating_headers : bool, optional
+            Whether to remove body rows that exactly match the header row(s).
+            Defaults to True when header is truthy, False otherwise.
+            Useful for PDFs where headers repeat throughout the table body.
         **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
         """
         try:
@@ -60,6 +70,10 @@ class TableResult(Sequence):
         if not rows:
             return pd.DataFrame()
 
+        # Determine default for skip_repeating_headers based on header parameter
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header is not None and header is not False
+
         # Determine header rows and body rows
         body = rows
         hdr = None
@@ -78,6 +92,26 @@ class TableResult(Sequence):
         else:
             raise ValueError("Invalid value for header parameter")
 
+        # Skip repeating headers in body if requested
+        if skip_repeating_headers and hdr is not None and body:
+            original_body_len = len(body)
+            if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
+                # Single header row (most common case)
+                body = [row for row in body if row != hdr]
+            elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
+                # Multi-row header (less common)
+                hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
+                body = [
+                    row
+                    for row in body
+                    if (tuple(row) if isinstance(row, list) else row) not in hdr_set
+                ]
+
+            skipped_count = original_body_len - len(body)
+            if skipped_count > 0:
+                # Could add logging here if desired
+                pass
+
         df = pd.DataFrame(body, columns=hdr)
         if index_col is not None and not df.empty:
             df.set_index(
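A brief, hedged usage sketch of the new skip_repeating_headers option. The PDF path is illustrative, and extract_table() is assumed to return the TableResult shown above:

    from natural_pdf import PDF

    pdf = PDF("long_report.pdf")  # illustrative path
    table = pdf.pages[0].extract_table()  # assumed to return a TableResult

    # Default: with header="first", body rows that exactly repeat the header are dropped
    df = table.to_df()

    # Opt out and keep every body row, even exact copies of the header
    df_raw = table.to_df(skip_repeating_headers=False)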
natural_pdf/text_mixin.py ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+from typing import Any, Callable, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class TextMixin:  # pylint: disable=too-few-public-methods
+    """Mixin that adds general text-replacement capabilities.
+
+    Two public entry points are exposed to any class that inherits this mix-in:
+
+    1. ``update_text`` (preferred) – iterate over text elements selected via the
+       ``selector`` argument (default: ``"text"``) and apply a *correction* callback
+       which optionally returns replacement text. If the callback returns a
+       non-``None`` string that differs from the current value, the element's
+       ``text`` attribute is updated in-place.
+
+    2. ``correct_ocr`` – legacy name kept for backward compatibility. It simply
+       forwards to :py:meth:`update_text` while forcing
+       ``selector="text[source=ocr]"`` so that the historic behaviour (acting only
+       on OCR-generated elements) is preserved.
+    """
+
+    # ---------------------------------------------------------------------
+    # Back-compat shim
+    # ---------------------------------------------------------------------
+    def correct_ocr(self, *args, selector: str = "text[source=ocr]", **kwargs):  # type: ignore[override]
+        """Backward-compatibility wrapper that forwards to *update_text*.
+
+        Parameters
+        ----------
+        *args, **kwargs
+            Forwarded verbatim to :py:meth:`update_text` (after injecting the
+            ``selector`` default shown above).
+        """
+
+        # Delegate – subclasses may have overridden *update_text* with a richer
+        # signature so we pass everything through untouched.
+        return self.update_text(*args, selector=selector, **kwargs)  # type: ignore[arg-type]
+
+    # ------------------------------------------------------------------
+    # Generic fallback implementation
+    # ------------------------------------------------------------------
+    def update_text(  # type: ignore[override]
+        self,
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+        **_,
+    ):
+        """Generic implementation that works for any object exposing *find_all*.
+
+        Classes that require more sophisticated behaviour (parallelism, page
+        delegation, etc.) are expected to *override* this method while keeping
+        the same public contract.
+        """
+
+        if not callable(transform):
+            raise TypeError("transform must be callable")
+
+        # We rely on the presence of *find_all* to obtain elements. If the
+        # subclass does not implement it then it *must* override update_text.
+        if not hasattr(self, "find_all"):
+            raise NotImplementedError(
+                f"{self.__class__.__name__} must implement `update_text` explicitly "
+                "(no `find_all` method found)."
+            )
+
+        try:
+            elements_collection = self.find_all(
+                selector=selector, apply_exclusions=apply_exclusions
+            )
+        except Exception as exc:  # pragma: no cover – defensive
+            raise RuntimeError(
+                f"Failed to gather elements with selector '{selector}': {exc}"
+            ) from exc
+
+        # `find_all` returns an ElementCollection; fall back gracefully otherwise.
+        elements_iter = getattr(elements_collection, "elements", elements_collection)
+        updated = 0
+
+        for element in elements_iter:
+            if not hasattr(element, "text"):
+                continue
+
+            new_text = transform(element)
+            if new_text is not None and isinstance(new_text, str) and new_text != element.text:
+                element.text = new_text
+                updated += 1
+
+        logger.info(
+            "%s.update_text – processed %d element(s); updated %d.",
+            self.__class__.__name__,
+            len(elements_iter),
+            updated,
+        )
+
+        return self
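A short, hedged sketch of calling update_text on an object that inherits TextMixin. Page is assumed to be one such class, and the normalization itself is purely illustrative; returning None from the callback leaves an element untouched, per the code above:

    from natural_pdf import PDF

    pdf = PDF("scanned.pdf")  # illustrative path
    page = pdf.pages[0]

    # Uppercase every OCR-derived text element
    page.update_text(
        lambda el: el.text.upper() if el.text else None,
        selector="text[source=ocr]",
    )

    # Legacy spelling; forwards to update_text with selector="text[source=ocr]"
    page.correct_ocr(lambda el: el.text.strip() if el.text else None)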
natural_pdf/utils/debug.py CHANGED
@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
     """Generate a base64 encoded image of the page."""
     # Create a clean image of the page without highlights for the base background
     # Use a fixed scale consistent with the HTML/JS rendering logic
-    img = page.to_image(scale=2.0, include_highlights=False)
+    # Use render() for clean image without highlights
+    img = page.render(resolution=144)
     if img is None:
         raise ValueError(f"Failed to render image for page {page.number}")
 
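The switch from scale=2.0 to resolution=144 is a unit change rather than a behavior change: PDF user space is 72 points per inch, so a 2x scale corresponds to 144 DPI. A tiny sketch of that conversion; the helper name is made up for illustration:

    PDF_BASE_DPI = 72  # PDF user space is defined at 72 points per inch

    def scale_to_resolution(scale: float) -> int:
        """Map a legacy to_image(scale=...) factor onto a render(resolution=...) DPI value."""
        return round(PDF_BASE_DPI * scale)

    assert scale_to_resolution(2.0) == 144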
natural_pdf/utils/highlighting.py CHANGED
@@ -7,6 +7,7 @@ The main highlighting logic is now centralized in `natural_pdf.core.highlighting
 
 # Re-export necessary functions from visualization
 from .visualization import (
+    create_colorbar,
     create_legend,
     get_next_highlight_color,
     merge_images_with_legend,
natural_pdf/utils/layout.py CHANGED
@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
 
 
 def merge_bboxes(
-    bboxes: List[Optional[Tuple[float, float, float, float]]]
+    bboxes: List[Optional[Tuple[float, float, float, float]]],
 ) -> Optional[Tuple[float, float, float, float]]:
     """
     Merge multiple bounding boxes into a single one that encompasses all of them.
@@ -23,4 +23,4 @@ def merge_bboxes(
 
     x0s, tops, x1s, bottoms = zip(*valid_bboxes)
 
-    return (min(x0s), min(tops), max(x1s), max(bottoms))
+    return (min(x0s), min(tops), max(x1s), max(bottoms))
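A quick, hedged illustration of merge_bboxes based on the signature above. The sample coordinates are made up, and None entries appear to be tolerated and skipped (the body filters down to valid_bboxes before merging):

    from natural_pdf.utils.layout import merge_bboxes

    boxes = [
        (10.0, 10.0, 50.0, 50.0),
        (40.0, 5.0, 80.0, 60.0),
        None,  # missing boxes are ignored
    ]

    # Expected result: the enclosing box (10.0, 5.0, 80.0, 60.0)
    print(merge_bboxes(boxes))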
natural_pdf/utils/packaging.py CHANGED
@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
 
 # Import the specific PDF/Page types if possible, otherwise use Any
 if TYPE_CHECKING:
-    from natural_pdf.collections.pdf_collection import PDFCollection
     from natural_pdf.core.page import Page
     from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf_collection import PDFCollection
 else:
     PDF = Any
     Page = Any
@@ -145,9 +145,10 @@ def create_correction_task_package(
         image_filename = f"{pdf_short_id}_page_{page.index}.png"
         image_save_path = os.path.join(images_dir, image_filename)
         try:
-            img = page.to_image(resolution=resolution, include_highlights=False)
+            # Use render() for clean image without highlights
+            img = page.render(resolution=resolution)
            if img is None:
-                raise ValueError("page.to_image returned None")
+                raise ValueError("page.render returned None")
            img.save(image_save_path, "PNG")
         except Exception as e:
             logger.error(
natural_pdf/utils/text_extraction.py CHANGED
@@ -175,28 +175,27 @@ def filter_chars_spatially(
 
 
 def _apply_content_filter(
-    char_dicts: List[Dict[str, Any]],
-    content_filter: Union[str, Callable[[str], bool], List[str]]
+    char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
 ) -> List[Dict[str, Any]]:
     """
     Applies content filtering to character dictionaries based on their text content.
-
+
     Args:
         char_dicts: List of character dictionaries to filter.
         content_filter: Can be:
            - A regex pattern string (characters matching the pattern are EXCLUDED)
            - A callable that takes text and returns True to KEEP the character
            - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
-
+
     Returns:
        Filtered list of character dictionaries.
     """
     if not char_dicts or content_filter is None:
         return char_dicts
-
+
     initial_count = len(char_dicts)
     filtered_chars = []
-
+
     # Handle different filter types
     if isinstance(content_filter, str):
         # Single regex pattern - exclude matching characters
@@ -207,9 +206,11 @@ def _apply_content_filter(
             if not pattern.search(text):
                 filtered_chars.append(char_dict)
         except re.error as e:
-            logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
+            logger.warning(
+                f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
+            )
             return char_dicts
-
+
     elif isinstance(content_filter, list):
         # List of regex patterns - exclude characters matching ANY pattern
         try:
@@ -221,7 +222,7 @@ def _apply_content_filter(
         except re.error as e:
             logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
             return char_dicts
-
+
     elif callable(content_filter):
         # Callable filter - keep characters where function returns True
         try:
@@ -233,13 +234,15 @@ def _apply_content_filter(
             logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
             return char_dicts
     else:
-        logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
+        logger.warning(
+            f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
+        )
         return char_dicts
-
+
     filtered_count = initial_count - len(filtered_chars)
     if filtered_count > 0:
         logger.debug(f"Content filter removed {filtered_count} characters.")
-
+
     return filtered_chars
 
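A hedged sketch of how a content filter like the ones handled above might be supplied by a caller. Whether extract_text() exposes a content_filter keyword is an assumption here; the semantics follow the docstring: regex patterns exclude matching characters, callables keep characters for which they return True:

    from natural_pdf import PDF

    pdf = PDF("form.pdf")  # illustrative path
    page = pdf.pages[0]

    # Assumed keyword: exclude characters matching a regex (strip digits)
    no_digits = page.extract_text(content_filter=r"[0-9]")

    # Callable form: keep only printable characters outside the box-drawing range
    clean = page.extract_text(
        content_filter=lambda ch: ch.isprintable() and not ("\u2500" <= ch <= "\u257f")
    )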