natural-pdf 0.1.40__py3-none-any.whl → 0.2.1.dev0__py3-none-any.whl

This diff shows the content changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +6 -7
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +236 -383
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +172 -83
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +318 -243
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +4 -4
  33. natural_pdf/flows/flow.py +1200 -243
  34. natural_pdf/flows/region.py +707 -261
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +2 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +7 -3
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/RECORD +55 -53
  50. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/top_level.txt +0 -2
  51. optimization/memory_comparison.py +1 -1
  52. optimization/pdf_analyzer.py +2 -2
  53. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/WHEEL +0 -0
  54. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/entry_points.txt +0 -0
  55. {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.1.dev0.dist-info}/licenses/LICENSE +0 -0
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
3
3
  from typing import Any, Dict, List, Optional, Tuple, Union
4
4
 
5
5
 
6
-
7
6
  # --- Base Options ---
8
7
  @dataclass
9
8
  class BaseOCROptions:
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
54
53
  output_format: str = "standard"
55
54
 
56
55
 
57
-
58
56
  # --- PaddleOCR Specific Options ---
59
57
  @dataclass
60
58
  class PaddleOCROptions(BaseOCROptions):
natural_pdf/ocr/utils.py CHANGED
@@ -90,7 +90,8 @@ def direct_ocr_llm(
90
90
  buffered = io.BytesIO()
91
91
  # Use the global PDF render lock when rendering images
92
92
  with pdf_render_lock:
93
- region_img = region.to_image(resolution=resolution, include_highlights=False)
93
+ # Use render() for clean image without highlights
94
+ region_img = region.render(resolution=resolution)
94
95
 
95
96
  # Handle cases where image creation might fail (e.g., zero-dim region)
96
97
  if region_img is None:
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
10
10
 
11
- from natural_pdf.elements.collections import ElementCollection
11
+ from natural_pdf.elements.element_collection import ElementCollection
12
12
 
13
13
  from .qa_result import QAResult
14
14
 
@@ -63,8 +63,22 @@ class DocumentQA:
63
63
 
64
64
  logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
65
65
 
66
- # Initialize the pipeline
67
- self.pipe = pipeline("document-question-answering", model=model_name, device=device)
66
+ # Try MPS, fallback to CPU if OOM
67
+ if device is None and torch.backends.mps.is_available():
68
+ try:
69
+ self.pipe = pipeline(
70
+ "document-question-answering", model=model_name, device="mps"
71
+ )
72
+ self.device = "mps"
73
+ except RuntimeError as e:
74
+ logger.warning(f"MPS OOM: {e}, falling back to CPU")
75
+ self.pipe = pipeline(
76
+ "document-question-answering", model=model_name, device="cpu"
77
+ )
78
+ self.device = "cpu"
79
+ else:
80
+ self.pipe = pipeline("document-question-answering", model=model_name, device=device)
81
+ self.device = device
68
82
 
69
83
  self.model_name = model_name
70
84
  self.device = device
@@ -356,7 +370,8 @@ class DocumentQA:
356
370
  temp_path = temp_file.name
357
371
 
358
372
  # Save a high resolution image (300 DPI)
359
- page_image = page.to_image(resolution=300, include_highlights=False)
373
+ # Use render() for clean image without highlights
374
+ page_image = page.render(resolution=300)
360
375
  page_image.save(temp_path)
361
376
 
362
377
  try:
@@ -470,7 +485,8 @@ class DocumentQA:
470
485
  temp_path = temp_file.name
471
486
 
472
487
  # Get page image at high resolution - this returns a PIL Image directly
473
- page_image = region.page.to_image(resolution=300, include_highlights=False)
488
+ # Use render() for clean image without highlights
489
+ page_image = region.page.render(resolution=300)
474
490
 
475
491
  # Crop to region
476
492
  x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
@@ -49,7 +49,7 @@ class Indexable(Protocol):
49
49
  """
50
50
  Return the primary content of this item.
51
51
  The SearchService implementation will determine how to process this content
52
- (e.g., call .extract_text(), .to_image(), or handle directly).
52
+ (e.g., call .extract_text(), .render(), or handle directly).
53
53
  """
54
54
  ...
55
55
 
@@ -24,7 +24,7 @@ This enables powerful document navigation like:
24
24
  - page.find('text[size>12]:bold:contains("Summary")')
25
25
  - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
26
26
  - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
27
- - page.find('text:regex("[\u2500-\u257F]")') # Box drawing characters
27
+ - page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
28
28
  """
29
29
 
30
30
  import ast
@@ -100,7 +100,7 @@ def safe_parse_color(value_str: str) -> tuple:
100
100
  ValueError: If the color cannot be parsed
101
101
  """
102
102
  value_str = value_str.strip()
103
-
103
+
104
104
  # Strip quotes first if it's a quoted string (same logic as safe_parse_value)
105
105
  if (value_str.startswith('"') and value_str.endswith('"')) or (
106
106
  value_str.startswith("'") and value_str.endswith("'")
@@ -39,7 +39,13 @@ class TableResult(Sequence):
39
39
  """Quick property alias → calls :py:meth:`to_df` with default args."""
40
40
  return self.to_df()
41
41
 
42
- def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
42
+ def to_df(
43
+ self,
44
+ header: Union[str, int, List[int], None] = "first",
45
+ index_col=None,
46
+ skip_repeating_headers=None,
47
+ **kwargs,
48
+ ):
43
49
  """Convert to *pandas* DataFrame.
44
50
 
45
51
  Parameters
@@ -47,6 +53,10 @@ class TableResult(Sequence):
47
53
  header : "first" | int | list[int] | None, default "first"
48
54
  • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
49
55
  index_col : same semantics as pandas, forwarded.
56
+ skip_repeating_headers : bool, optional
57
+ Whether to remove body rows that exactly match the header row(s).
58
+ Defaults to True when header is truthy, False otherwise.
59
+ Useful for PDFs where headers repeat throughout the table body.
50
60
  **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
51
61
  """
52
62
  try:
@@ -60,6 +70,10 @@ class TableResult(Sequence):
60
70
  if not rows:
61
71
  return pd.DataFrame()
62
72
 
73
+ # Determine default for skip_repeating_headers based on header parameter
74
+ if skip_repeating_headers is None:
75
+ skip_repeating_headers = header is not None and header is not False
76
+
63
77
  # Determine header rows and body rows
64
78
  body = rows
65
79
  hdr = None
@@ -78,6 +92,26 @@ class TableResult(Sequence):
78
92
  else:
79
93
  raise ValueError("Invalid value for header parameter")
80
94
 
95
+ # Skip repeating headers in body if requested
96
+ if skip_repeating_headers and hdr is not None and body:
97
+ original_body_len = len(body)
98
+ if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
99
+ # Single header row (most common case)
100
+ body = [row for row in body if row != hdr]
101
+ elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
102
+ # Multi-row header (less common)
103
+ hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
104
+ body = [
105
+ row
106
+ for row in body
107
+ if (tuple(row) if isinstance(row, list) else row) not in hdr_set
108
+ ]
109
+
110
+ skipped_count = original_body_len - len(body)
111
+ if skipped_count > 0:
112
+ # Could add logging here if desired
113
+ pass
114
+
81
115
  df = pd.DataFrame(body, columns=hdr)
82
116
  if index_col is not None and not df.empty:
83
117
  df.set_index(
natural_pdf/text_mixin.py CHANGED
@@ -70,9 +70,13 @@ class TextMixin: # pylint: disable=too-few-public-methods
70
70
  )
71
71
 
72
72
  try:
73
- elements_collection = self.find_all(selector=selector, apply_exclusions=apply_exclusions)
73
+ elements_collection = self.find_all(
74
+ selector=selector, apply_exclusions=apply_exclusions
75
+ )
74
76
  except Exception as exc: # pragma: no cover – defensive
75
- raise RuntimeError(f"Failed to gather elements with selector '{selector}': {exc}") from exc
77
+ raise RuntimeError(
78
+ f"Failed to gather elements with selector '{selector}': {exc}"
79
+ ) from exc
76
80
 
77
81
  # `find_all` returns an ElementCollection; fall back gracefully otherwise.
78
82
  elements_iter = getattr(elements_collection, "elements", elements_collection)
@@ -94,4 +98,4 @@ class TextMixin: # pylint: disable=too-few-public-methods
94
98
  updated,
95
99
  )
96
100
 
97
- return self
101
+ return self
@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
24
24
  """Generate a base64 encoded image of the page."""
25
25
  # Create a clean image of the page without highlights for the base background
26
26
  # Use a fixed scale consistent with the HTML/JS rendering logic
27
- img = page.to_image(scale=2.0, include_highlights=False)
27
+ # Use render() for clean image without highlights
28
+ img = page.render(resolution=144)
28
29
  if img is None:
29
30
  raise ValueError(f"Failed to render image for page {page.number}")
30
31
 
@@ -7,6 +7,7 @@ The main highlighting logic is now centralized in `natural_pdf.core.highlighting
7
7
 
8
8
  # Re-export necessary functions from visualization
9
9
  from .visualization import (
10
+ create_colorbar,
10
11
  create_legend,
11
12
  get_next_highlight_color,
12
13
  merge_images_with_legend,
@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
2
2
 
3
3
 
4
4
  def merge_bboxes(
5
- bboxes: List[Optional[Tuple[float, float, float, float]]]
5
+ bboxes: List[Optional[Tuple[float, float, float, float]]],
6
6
  ) -> Optional[Tuple[float, float, float, float]]:
7
7
  """
8
8
  Merge multiple bounding boxes into a single one that encompasses all of them.
@@ -23,4 +23,4 @@ def merge_bboxes(
23
23
 
24
24
  x0s, tops, x1s, bottoms = zip(*valid_bboxes)
25
25
 
26
- return (min(x0s), min(tops), max(x1s), max(bottoms))
26
+ return (min(x0s), min(tops), max(x1s), max(bottoms))
@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
18
18
 
19
19
  # Import the specific PDF/Page types if possible, otherwise use Any
20
20
  if TYPE_CHECKING:
21
- from natural_pdf.collections.pdf_collection import PDFCollection
22
21
  from natural_pdf.core.page import Page
23
22
  from natural_pdf.core.pdf import PDF
23
+ from natural_pdf.core.pdf_collection import PDFCollection
24
24
  else:
25
25
  PDF = Any
26
26
  Page = Any
@@ -145,9 +145,10 @@ def create_correction_task_package(
145
145
  image_filename = f"{pdf_short_id}_page_{page.index}.png"
146
146
  image_save_path = os.path.join(images_dir, image_filename)
147
147
  try:
148
- img = page.to_image(resolution=resolution, include_highlights=False)
148
+ # Use render() for clean image without highlights
149
+ img = page.render(resolution=resolution)
149
150
  if img is None:
150
- raise ValueError("page.to_image returned None")
151
+ raise ValueError("page.render returned None")
151
152
  img.save(image_save_path, "PNG")
152
153
  except Exception as e:
153
154
  logger.error(
@@ -175,28 +175,27 @@ def filter_chars_spatially(
175
175
 
176
176
 
177
177
  def _apply_content_filter(
178
- char_dicts: List[Dict[str, Any]],
179
- content_filter: Union[str, Callable[[str], bool], List[str]]
178
+ char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
180
179
  ) -> List[Dict[str, Any]]:
181
180
  """
182
181
  Applies content filtering to character dictionaries based on their text content.
183
-
182
+
184
183
  Args:
185
184
  char_dicts: List of character dictionaries to filter.
186
185
  content_filter: Can be:
187
186
  - A regex pattern string (characters matching the pattern are EXCLUDED)
188
187
  - A callable that takes text and returns True to KEEP the character
189
188
  - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
190
-
189
+
191
190
  Returns:
192
191
  Filtered list of character dictionaries.
193
192
  """
194
193
  if not char_dicts or content_filter is None:
195
194
  return char_dicts
196
-
195
+
197
196
  initial_count = len(char_dicts)
198
197
  filtered_chars = []
199
-
198
+
200
199
  # Handle different filter types
201
200
  if isinstance(content_filter, str):
202
201
  # Single regex pattern - exclude matching characters
@@ -207,9 +206,11 @@ def _apply_content_filter(
207
206
  if not pattern.search(text):
208
207
  filtered_chars.append(char_dict)
209
208
  except re.error as e:
210
- logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
209
+ logger.warning(
210
+ f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
211
+ )
211
212
  return char_dicts
212
-
213
+
213
214
  elif isinstance(content_filter, list):
214
215
  # List of regex patterns - exclude characters matching ANY pattern
215
216
  try:
@@ -221,7 +222,7 @@ def _apply_content_filter(
221
222
  except re.error as e:
222
223
  logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
223
224
  return char_dicts
224
-
225
+
225
226
  elif callable(content_filter):
226
227
  # Callable filter - keep characters where function returns True
227
228
  try:
@@ -233,13 +234,15 @@ def _apply_content_filter(
233
234
  logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
234
235
  return char_dicts
235
236
  else:
236
- logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
237
+ logger.warning(
238
+ f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
239
+ )
237
240
  return char_dicts
238
-
241
+
239
242
  filtered_count = initial_count - len(filtered_chars)
240
243
  if filtered_count > 0:
241
244
  logger.debug(f"Content filter removed {filtered_count} characters.")
242
-
245
+
243
246
  return filtered_chars
244
247
 
245
248