natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -7
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +236 -383
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +172 -83
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +318 -243
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +4 -4
- natural_pdf/flows/flow.py +1200 -243
- natural_pdf/flows/region.py +707 -261
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +2 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +7 -3
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
|
|
3
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
4
4
|
|
5
5
|
|
6
|
-
|
7
6
|
# --- Base Options ---
|
8
7
|
@dataclass
|
9
8
|
class BaseOCROptions:
|
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
|
|
54
53
|
output_format: str = "standard"
|
55
54
|
|
56
55
|
|
57
|
-
|
58
56
|
# --- PaddleOCR Specific Options ---
|
59
57
|
@dataclass
|
60
58
|
class PaddleOCROptions(BaseOCROptions):
|
natural_pdf/ocr/utils.py
CHANGED
@@ -90,7 +90,8 @@ def direct_ocr_llm(
|
|
90
90
|
buffered = io.BytesIO()
|
91
91
|
# Use the global PDF render lock when rendering images
|
92
92
|
with pdf_render_lock:
|
93
|
-
|
93
|
+
# Use render() for clean image without highlights
|
94
|
+
region_img = region.render(resolution=resolution)
|
94
95
|
|
95
96
|
# Handle cases where image creation might fail (e.g., zero-dim region)
|
96
97
|
if region_img is None:
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
|
11
|
-
from natural_pdf.elements.
|
11
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
12
12
|
|
13
13
|
from .qa_result import QAResult
|
14
14
|
|
@@ -63,8 +63,22 @@ class DocumentQA:
|
|
63
63
|
|
64
64
|
logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
|
65
65
|
|
66
|
-
#
|
67
|
-
|
66
|
+
# Try MPS, fallback to CPU if OOM
|
67
|
+
if device is None and torch.backends.mps.is_available():
|
68
|
+
try:
|
69
|
+
self.pipe = pipeline(
|
70
|
+
"document-question-answering", model=model_name, device="mps"
|
71
|
+
)
|
72
|
+
self.device = "mps"
|
73
|
+
except RuntimeError as e:
|
74
|
+
logger.warning(f"MPS OOM: {e}, falling back to CPU")
|
75
|
+
self.pipe = pipeline(
|
76
|
+
"document-question-answering", model=model_name, device="cpu"
|
77
|
+
)
|
78
|
+
self.device = "cpu"
|
79
|
+
else:
|
80
|
+
self.pipe = pipeline("document-question-answering", model=model_name, device=device)
|
81
|
+
self.device = device
|
68
82
|
|
69
83
|
self.model_name = model_name
|
70
84
|
self.device = device
|
@@ -356,7 +370,8 @@ class DocumentQA:
|
|
356
370
|
temp_path = temp_file.name
|
357
371
|
|
358
372
|
# Save a high resolution image (300 DPI)
|
359
|
-
|
373
|
+
# Use render() for clean image without highlights
|
374
|
+
page_image = page.render(resolution=300)
|
360
375
|
page_image.save(temp_path)
|
361
376
|
|
362
377
|
try:
|
@@ -470,7 +485,8 @@ class DocumentQA:
|
|
470
485
|
temp_path = temp_file.name
|
471
486
|
|
472
487
|
# Get page image at high resolution - this returns a PIL Image directly
|
473
|
-
|
488
|
+
# Use render() for clean image without highlights
|
489
|
+
page_image = region.page.render(resolution=300)
|
474
490
|
|
475
491
|
# Crop to region
|
476
492
|
x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
|
natural_pdf/search/search_service_protocol.py
CHANGED
@@ -49,7 +49,7 @@ class Indexable(Protocol):
|
|
49
49
|
"""
|
50
50
|
Return the primary content of this item.
|
51
51
|
The SearchService implementation will determine how to process this content
|
52
|
-
(e.g., call .extract_text(), .
|
52
|
+
(e.g., call .extract_text(), .render(), or handle directly).
|
53
53
|
"""
|
54
54
|
...
|
55
55
|
|
natural_pdf/selectors/parser.py
CHANGED
@@ -24,7 +24,7 @@ This enables powerful document navigation like:
|
|
24
24
|
- page.find('text[size>12]:bold:contains("Summary")')
|
25
25
|
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
26
|
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
27
|
-
- page.find('text:regex("[\u2500-\
|
27
|
+
- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
|
28
28
|
"""
|
29
29
|
|
30
30
|
import ast
|
@@ -100,7 +100,7 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
100
100
|
ValueError: If the color cannot be parsed
|
101
101
|
"""
|
102
102
|
value_str = value_str.strip()
|
103
|
-
|
103
|
+
|
104
104
|
# Strip quotes first if it's a quoted string (same logic as safe_parse_value)
|
105
105
|
if (value_str.startswith('"') and value_str.endswith('"')) or (
|
106
106
|
value_str.startswith("'") and value_str.endswith("'")
|
natural_pdf/tables/result.py
CHANGED
@@ -39,7 +39,13 @@ class TableResult(Sequence):
|
|
39
39
|
"""Quick property alias → calls :py:meth:`to_df` with default args."""
|
40
40
|
return self.to_df()
|
41
41
|
|
42
|
-
def to_df(
|
42
|
+
def to_df(
|
43
|
+
self,
|
44
|
+
header: Union[str, int, List[int], None] = "first",
|
45
|
+
index_col=None,
|
46
|
+
skip_repeating_headers=None,
|
47
|
+
**kwargs,
|
48
|
+
):
|
43
49
|
"""Convert to *pandas* DataFrame.
|
44
50
|
|
45
51
|
Parameters
|
@@ -47,6 +53,10 @@ class TableResult(Sequence):
|
|
47
53
|
header : "first" | int | list[int] | None, default "first"
|
48
54
|
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
|
49
55
|
index_col : same semantics as pandas, forwarded.
|
56
|
+
skip_repeating_headers : bool, optional
|
57
|
+
Whether to remove body rows that exactly match the header row(s).
|
58
|
+
Defaults to True when header is truthy, False otherwise.
|
59
|
+
Useful for PDFs where headers repeat throughout the table body.
|
50
60
|
**kwargs : forwarded to :pyclass:`pandas.DataFrame`.
|
51
61
|
"""
|
52
62
|
try:
|
@@ -60,6 +70,10 @@ class TableResult(Sequence):
|
|
60
70
|
if not rows:
|
61
71
|
return pd.DataFrame()
|
62
72
|
|
73
|
+
# Determine default for skip_repeating_headers based on header parameter
|
74
|
+
if skip_repeating_headers is None:
|
75
|
+
skip_repeating_headers = header is not None and header is not False
|
76
|
+
|
63
77
|
# Determine header rows and body rows
|
64
78
|
body = rows
|
65
79
|
hdr = None
|
@@ -78,6 +92,26 @@ class TableResult(Sequence):
|
|
78
92
|
else:
|
79
93
|
raise ValueError("Invalid value for header parameter")
|
80
94
|
|
95
|
+
# Skip repeating headers in body if requested
|
96
|
+
if skip_repeating_headers and hdr is not None and body:
|
97
|
+
original_body_len = len(body)
|
98
|
+
if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
|
99
|
+
# Single header row (most common case)
|
100
|
+
body = [row for row in body if row != hdr]
|
101
|
+
elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
|
102
|
+
# Multi-row header (less common)
|
103
|
+
hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
|
104
|
+
body = [
|
105
|
+
row
|
106
|
+
for row in body
|
107
|
+
if (tuple(row) if isinstance(row, list) else row) not in hdr_set
|
108
|
+
]
|
109
|
+
|
110
|
+
skipped_count = original_body_len - len(body)
|
111
|
+
if skipped_count > 0:
|
112
|
+
# Could add logging here if desired
|
113
|
+
pass
|
114
|
+
|
81
115
|
df = pd.DataFrame(body, columns=hdr)
|
82
116
|
if index_col is not None and not df.empty:
|
83
117
|
df.set_index(
|
natural_pdf/text_mixin.py
CHANGED
@@ -70,9 +70,13 @@ class TextMixin: # pylint: disable=too-few-public-methods
|
|
70
70
|
)
|
71
71
|
|
72
72
|
try:
|
73
|
-
elements_collection = self.find_all(
|
73
|
+
elements_collection = self.find_all(
|
74
|
+
selector=selector, apply_exclusions=apply_exclusions
|
75
|
+
)
|
74
76
|
except Exception as exc: # pragma: no cover – defensive
|
75
|
-
raise RuntimeError(
|
77
|
+
raise RuntimeError(
|
78
|
+
f"Failed to gather elements with selector '{selector}': {exc}"
|
79
|
+
) from exc
|
76
80
|
|
77
81
|
# `find_all` returns an ElementCollection; fall back gracefully otherwise.
|
78
82
|
elements_iter = getattr(elements_collection, "elements", elements_collection)
|
@@ -94,4 +98,4 @@ class TextMixin: # pylint: disable=too-few-public-methods
|
|
94
98
|
updated,
|
95
99
|
)
|
96
100
|
|
97
|
-
return self
|
101
|
+
return self
|
natural_pdf/utils/debug.py
CHANGED
@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
|
|
24
24
|
"""Generate a base64 encoded image of the page."""
|
25
25
|
# Create a clean image of the page without highlights for the base background
|
26
26
|
# Use a fixed scale consistent with the HTML/JS rendering logic
|
27
|
-
|
27
|
+
# Use render() for clean image without highlights
|
28
|
+
img = page.render(resolution=144)
|
28
29
|
if img is None:
|
29
30
|
raise ValueError(f"Failed to render image for page {page.number}")
|
30
31
|
|
natural_pdf/utils/layout.py
CHANGED
@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
|
|
2
2
|
|
3
3
|
|
4
4
|
def merge_bboxes(
|
5
|
-
bboxes: List[Optional[Tuple[float, float, float, float]]]
|
5
|
+
bboxes: List[Optional[Tuple[float, float, float, float]]],
|
6
6
|
) -> Optional[Tuple[float, float, float, float]]:
|
7
7
|
"""
|
8
8
|
Merge multiple bounding boxes into a single one that encompasses all of them.
|
@@ -23,4 +23,4 @@ def merge_bboxes(
|
|
23
23
|
|
24
24
|
x0s, tops, x1s, bottoms = zip(*valid_bboxes)
|
25
25
|
|
26
|
-
return (min(x0s), min(tops), max(x1s), max(bottoms))
|
26
|
+
return (min(x0s), min(tops), max(x1s), max(bottoms))
|
natural_pdf/utils/packaging.py
CHANGED
@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
|
|
18
18
|
|
19
19
|
# Import the specific PDF/Page types if possible, otherwise use Any
|
20
20
|
if TYPE_CHECKING:
|
21
|
-
from natural_pdf.collections.pdf_collection import PDFCollection
|
22
21
|
from natural_pdf.core.page import Page
|
23
22
|
from natural_pdf.core.pdf import PDF
|
23
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
24
24
|
else:
|
25
25
|
PDF = Any
|
26
26
|
Page = Any
|
@@ -145,9 +145,10 @@ def create_correction_task_package(
|
|
145
145
|
image_filename = f"{pdf_short_id}_page_{page.index}.png"
|
146
146
|
image_save_path = os.path.join(images_dir, image_filename)
|
147
147
|
try:
|
148
|
-
|
148
|
+
# Use render() for clean image without highlights
|
149
|
+
img = page.render(resolution=resolution)
|
149
150
|
if img is None:
|
150
|
-
raise ValueError("page.
|
151
|
+
raise ValueError("page.render returned None")
|
151
152
|
img.save(image_save_path, "PNG")
|
152
153
|
except Exception as e:
|
153
154
|
logger.error(
|
natural_pdf/utils/text_extraction.py
CHANGED
@@ -175,28 +175,27 @@ def filter_chars_spatially(
|
|
175
175
|
|
176
176
|
|
177
177
|
def _apply_content_filter(
|
178
|
-
char_dicts: List[Dict[str, Any]],
|
179
|
-
content_filter: Union[str, Callable[[str], bool], List[str]]
|
178
|
+
char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
|
180
179
|
) -> List[Dict[str, Any]]:
|
181
180
|
"""
|
182
181
|
Applies content filtering to character dictionaries based on their text content.
|
183
|
-
|
182
|
+
|
184
183
|
Args:
|
185
184
|
char_dicts: List of character dictionaries to filter.
|
186
185
|
content_filter: Can be:
|
187
186
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
188
187
|
- A callable that takes text and returns True to KEEP the character
|
189
188
|
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
190
|
-
|
189
|
+
|
191
190
|
Returns:
|
192
191
|
Filtered list of character dictionaries.
|
193
192
|
"""
|
194
193
|
if not char_dicts or content_filter is None:
|
195
194
|
return char_dicts
|
196
|
-
|
195
|
+
|
197
196
|
initial_count = len(char_dicts)
|
198
197
|
filtered_chars = []
|
199
|
-
|
198
|
+
|
200
199
|
# Handle different filter types
|
201
200
|
if isinstance(content_filter, str):
|
202
201
|
# Single regex pattern - exclude matching characters
|
@@ -207,9 +206,11 @@ def _apply_content_filter(
|
|
207
206
|
if not pattern.search(text):
|
208
207
|
filtered_chars.append(char_dict)
|
209
208
|
except re.error as e:
|
210
|
-
logger.warning(
|
209
|
+
logger.warning(
|
210
|
+
f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
|
211
|
+
)
|
211
212
|
return char_dicts
|
212
|
-
|
213
|
+
|
213
214
|
elif isinstance(content_filter, list):
|
214
215
|
# List of regex patterns - exclude characters matching ANY pattern
|
215
216
|
try:
|
@@ -221,7 +222,7 @@ def _apply_content_filter(
|
|
221
222
|
except re.error as e:
|
222
223
|
logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
|
223
224
|
return char_dicts
|
224
|
-
|
225
|
+
|
225
226
|
elif callable(content_filter):
|
226
227
|
# Callable filter - keep characters where function returns True
|
227
228
|
try:
|
@@ -233,13 +234,15 @@ def _apply_content_filter(
|
|
233
234
|
logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
|
234
235
|
return char_dicts
|
235
236
|
else:
|
236
|
-
logger.warning(
|
237
|
+
logger.warning(
|
238
|
+
f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
|
239
|
+
)
|
237
240
|
return char_dicts
|
238
|
-
|
241
|
+
|
239
242
|
filtered_count = initial_count - len(filtered_chars)
|
240
243
|
if filtered_count > 0:
|
241
244
|
logger.debug(f"Content filter removed {filtered_count} characters.")
|
242
|
-
|
245
|
+
|
243
246
|
return filtered_chars
|
244
247
|
|
245
248
|
|