natural-pdf: natural_pdf-0.1.32-py3-none-any.whl → natural_pdf-0.1.34-py3-none-any.whl
This diff compares the contents of two publicly available versions of the package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -77,7 +77,6 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerW
|
|
77
77
|
# --- End Classification Imports --- #
|
78
78
|
|
79
79
|
|
80
|
-
|
81
80
|
# --- End Shape Detection Mixin --- #
|
82
81
|
|
83
82
|
|
@@ -94,26 +93,112 @@ logger = logging.getLogger(__name__)
|
|
94
93
|
|
95
94
|
|
96
95
|
class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
|
97
|
-
"""
|
98
|
-
Enhanced Page wrapper built on top of pdfplumber.Page.
|
96
|
+
"""Enhanced Page wrapper built on top of pdfplumber.Page.
|
99
97
|
|
100
98
|
This class provides a fluent interface for working with PDF pages,
|
101
99
|
with improved selection, navigation, extraction, and question-answering capabilities.
|
100
|
+
It integrates multiple analysis capabilities through mixins and provides spatial
|
101
|
+
navigation with CSS-like selectors.
|
102
|
+
|
103
|
+
The Page class serves as the primary interface for document analysis, offering:
|
104
|
+
- Element selection and spatial navigation
|
105
|
+
- OCR and layout analysis integration
|
106
|
+
- Table detection and extraction
|
107
|
+
- AI-powered classification and data extraction
|
108
|
+
- Visual debugging with highlighting and cropping
|
109
|
+
- Text style analysis and structure detection
|
110
|
+
|
111
|
+
Attributes:
|
112
|
+
index: Zero-based index of this page in the PDF.
|
113
|
+
number: One-based page number (index + 1).
|
114
|
+
width: Page width in points.
|
115
|
+
height: Page height in points.
|
116
|
+
bbox: Bounding box tuple (x0, top, x1, bottom) of the page.
|
117
|
+
chars: Collection of character elements on the page.
|
118
|
+
words: Collection of word elements on the page.
|
119
|
+
lines: Collection of line elements on the page.
|
120
|
+
rects: Collection of rectangle elements on the page.
|
121
|
+
images: Collection of image elements on the page.
|
122
|
+
metadata: Dictionary for storing analysis results and custom data.
|
123
|
+
|
124
|
+
Example:
|
125
|
+
Basic usage:
|
126
|
+
```python
|
127
|
+
pdf = npdf.PDF("document.pdf")
|
128
|
+
page = pdf.pages[0]
|
129
|
+
|
130
|
+
# Find elements with CSS-like selectors
|
131
|
+
headers = page.find_all('text[size>12]:bold')
|
132
|
+
summaries = page.find('text:contains("Summary")')
|
133
|
+
|
134
|
+
# Spatial navigation
|
135
|
+
content_below = summaries.below(until='text[size>12]:bold')
|
136
|
+
|
137
|
+
# Table extraction
|
138
|
+
tables = page.extract_table()
|
139
|
+
```
|
140
|
+
|
141
|
+
Advanced usage:
|
142
|
+
```python
|
143
|
+
# Apply OCR if needed
|
144
|
+
page.apply_ocr(engine='easyocr', resolution=300)
|
145
|
+
|
146
|
+
# Layout analysis
|
147
|
+
page.analyze_layout(engine='yolo')
|
148
|
+
|
149
|
+
# AI-powered extraction
|
150
|
+
data = page.extract_structured_data(MySchema)
|
151
|
+
|
152
|
+
# Visual debugging
|
153
|
+
page.find('text:contains("Important")').show()
|
154
|
+
```
|
102
155
|
"""
|
103
156
|
|
104
|
-
def __init__(
|
105
|
-
|
106
|
-
|
157
|
+
def __init__(
|
158
|
+
self,
|
159
|
+
page: "pdfplumber.page.Page",
|
160
|
+
parent: "PDF",
|
161
|
+
index: int,
|
162
|
+
font_attrs=None,
|
163
|
+
load_text: bool = True,
|
164
|
+
):
|
165
|
+
"""Initialize a page wrapper.
|
166
|
+
|
167
|
+
Creates an enhanced Page object that wraps a pdfplumber page with additional
|
168
|
+
functionality for spatial navigation, analysis, and AI-powered extraction.
|
107
169
|
|
108
170
|
Args:
|
109
|
-
page: pdfplumber page object
|
110
|
-
parent: Parent PDF object
|
111
|
-
|
112
|
-
|
171
|
+
page: The underlying pdfplumber page object that provides raw PDF data.
|
172
|
+
parent: Parent PDF object that contains this page and provides access
|
173
|
+
to managers and global settings.
|
174
|
+
index: Zero-based index of this page in the PDF document.
|
175
|
+
font_attrs: List of font attributes to consider when grouping characters
|
176
|
+
into words. Common attributes include ['fontname', 'size', 'flags'].
|
177
|
+
If None, uses default character-to-word grouping rules.
|
178
|
+
load_text: If True, load and process text elements from the PDF's text layer.
|
179
|
+
If False, skip text layer processing (useful for OCR-only workflows).
|
180
|
+
|
181
|
+
Note:
|
182
|
+
This constructor is typically called automatically when accessing pages
|
183
|
+
through the PDF.pages collection. Direct instantiation is rarely needed.
|
184
|
+
|
185
|
+
Example:
|
186
|
+
```python
|
187
|
+
# Pages are usually accessed through the PDF object
|
188
|
+
pdf = npdf.PDF("document.pdf")
|
189
|
+
page = pdf.pages[0] # Page object created automatically
|
190
|
+
|
191
|
+
# Direct construction (advanced usage)
|
192
|
+
import pdfplumber
|
193
|
+
with pdfplumber.open("document.pdf") as plumber_pdf:
|
194
|
+
plumber_page = plumber_pdf.pages[0]
|
195
|
+
page = Page(plumber_page, pdf, 0, load_text=True)
|
196
|
+
```
|
113
197
|
"""
|
114
198
|
self._page = page
|
115
199
|
self._parent = parent
|
116
200
|
self._index = index
|
201
|
+
self._load_text = load_text
|
117
202
|
self._text_styles = None # Lazy-loaded text style analyzer results
|
118
203
|
self._exclusions = [] # List to store exclusion functions/regions
|
119
204
|
self._skew_angle: Optional[float] = None # Stores detected skew angle
|
@@ -136,7 +221,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
136
221
|
self._config = dict(getattr(self._parent, "_config", {}))
|
137
222
|
|
138
223
|
# Initialize ElementManager, passing font_attrs
|
139
|
-
self._element_mgr = ElementManager(self, font_attrs=font_attrs)
|
224
|
+
self._element_mgr = ElementManager(self, font_attrs=font_attrs, load_text=self._load_text)
|
140
225
|
# self._highlighter = HighlightingService(self) # REMOVED - Use property accessor
|
141
226
|
# --- NEW --- Central registry for analysis results
|
142
227
|
self.analyses: Dict[str, Any] = {}
|
@@ -1188,6 +1273,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1188
1273
|
if _contains_rtl(result):
|
1189
1274
|
try:
|
1190
1275
|
from bidi.algorithm import get_display # type: ignore
|
1276
|
+
|
1191
1277
|
from natural_pdf.utils.bidi_mirror import mirror_brackets
|
1192
1278
|
|
1193
1279
|
result = "\n".join(
|
@@ -1197,8 +1283,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1197
1283
|
base_dir=(
|
1198
1284
|
"R"
|
1199
1285
|
if any(
|
1200
|
-
unicodedata.bidirectional(ch)
|
1201
|
-
in ("R", "AL", "AN")
|
1286
|
+
unicodedata.bidirectional(ch) in ("R", "AL", "AN")
|
1202
1287
|
for ch in line
|
1203
1288
|
)
|
1204
1289
|
else "L"
|
@@ -1394,11 +1479,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
1394
1479
|
table_settings.setdefault("text_y_tolerance", y_tol)
|
1395
1480
|
|
1396
1481
|
# pdfplumber's text strategy benefits from a tight snap tolerance.
|
1397
|
-
if
|
1482
|
+
if (
|
1483
|
+
"snap_tolerance" not in table_settings
|
1484
|
+
and "snap_x_tolerance" not in table_settings
|
1485
|
+
):
|
1398
1486
|
# Derive from y_tol if available, else default 1
|
1399
1487
|
snap = max(1, round((pdf_cfg.get("y_tolerance", 1)) * 0.9))
|
1400
1488
|
table_settings.setdefault("snap_tolerance", snap)
|
1401
|
-
if
|
1489
|
+
if (
|
1490
|
+
"join_tolerance" not in table_settings
|
1491
|
+
and "join_x_tolerance" not in table_settings
|
1492
|
+
):
|
1402
1493
|
join = table_settings.get("snap_tolerance", 1)
|
1403
1494
|
table_settings.setdefault("join_tolerance", join)
|
1404
1495
|
table_settings.setdefault("join_x_tolerance", join)
|
@@ -2996,7 +3087,32 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
|
|
2996
3087
|
InspectionSummary with element tables showing coordinates,
|
2997
3088
|
properties, and other details for each element
|
2998
3089
|
"""
|
2999
|
-
return self.find_all(
|
3090
|
+
return self.find_all("*").inspect(limit=limit)
|
3091
|
+
|
3092
|
+
def remove_text_layer(self) -> "Page":
|
3093
|
+
"""
|
3094
|
+
Remove all text elements from this page.
|
3095
|
+
|
3096
|
+
This removes all text elements (words and characters) from the page,
|
3097
|
+
effectively clearing the text layer.
|
3098
|
+
|
3099
|
+
Returns:
|
3100
|
+
Self for method chaining
|
3101
|
+
"""
|
3102
|
+
logger.info(f"Page {self.number}: Removing all text elements...")
|
3103
|
+
|
3104
|
+
# Remove all words and chars from the element manager
|
3105
|
+
removed_words = len(self._element_mgr.words)
|
3106
|
+
removed_chars = len(self._element_mgr.chars)
|
3107
|
+
|
3108
|
+
# Clear the lists
|
3109
|
+
self._element_mgr._elements["words"] = []
|
3110
|
+
self._element_mgr._elements["chars"] = []
|
3111
|
+
|
3112
|
+
logger.info(
|
3113
|
+
f"Page {self.number}: Removed {removed_words} words and {removed_chars} characters"
|
3114
|
+
)
|
3115
|
+
return self
|
3000
3116
|
|
3001
3117
|
@property
|
3002
3118
|
def lines(self) -> List[Any]:
|