natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,38 @@
|
|
1
1
|
# layout_detector_paddle.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
3
|
+
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
-
from typing import
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
|
7
8
|
from PIL import Image
|
8
9
|
|
9
10
|
# Assuming base class and options are importable
|
10
11
|
try:
|
11
12
|
from .base import LayoutDetector
|
12
|
-
from .layout_options import
|
13
|
+
from .layout_options import BaseLayoutOptions, PaddleLayoutOptions
|
13
14
|
except ImportError:
|
14
15
|
# Placeholders if run standalone or imports fail
|
15
|
-
class BaseLayoutOptions:
|
16
|
-
|
16
|
+
class BaseLayoutOptions:
|
17
|
+
pass
|
18
|
+
|
19
|
+
class PaddleLayoutOptions(BaseLayoutOptions):
|
20
|
+
pass
|
21
|
+
|
17
22
|
class LayoutDetector:
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
23
|
+
def __init__(self):
|
24
|
+
self.logger = logging.getLogger()
|
25
|
+
self.supported_classes = set()
|
26
|
+
|
27
|
+
def _get_model(self, options):
|
28
|
+
raise NotImplementedError
|
29
|
+
|
30
|
+
def _normalize_class_name(self, n):
|
31
|
+
return n
|
32
|
+
|
33
|
+
def validate_classes(self, c):
|
34
|
+
pass
|
35
|
+
|
22
36
|
logging.basicConfig()
|
23
37
|
|
24
38
|
logger = logging.getLogger(__name__)
|
@@ -27,15 +41,17 @@ logger = logging.getLogger(__name__)
|
|
27
41
|
paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
|
28
42
|
paddleocr_spec = importlib.util.find_spec("paddleocr")
|
29
43
|
PPStructure = None
|
30
|
-
PaddleOCR = None
|
44
|
+
PaddleOCR = None # For optional text detection
|
31
45
|
|
32
46
|
if paddle_spec and paddleocr_spec:
|
33
47
|
try:
|
34
|
-
from paddleocr import
|
48
|
+
from paddleocr import PaddleOCR, PPStructure
|
35
49
|
except ImportError as e:
|
36
50
|
logger.warning(f"Could not import Paddle dependencies: {e}")
|
37
51
|
else:
|
38
|
-
logger.warning(
|
52
|
+
logger.warning(
|
53
|
+
"paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
|
54
|
+
)
|
39
55
|
|
40
56
|
|
41
57
|
class PaddleLayoutDetector(LayoutDetector):
|
@@ -45,9 +61,17 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
45
61
|
super().__init__()
|
46
62
|
# Supported classes by PP-Structure (adjust based on model version/capabilities)
|
47
63
|
self.supported_classes = {
|
48
|
-
|
49
|
-
|
50
|
-
|
64
|
+
"text",
|
65
|
+
"title",
|
66
|
+
"figure",
|
67
|
+
"figure_caption",
|
68
|
+
"table",
|
69
|
+
"table_caption",
|
70
|
+
"table_cell", # Added table_cell
|
71
|
+
"header",
|
72
|
+
"footer",
|
73
|
+
"reference",
|
74
|
+
"equation",
|
51
75
|
# PP-StructureV2 might add others like list, pub_number etc.
|
52
76
|
}
|
53
77
|
# Models are loaded via _get_model
|
@@ -59,9 +83,9 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
59
83
|
def _get_cache_key(self, options: BaseLayoutOptions) -> str:
|
60
84
|
"""Generate cache key based on language and device."""
|
61
85
|
if not isinstance(options, PaddleLayoutOptions):
|
62
|
-
options = PaddleLayoutOptions(device=options.device)
|
86
|
+
options = PaddleLayoutOptions(device=options.device) # Use base device
|
63
87
|
|
64
|
-
device_key = str(options.device).lower() if options.device else
|
88
|
+
device_key = str(options.device).lower() if options.device else "default_device"
|
65
89
|
lang_key = options.lang
|
66
90
|
# Key could also include enable_table, use_angle_cls if these affect model loading fundamentally
|
67
91
|
# For PPStructure, they are primarily runtime flags, so lang/device might suffice for caching the *instance*.
|
@@ -70,12 +94,14 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
70
94
|
def _load_model_from_options(self, options: BaseLayoutOptions) -> Any:
|
71
95
|
"""Load the PPStructure model based on options."""
|
72
96
|
if not self.is_available():
|
73
|
-
|
97
|
+
raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
|
74
98
|
|
75
99
|
if not isinstance(options, PaddleLayoutOptions):
|
76
100
|
raise TypeError("Incorrect options type provided for Paddle model loading.")
|
77
101
|
|
78
|
-
self.logger.info(
|
102
|
+
self.logger.info(
|
103
|
+
f"Loading PPStructure model (lang={options.lang}, device={options.device}, table={options.enable_table})..."
|
104
|
+
)
|
79
105
|
try:
|
80
106
|
# PPStructure init takes several arguments that control runtime behavior
|
81
107
|
# We cache the instance based on lang/device, assuming other flags don't require reloading.
|
@@ -86,15 +112,17 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
86
112
|
# However, PPStructure call signature is simple (__call__(self, img, ...))
|
87
113
|
# So, we likely need to initialize with most settings.
|
88
114
|
model_instance = PPStructure(
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
115
|
+
lang=options.lang,
|
116
|
+
use_gpu=(
|
117
|
+
"cuda" in str(options.device).lower() or "gpu" in str(options.device).lower()
|
118
|
+
),
|
119
|
+
use_angle_cls=options.use_angle_cls,
|
120
|
+
show_log=options.show_log,
|
121
|
+
layout=True, # Ensure layout analysis is on
|
122
|
+
table=options.enable_table, # Control table analysis
|
123
|
+
ocr=False, # Usually disable internal OCR if only using for layout/table
|
124
|
+
# Add other PPStructure init args from options.extra_args if needed
|
125
|
+
# **options.extra_args
|
98
126
|
)
|
99
127
|
self.logger.info("PPStructure model loaded.")
|
100
128
|
return model_instance
|
@@ -108,13 +136,17 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
108
136
|
raise RuntimeError("Paddle dependencies (paddlepaddle, paddleocr) not installed.")
|
109
137
|
|
110
138
|
if not isinstance(options, PaddleLayoutOptions):
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
139
|
+
self.logger.warning(
|
140
|
+
"Received BaseLayoutOptions, expected PaddleLayoutOptions. Using defaults."
|
141
|
+
)
|
142
|
+
options = PaddleLayoutOptions(
|
143
|
+
confidence=options.confidence,
|
144
|
+
classes=options.classes,
|
145
|
+
exclude_classes=options.exclude_classes,
|
146
|
+
device=options.device,
|
147
|
+
extra_args=options.extra_args,
|
148
|
+
# Other Paddle options will use defaults
|
149
|
+
)
|
118
150
|
|
119
151
|
self.validate_classes(options.classes or [])
|
120
152
|
if options.exclude_classes:
|
@@ -128,8 +160,10 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
128
160
|
with tempfile.TemporaryDirectory() as temp_dir:
|
129
161
|
temp_image_path = os.path.join(temp_dir, f"paddle_input_{os.getpid()}.png")
|
130
162
|
try:
|
131
|
-
self.logger.debug(
|
132
|
-
|
163
|
+
self.logger.debug(
|
164
|
+
f"Saving temporary image for Paddle detector to: {temp_image_path}"
|
165
|
+
)
|
166
|
+
image.convert("RGB").save(temp_image_path) # Ensure RGB
|
133
167
|
|
134
168
|
# Process image with PP-Structure instance
|
135
169
|
# The instance was configured during _load_model_from_options
|
@@ -141,15 +175,19 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
141
175
|
self.logger.error(f"Error during PPStructure analysis: {e}", exc_info=True)
|
142
176
|
# Clean up temp file before raising or returning
|
143
177
|
if os.path.exists(temp_image_path):
|
144
|
-
try:
|
145
|
-
|
146
|
-
|
178
|
+
try:
|
179
|
+
os.remove(temp_image_path)
|
180
|
+
except OSError as e_rm:
|
181
|
+
self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
|
182
|
+
raise # Re-raise error
|
147
183
|
|
148
184
|
finally:
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
185
|
+
# Ensure cleanup even if analysis worked
|
186
|
+
if os.path.exists(temp_image_path):
|
187
|
+
try:
|
188
|
+
os.remove(temp_image_path)
|
189
|
+
except OSError as e_rm:
|
190
|
+
self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
|
153
191
|
|
154
192
|
# --- Process Results ---
|
155
193
|
if not result:
|
@@ -157,66 +195,85 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
157
195
|
return []
|
158
196
|
|
159
197
|
# Prepare normalized class filters once
|
160
|
-
normalized_classes_req =
|
161
|
-
|
198
|
+
normalized_classes_req = (
|
199
|
+
{self._normalize_class_name(c) for c in options.classes} if options.classes else None
|
200
|
+
)
|
201
|
+
normalized_classes_excl = (
|
202
|
+
{self._normalize_class_name(c) for c in options.exclude_classes}
|
203
|
+
if options.exclude_classes
|
204
|
+
else set()
|
205
|
+
)
|
162
206
|
|
163
207
|
for region in result:
|
164
208
|
try:
|
165
|
-
region_type_orig = region.get(
|
209
|
+
region_type_orig = region.get("type", "unknown")
|
166
210
|
# Handle potential list returns for type (seen in some versions)
|
167
211
|
if isinstance(region_type_orig, list):
|
168
|
-
|
212
|
+
region_type_orig = region_type_orig[0] if region_type_orig else "unknown"
|
169
213
|
|
170
214
|
region_type = region_type_orig.lower()
|
171
215
|
normalized_class = self._normalize_class_name(region_type)
|
172
216
|
|
173
217
|
# Apply class filtering
|
174
|
-
if normalized_classes_req and normalized_class not in normalized_classes_req:
|
175
|
-
|
218
|
+
if normalized_classes_req and normalized_class not in normalized_classes_req:
|
219
|
+
continue
|
220
|
+
if normalized_class in normalized_classes_excl:
|
221
|
+
continue
|
176
222
|
|
177
223
|
# PP-Structure results don't always have confidence, use threshold or default
|
178
|
-
confidence_score = region.get(
|
179
|
-
if confidence_score < options.confidence:
|
224
|
+
confidence_score = region.get("score", 1.0) # Default to 1.0 if missing
|
225
|
+
if confidence_score < options.confidence:
|
226
|
+
continue
|
180
227
|
|
181
|
-
bbox = region.get(
|
228
|
+
bbox = region.get("bbox")
|
182
229
|
if not bbox or len(bbox) != 4:
|
183
|
-
|
184
|
-
|
230
|
+
self.logger.warning(f"Skipping region with invalid bbox: {region}")
|
231
|
+
continue
|
185
232
|
x_min, y_min, x_max, y_max = map(float, bbox)
|
186
233
|
|
187
234
|
# Add detection
|
188
235
|
detection_data = {
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
236
|
+
"bbox": (x_min, y_min, x_max, y_max),
|
237
|
+
"class": region_type_orig, # Keep original case if needed
|
238
|
+
"confidence": confidence_score,
|
239
|
+
"normalized_class": normalized_class,
|
240
|
+
"source": "layout",
|
241
|
+
"model": "paddle",
|
195
242
|
}
|
196
243
|
detections.append(detection_data)
|
197
244
|
|
198
245
|
# --- Process Table Cells (if enabled and present) ---
|
199
|
-
if region_type ==
|
200
|
-
process_cells = (
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
246
|
+
if region_type == "table" and options.enable_table and "res" in region:
|
247
|
+
process_cells = (
|
248
|
+
normalized_classes_req is None or "table-cell" in normalized_classes_req
|
249
|
+
) and ("table-cell" not in normalized_classes_excl)
|
250
|
+
|
251
|
+
if process_cells and isinstance(region["res"], list): # V2 structure
|
252
|
+
for cell in region["res"]:
|
253
|
+
if "box" not in cell or len(cell["box"]) != 4:
|
254
|
+
continue
|
255
|
+
cell_bbox = cell["box"]
|
256
|
+
cell_x_min, cell_y_min, cell_x_max, cell_y_max = map(float, cell_bbox)
|
257
|
+
# Add cell detection (confidence often not available per cell)
|
258
|
+
detections.append(
|
259
|
+
{
|
260
|
+
"bbox": (cell_x_min, cell_y_min, cell_x_max, cell_y_max),
|
261
|
+
"class": "table cell", # Standardize name
|
262
|
+
"confidence": confidence_score
|
263
|
+
* 0.95, # Inherit table confidence (slightly reduced)
|
264
|
+
"normalized_class": "table-cell",
|
265
|
+
"text": cell.get("text", ""), # Include text if available
|
266
|
+
"source": "layout",
|
267
|
+
"model": "paddle",
|
268
|
+
}
|
269
|
+
)
|
270
|
+
elif (
|
271
|
+
process_cells
|
272
|
+
and isinstance(region["res"], dict)
|
273
|
+
and "cells" in region["res"]
|
274
|
+
): # Older structure
|
275
|
+
# Handle older 'cells' list if needed (logic from original file)
|
276
|
+
pass # Add logic based on original paddle.txt if supporting older PP-Structure
|
220
277
|
|
221
278
|
except (TypeError, KeyError, IndexError, ValueError) as e:
|
222
279
|
self.logger.warning(f"Error processing Paddle region: {region}. Error: {e}")
|
@@ -224,17 +281,17 @@ class PaddleLayoutDetector(LayoutDetector):
|
|
224
281
|
|
225
282
|
# --- Optional: Add Text Boxes from separate OCR run ---
|
226
283
|
if options.detect_text:
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
284
|
+
# This requires another model instance (PaddleOCR) and adds complexity.
|
285
|
+
# Consider if this is truly needed or if layout regions are sufficient.
|
286
|
+
# If needed, implement similar to original paddle.txt:
|
287
|
+
# - Instantiate PaddleOCR (potentially cache separately)
|
288
|
+
# - Run ocr(img_path, det=True, rec=False)
|
289
|
+
# - Process results, adding 'text' class detections
|
290
|
+
self.logger.info("Paddle detect_text=True: Running separate OCR text detection...")
|
291
|
+
# (Implementation omitted for brevity - requires PaddleOCR instance)
|
292
|
+
pass
|
293
|
+
|
294
|
+
self.logger.info(
|
295
|
+
f"PaddleLayout detected {len(detections)} layout elements matching criteria."
|
296
|
+
)
|
239
297
|
return detections
|
240
|
-
|