natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,38 @@
|
|
1
1
|
# layout_detector_docling.py
|
2
|
-
import logging
|
3
2
|
import importlib.util
|
3
|
+
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
-
from typing import
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
|
+
|
7
8
|
from PIL import Image
|
8
9
|
|
9
10
|
# Assuming base class and options are importable
|
10
11
|
try:
|
11
12
|
from .base import LayoutDetector
|
12
|
-
from .layout_options import
|
13
|
+
from .layout_options import BaseLayoutOptions, DoclingLayoutOptions
|
13
14
|
except ImportError:
|
14
15
|
# Placeholders if run standalone or imports fail
|
15
|
-
class BaseLayoutOptions:
|
16
|
-
|
16
|
+
class BaseLayoutOptions:
|
17
|
+
pass
|
18
|
+
|
19
|
+
class DoclingLayoutOptions(BaseLayoutOptions):
|
20
|
+
pass
|
21
|
+
|
17
22
|
class LayoutDetector:
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
23
|
+
def __init__(self):
|
24
|
+
self.logger = logging.getLogger()
|
25
|
+
self.supported_classes = set()
|
26
|
+
|
27
|
+
def _get_model(self, options):
|
28
|
+
raise NotImplementedError
|
29
|
+
|
30
|
+
def _normalize_class_name(self, n):
|
31
|
+
return n
|
32
|
+
|
33
|
+
def validate_classes(self, c):
|
34
|
+
pass
|
35
|
+
|
22
36
|
logging.basicConfig()
|
23
37
|
|
24
38
|
logger = logging.getLogger(__name__)
|
@@ -42,11 +56,27 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
42
56
|
super().__init__()
|
43
57
|
# Docling classes are dynamic/hierarchical, define common ones
|
44
58
|
self.supported_classes = {
|
45
|
-
|
46
|
-
|
47
|
-
|
59
|
+
"Header",
|
60
|
+
"Footer",
|
61
|
+
"Paragraph",
|
62
|
+
"Heading",
|
63
|
+
"List",
|
64
|
+
"ListItem",
|
65
|
+
"Table",
|
66
|
+
"Figure",
|
67
|
+
"Caption",
|
68
|
+
"Footnote",
|
69
|
+
"PageNumber",
|
70
|
+
"Equation",
|
71
|
+
"Code",
|
72
|
+
"Title",
|
73
|
+
"Author",
|
74
|
+
"Abstract",
|
75
|
+
"Section",
|
76
|
+
"Unknown",
|
77
|
+
"Metadata", # Add more as needed
|
48
78
|
}
|
49
|
-
self._docling_document_cache = {}
|
79
|
+
self._docling_document_cache = {} # Cache the output doc per image/options if needed
|
50
80
|
|
51
81
|
def is_available(self) -> bool:
|
52
82
|
"""Check if docling is installed."""
|
@@ -55,9 +85,9 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
55
85
|
def _get_cache_key(self, options: BaseLayoutOptions) -> str:
|
56
86
|
"""Generate cache key based on device and potentially converter args."""
|
57
87
|
if not isinstance(options, DoclingLayoutOptions):
|
58
|
-
|
88
|
+
options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
|
59
89
|
|
60
|
-
device_key = str(options.device).lower() if options.device else
|
90
|
+
device_key = str(options.device).lower() if options.device else "default_device"
|
61
91
|
# Include hash of extra_args if they affect model loading/converter init
|
62
92
|
extra_args_key = hash(frozenset(options.extra_args.items()))
|
63
93
|
return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"
|
@@ -88,12 +118,17 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
88
118
|
raise RuntimeError("Docling dependency not installed.")
|
89
119
|
|
90
120
|
if not isinstance(options, DoclingLayoutOptions):
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
121
|
+
self.logger.warning(
|
122
|
+
"Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults."
|
123
|
+
)
|
124
|
+
options = DoclingLayoutOptions(
|
125
|
+
confidence=options.confidence,
|
126
|
+
classes=options.classes,
|
127
|
+
exclude_classes=options.exclude_classes,
|
128
|
+
device=options.device,
|
129
|
+
extra_args=options.extra_args,
|
130
|
+
verbose=options.extra_args.get("verbose", False),
|
131
|
+
)
|
97
132
|
|
98
133
|
# Validate classes before proceeding (note: Docling classes are case-sensitive)
|
99
134
|
# self.validate_classes(options.classes or []) # Validation might be tricky due to case sensitivity
|
@@ -105,18 +140,20 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
105
140
|
|
106
141
|
# Docling convert method requires an image path. Save temp file.
|
107
142
|
detections = []
|
108
|
-
docling_doc = None
|
143
|
+
docling_doc = None # To store the result
|
109
144
|
with tempfile.TemporaryDirectory() as temp_dir:
|
110
145
|
temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
|
111
146
|
try:
|
112
|
-
self.logger.debug(
|
113
|
-
|
147
|
+
self.logger.debug(
|
148
|
+
f"Saving temporary image for Docling detector to: {temp_image_path}"
|
149
|
+
)
|
150
|
+
image.convert("RGB").save(temp_image_path) # Ensure RGB
|
114
151
|
|
115
152
|
# Convert the document using Docling's DocumentConverter
|
116
153
|
self.logger.debug("Running Docling conversion...")
|
117
154
|
# Docling convert returns a Result object with a 'document' attribute
|
118
155
|
result = converter.convert(temp_image_path)
|
119
|
-
docling_doc = result.document
|
156
|
+
docling_doc = result.document # Store the DoclingDocument
|
120
157
|
self.logger.info(f"Docling conversion complete.")
|
121
158
|
|
122
159
|
# Convert Docling document to our detection format
|
@@ -124,12 +161,14 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
124
161
|
|
125
162
|
except Exception as e:
|
126
163
|
self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
|
127
|
-
raise
|
164
|
+
raise # Re-raise the exception
|
128
165
|
finally:
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
166
|
+
# Ensure temp file is removed
|
167
|
+
if os.path.exists(temp_image_path):
|
168
|
+
try:
|
169
|
+
os.remove(temp_image_path)
|
170
|
+
except OSError as e_rm:
|
171
|
+
self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
|
133
172
|
|
134
173
|
# Cache the docling document if needed elsewhere (maybe associate with page?)
|
135
174
|
# self._docling_document_cache[image_hash] = docling_doc # Needs a way to key this
|
@@ -137,26 +176,37 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
137
176
|
self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
|
138
177
|
return detections
|
139
178
|
|
140
|
-
def _convert_docling_to_detections(
|
179
|
+
def _convert_docling_to_detections(
|
180
|
+
self, doc, options: DoclingLayoutOptions
|
181
|
+
) -> List[Dict[str, Any]]:
|
141
182
|
"""Convert a Docling document to our standard detection format."""
|
142
|
-
if not doc or not hasattr(doc,
|
183
|
+
if not doc or not hasattr(doc, "pages") or not doc.pages:
|
143
184
|
self.logger.warning("Invalid or empty Docling document for conversion.")
|
144
185
|
return []
|
145
186
|
|
146
187
|
detections = []
|
147
|
-
id_to_detection_index = {}
|
188
|
+
id_to_detection_index = {} # Map Docling ID to index in detections list
|
148
189
|
|
149
190
|
# Prepare normalized class filters once
|
150
|
-
normalized_classes_req =
|
151
|
-
|
191
|
+
normalized_classes_req = (
|
192
|
+
{self._normalize_class_name(c) for c in options.classes} if options.classes else None
|
193
|
+
)
|
194
|
+
normalized_classes_excl = (
|
195
|
+
{self._normalize_class_name(c) for c in options.exclude_classes}
|
196
|
+
if options.exclude_classes
|
197
|
+
else set()
|
198
|
+
)
|
152
199
|
|
153
200
|
# --- Iterate through elements using Docling's structure ---
|
154
201
|
# This requires traversing the hierarchy (e.g., doc.body.children)
|
155
202
|
# or iterating through specific lists like doc.texts, doc.tables etc.
|
156
203
|
elements_to_process = []
|
157
|
-
if hasattr(doc,
|
158
|
-
|
159
|
-
if hasattr(doc,
|
204
|
+
if hasattr(doc, "texts"):
|
205
|
+
elements_to_process.extend(doc.texts)
|
206
|
+
if hasattr(doc, "tables"):
|
207
|
+
elements_to_process.extend(doc.tables)
|
208
|
+
if hasattr(doc, "pictures"):
|
209
|
+
elements_to_process.extend(doc.pictures)
|
160
210
|
# Add other element types from DoclingDocument as needed
|
161
211
|
|
162
212
|
self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")
|
@@ -164,16 +214,19 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
164
214
|
for elem in elements_to_process:
|
165
215
|
try:
|
166
216
|
# Get Provenance (bbox and page number)
|
167
|
-
if not hasattr(elem,
|
168
|
-
|
169
|
-
|
217
|
+
if not hasattr(elem, "prov") or not elem.prov:
|
218
|
+
continue
|
219
|
+
prov = elem.prov[0] # Use first provenance
|
220
|
+
if not hasattr(prov, "bbox") or not prov.bbox:
|
221
|
+
continue
|
170
222
|
bbox = prov.bbox
|
171
223
|
page_no = prov.page_no
|
172
224
|
|
173
225
|
# Get Page Dimensions (crucial for coordinate conversion)
|
174
|
-
if not hasattr(doc.pages.get(page_no),
|
226
|
+
if not hasattr(doc.pages.get(page_no), "size"):
|
227
|
+
continue
|
175
228
|
page_height = doc.pages[page_no].size.height
|
176
|
-
page_width = doc.pages[page_no].size.width
|
229
|
+
page_width = doc.pages[page_no].size.width # Needed? Bbox seems absolute
|
177
230
|
|
178
231
|
# Convert coordinates from Docling's system (often bottom-left origin)
|
179
232
|
# to standard top-left origin (0,0 at top-left)
|
@@ -182,46 +235,51 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
182
235
|
x1 = float(bbox.r)
|
183
236
|
# Convert y: top_y = page_height - bottom_left_t
|
184
237
|
# bottom_y = page_height - bottom_left_b
|
185
|
-
y0 = float(page_height - bbox.t)
|
186
|
-
y1 = float(page_height - bbox.b)
|
238
|
+
y0 = float(page_height - bbox.t) # Top y
|
239
|
+
y1 = float(page_height - bbox.b) # Bottom y
|
187
240
|
|
188
241
|
# Ensure y0 < y1
|
189
|
-
if y0 > y1:
|
242
|
+
if y0 > y1:
|
243
|
+
y0, y1 = y1, y0
|
190
244
|
# Ensure x0 < x1
|
191
|
-
if x0 > x1:
|
245
|
+
if x0 > x1:
|
246
|
+
x0, x1 = x1, x0
|
192
247
|
|
193
248
|
# Get Class Label
|
194
|
-
label_orig = str(getattr(elem,
|
249
|
+
label_orig = str(getattr(elem, "label", "Unknown")) # Default if no label
|
195
250
|
normalized_label = self._normalize_class_name(label_orig)
|
196
251
|
|
197
252
|
# Apply Class Filtering
|
198
|
-
if normalized_classes_req and normalized_label not in normalized_classes_req:
|
199
|
-
|
253
|
+
if normalized_classes_req and normalized_label not in normalized_classes_req:
|
254
|
+
continue
|
255
|
+
if normalized_label in normalized_classes_excl:
|
256
|
+
continue
|
200
257
|
|
201
258
|
# Get Confidence (Docling often doesn't provide per-element confidence)
|
202
|
-
confidence = getattr(elem,
|
203
|
-
if confidence < options.confidence:
|
259
|
+
confidence = getattr(elem, "confidence", 0.95) # Assign default confidence
|
260
|
+
if confidence < options.confidence:
|
261
|
+
continue # Apply confidence threshold
|
204
262
|
|
205
263
|
# Get Text Content
|
206
|
-
text_content = getattr(elem,
|
264
|
+
text_content = getattr(elem, "text", None)
|
207
265
|
|
208
266
|
# Get IDs for hierarchy
|
209
|
-
docling_id = getattr(elem,
|
210
|
-
parent_id_obj = getattr(elem,
|
211
|
-
parent_id = getattr(parent_id_obj,
|
267
|
+
docling_id = getattr(elem, "self_ref", None)
|
268
|
+
parent_id_obj = getattr(elem, "parent", None)
|
269
|
+
parent_id = getattr(parent_id_obj, "self_ref", None) if parent_id_obj else None
|
212
270
|
|
213
271
|
# Create Detection Dictionary
|
214
272
|
detection = {
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
273
|
+
"bbox": (x0, y0, x1, y1),
|
274
|
+
"class": label_orig,
|
275
|
+
"normalized_class": normalized_label,
|
276
|
+
"confidence": confidence,
|
277
|
+
"text": text_content,
|
278
|
+
"docling_id": docling_id,
|
279
|
+
"parent_id": parent_id,
|
280
|
+
"page_number": page_no, # Add page number if useful
|
281
|
+
"source": "layout",
|
282
|
+
"model": "docling",
|
225
283
|
}
|
226
284
|
detections.append(detection)
|
227
285
|
|
@@ -229,8 +287,8 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
229
287
|
# if docling_id: id_to_detection_index[docling_id] = len(detections) - 1
|
230
288
|
|
231
289
|
except Exception as conv_e:
|
232
|
-
|
233
|
-
|
290
|
+
self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
|
291
|
+
continue
|
234
292
|
|
235
293
|
return detections
|
236
294
|
|
@@ -241,7 +299,8 @@ class DoclingLayoutDetector(LayoutDetector):
|
|
241
299
|
"""
|
242
300
|
# This requires caching the doc based on image/options or re-running.
|
243
301
|
# For simplicity, let's just re-run detect if needed.
|
244
|
-
self.logger.warning(
|
245
|
-
|
246
|
-
|
247
|
-
|
302
|
+
self.logger.warning(
|
303
|
+
"get_docling_document: Re-running detection to ensure document is generated."
|
304
|
+
)
|
305
|
+
self.detect(image, options) # Run detect to populate internal doc
|
306
|
+
return getattr(self, "_docling_document", None) # Return the stored doc
|