natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,24 +1,38 @@
1
1
  # layout_detector_docling.py
2
- import logging
3
2
  import importlib.util
3
+ import logging
4
4
  import os
5
5
  import tempfile
6
- from typing import List, Dict, Any, Optional
6
+ from typing import Any, Dict, List, Optional
7
+
7
8
  from PIL import Image
8
9
 
9
10
  # Assuming base class and options are importable
10
11
  try:
11
12
  from .base import LayoutDetector
12
- from .layout_options import DoclingLayoutOptions, BaseLayoutOptions
13
+ from .layout_options import BaseLayoutOptions, DoclingLayoutOptions
13
14
  except ImportError:
14
15
  # Placeholders if run standalone or imports fail
15
- class BaseLayoutOptions: pass
16
- class DoclingLayoutOptions(BaseLayoutOptions): pass
16
+ class BaseLayoutOptions:
17
+ pass
18
+
19
+ class DoclingLayoutOptions(BaseLayoutOptions):
20
+ pass
21
+
17
22
  class LayoutDetector:
18
- def __init__(self): self.logger=logging.getLogger(); self.supported_classes=set()
19
- def _get_model(self, options): raise NotImplementedError
20
- def _normalize_class_name(self, n): return n
21
- def validate_classes(self, c): pass
23
+ def __init__(self):
24
+ self.logger = logging.getLogger()
25
+ self.supported_classes = set()
26
+
27
+ def _get_model(self, options):
28
+ raise NotImplementedError
29
+
30
+ def _normalize_class_name(self, n):
31
+ return n
32
+
33
+ def validate_classes(self, c):
34
+ pass
35
+
22
36
  logging.basicConfig()
23
37
 
24
38
  logger = logging.getLogger(__name__)
@@ -42,11 +56,27 @@ class DoclingLayoutDetector(LayoutDetector):
42
56
  super().__init__()
43
57
  # Docling classes are dynamic/hierarchical, define common ones
44
58
  self.supported_classes = {
45
- 'Header', 'Footer', 'Paragraph', 'Heading', 'List', 'ListItem',
46
- 'Table', 'Figure', 'Caption', 'Footnote', 'PageNumber', 'Equation',
47
- 'Code', 'Title', 'Author', 'Abstract', 'Section', 'Unknown', 'Metadata' # Add more as needed
59
+ "Header",
60
+ "Footer",
61
+ "Paragraph",
62
+ "Heading",
63
+ "List",
64
+ "ListItem",
65
+ "Table",
66
+ "Figure",
67
+ "Caption",
68
+ "Footnote",
69
+ "PageNumber",
70
+ "Equation",
71
+ "Code",
72
+ "Title",
73
+ "Author",
74
+ "Abstract",
75
+ "Section",
76
+ "Unknown",
77
+ "Metadata", # Add more as needed
48
78
  }
49
- self._docling_document_cache = {} # Cache the output doc per image/options if needed
79
+ self._docling_document_cache = {} # Cache the output doc per image/options if needed
50
80
 
51
81
  def is_available(self) -> bool:
52
82
  """Check if docling is installed."""
@@ -55,9 +85,9 @@ class DoclingLayoutDetector(LayoutDetector):
55
85
  def _get_cache_key(self, options: BaseLayoutOptions) -> str:
56
86
  """Generate cache key based on device and potentially converter args."""
57
87
  if not isinstance(options, DoclingLayoutOptions):
58
- options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
88
+ options = DoclingLayoutOptions(device=options.device, extra_args=options.extra_args)
59
89
 
60
- device_key = str(options.device).lower() if options.device else 'default_device'
90
+ device_key = str(options.device).lower() if options.device else "default_device"
61
91
  # Include hash of extra_args if they affect model loading/converter init
62
92
  extra_args_key = hash(frozenset(options.extra_args.items()))
63
93
  return f"{self.__class__.__name__}_{device_key}_{extra_args_key}"
@@ -88,12 +118,17 @@ class DoclingLayoutDetector(LayoutDetector):
88
118
  raise RuntimeError("Docling dependency not installed.")
89
119
 
90
120
  if not isinstance(options, DoclingLayoutOptions):
91
- self.logger.warning("Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults.")
92
- options = DoclingLayoutOptions(
93
- confidence=options.confidence, classes=options.classes,
94
- exclude_classes=options.exclude_classes, device=options.device,
95
- extra_args=options.extra_args, verbose=options.extra_args.get('verbose', False)
96
- )
121
+ self.logger.warning(
122
+ "Received BaseLayoutOptions, expected DoclingLayoutOptions. Using defaults."
123
+ )
124
+ options = DoclingLayoutOptions(
125
+ confidence=options.confidence,
126
+ classes=options.classes,
127
+ exclude_classes=options.exclude_classes,
128
+ device=options.device,
129
+ extra_args=options.extra_args,
130
+ verbose=options.extra_args.get("verbose", False),
131
+ )
97
132
 
98
133
  # Validate classes before proceeding (note: Docling classes are case-sensitive)
99
134
  # self.validate_classes(options.classes or []) # Validation might be tricky due to case sensitivity
@@ -105,18 +140,20 @@ class DoclingLayoutDetector(LayoutDetector):
105
140
 
106
141
  # Docling convert method requires an image path. Save temp file.
107
142
  detections = []
108
- docling_doc = None # To store the result
143
+ docling_doc = None # To store the result
109
144
  with tempfile.TemporaryDirectory() as temp_dir:
110
145
  temp_image_path = os.path.join(temp_dir, f"docling_input_{os.getpid()}.png")
111
146
  try:
112
- self.logger.debug(f"Saving temporary image for Docling detector to: {temp_image_path}")
113
- image.convert("RGB").save(temp_image_path) # Ensure RGB
147
+ self.logger.debug(
148
+ f"Saving temporary image for Docling detector to: {temp_image_path}"
149
+ )
150
+ image.convert("RGB").save(temp_image_path) # Ensure RGB
114
151
 
115
152
  # Convert the document using Docling's DocumentConverter
116
153
  self.logger.debug("Running Docling conversion...")
117
154
  # Docling convert returns a Result object with a 'document' attribute
118
155
  result = converter.convert(temp_image_path)
119
- docling_doc = result.document # Store the DoclingDocument
156
+ docling_doc = result.document # Store the DoclingDocument
120
157
  self.logger.info(f"Docling conversion complete.")
121
158
 
122
159
  # Convert Docling document to our detection format
@@ -124,12 +161,14 @@ class DoclingLayoutDetector(LayoutDetector):
124
161
 
125
162
  except Exception as e:
126
163
  self.logger.error(f"Error during Docling detection: {e}", exc_info=True)
127
- raise # Re-raise the exception
164
+ raise # Re-raise the exception
128
165
  finally:
129
- # Ensure temp file is removed
130
- if os.path.exists(temp_image_path):
131
- try: os.remove(temp_image_path)
132
- except OSError as e_rm: self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
166
+ # Ensure temp file is removed
167
+ if os.path.exists(temp_image_path):
168
+ try:
169
+ os.remove(temp_image_path)
170
+ except OSError as e_rm:
171
+ self.logger.warning(f"Could not remove temp file {temp_image_path}: {e_rm}")
133
172
 
134
173
  # Cache the docling document if needed elsewhere (maybe associate with page?)
135
174
  # self._docling_document_cache[image_hash] = docling_doc # Needs a way to key this
@@ -137,26 +176,37 @@ class DoclingLayoutDetector(LayoutDetector):
137
176
  self.logger.info(f"Docling detected {len(detections)} layout elements matching criteria.")
138
177
  return detections
139
178
 
140
- def _convert_docling_to_detections(self, doc, options: DoclingLayoutOptions) -> List[Dict[str, Any]]:
179
+ def _convert_docling_to_detections(
180
+ self, doc, options: DoclingLayoutOptions
181
+ ) -> List[Dict[str, Any]]:
141
182
  """Convert a Docling document to our standard detection format."""
142
- if not doc or not hasattr(doc, 'pages') or not doc.pages:
183
+ if not doc or not hasattr(doc, "pages") or not doc.pages:
143
184
  self.logger.warning("Invalid or empty Docling document for conversion.")
144
185
  return []
145
186
 
146
187
  detections = []
147
- id_to_detection_index = {} # Map Docling ID to index in detections list
188
+ id_to_detection_index = {} # Map Docling ID to index in detections list
148
189
 
149
190
  # Prepare normalized class filters once
150
- normalized_classes_req = {self._normalize_class_name(c) for c in options.classes} if options.classes else None
151
- normalized_classes_excl = {self._normalize_class_name(c) for c in options.exclude_classes} if options.exclude_classes else set()
191
+ normalized_classes_req = (
192
+ {self._normalize_class_name(c) for c in options.classes} if options.classes else None
193
+ )
194
+ normalized_classes_excl = (
195
+ {self._normalize_class_name(c) for c in options.exclude_classes}
196
+ if options.exclude_classes
197
+ else set()
198
+ )
152
199
 
153
200
  # --- Iterate through elements using Docling's structure ---
154
201
  # This requires traversing the hierarchy (e.g., doc.body.children)
155
202
  # or iterating through specific lists like doc.texts, doc.tables etc.
156
203
  elements_to_process = []
157
- if hasattr(doc, 'texts'): elements_to_process.extend(doc.texts)
158
- if hasattr(doc, 'tables'): elements_to_process.extend(doc.tables)
159
- if hasattr(doc, 'pictures'): elements_to_process.extend(doc.pictures)
204
+ if hasattr(doc, "texts"):
205
+ elements_to_process.extend(doc.texts)
206
+ if hasattr(doc, "tables"):
207
+ elements_to_process.extend(doc.tables)
208
+ if hasattr(doc, "pictures"):
209
+ elements_to_process.extend(doc.pictures)
160
210
  # Add other element types from DoclingDocument as needed
161
211
 
162
212
  self.logger.debug(f"Converting {len(elements_to_process)} Docling elements...")
@@ -164,16 +214,19 @@ class DoclingLayoutDetector(LayoutDetector):
164
214
  for elem in elements_to_process:
165
215
  try:
166
216
  # Get Provenance (bbox and page number)
167
- if not hasattr(elem, 'prov') or not elem.prov: continue
168
- prov = elem.prov[0] # Use first provenance
169
- if not hasattr(prov, 'bbox') or not prov.bbox: continue
217
+ if not hasattr(elem, "prov") or not elem.prov:
218
+ continue
219
+ prov = elem.prov[0] # Use first provenance
220
+ if not hasattr(prov, "bbox") or not prov.bbox:
221
+ continue
170
222
  bbox = prov.bbox
171
223
  page_no = prov.page_no
172
224
 
173
225
  # Get Page Dimensions (crucial for coordinate conversion)
174
- if not hasattr(doc.pages.get(page_no), 'size'): continue
226
+ if not hasattr(doc.pages.get(page_no), "size"):
227
+ continue
175
228
  page_height = doc.pages[page_no].size.height
176
- page_width = doc.pages[page_no].size.width # Needed? Bbox seems absolute
229
+ page_width = doc.pages[page_no].size.width # Needed? Bbox seems absolute
177
230
 
178
231
  # Convert coordinates from Docling's system (often bottom-left origin)
179
232
  # to standard top-left origin (0,0 at top-left)
@@ -182,46 +235,51 @@ class DoclingLayoutDetector(LayoutDetector):
182
235
  x1 = float(bbox.r)
183
236
  # Convert y: top_y = page_height - bottom_left_t
184
237
  # bottom_y = page_height - bottom_left_b
185
- y0 = float(page_height - bbox.t) # Top y
186
- y1 = float(page_height - bbox.b) # Bottom y
238
+ y0 = float(page_height - bbox.t) # Top y
239
+ y1 = float(page_height - bbox.b) # Bottom y
187
240
 
188
241
  # Ensure y0 < y1
189
- if y0 > y1: y0, y1 = y1, y0
242
+ if y0 > y1:
243
+ y0, y1 = y1, y0
190
244
  # Ensure x0 < x1
191
- if x0 > x1: x0, x1 = x1, x0
245
+ if x0 > x1:
246
+ x0, x1 = x1, x0
192
247
 
193
248
  # Get Class Label
194
- label_orig = str(getattr(elem, 'label', 'Unknown')) # Default if no label
249
+ label_orig = str(getattr(elem, "label", "Unknown")) # Default if no label
195
250
  normalized_label = self._normalize_class_name(label_orig)
196
251
 
197
252
  # Apply Class Filtering
198
- if normalized_classes_req and normalized_label not in normalized_classes_req: continue
199
- if normalized_label in normalized_classes_excl: continue
253
+ if normalized_classes_req and normalized_label not in normalized_classes_req:
254
+ continue
255
+ if normalized_label in normalized_classes_excl:
256
+ continue
200
257
 
201
258
  # Get Confidence (Docling often doesn't provide per-element confidence)
202
- confidence = getattr(elem, 'confidence', 0.95) # Assign default confidence
203
- if confidence < options.confidence: continue # Apply confidence threshold
259
+ confidence = getattr(elem, "confidence", 0.95) # Assign default confidence
260
+ if confidence < options.confidence:
261
+ continue # Apply confidence threshold
204
262
 
205
263
  # Get Text Content
206
- text_content = getattr(elem, 'text', None)
264
+ text_content = getattr(elem, "text", None)
207
265
 
208
266
  # Get IDs for hierarchy
209
- docling_id = getattr(elem, 'self_ref', None)
210
- parent_id_obj = getattr(elem, 'parent', None)
211
- parent_id = getattr(parent_id_obj, 'self_ref', None) if parent_id_obj else None
267
+ docling_id = getattr(elem, "self_ref", None)
268
+ parent_id_obj = getattr(elem, "parent", None)
269
+ parent_id = getattr(parent_id_obj, "self_ref", None) if parent_id_obj else None
212
270
 
213
271
  # Create Detection Dictionary
214
272
  detection = {
215
- 'bbox': (x0, y0, x1, y1),
216
- 'class': label_orig,
217
- 'normalized_class': normalized_label,
218
- 'confidence': confidence,
219
- 'text': text_content,
220
- 'docling_id': docling_id,
221
- 'parent_id': parent_id,
222
- 'page_number': page_no, # Add page number if useful
223
- 'source': 'layout',
224
- 'model': 'docling'
273
+ "bbox": (x0, y0, x1, y1),
274
+ "class": label_orig,
275
+ "normalized_class": normalized_label,
276
+ "confidence": confidence,
277
+ "text": text_content,
278
+ "docling_id": docling_id,
279
+ "parent_id": parent_id,
280
+ "page_number": page_no, # Add page number if useful
281
+ "source": "layout",
282
+ "model": "docling",
225
283
  }
226
284
  detections.append(detection)
227
285
 
@@ -229,8 +287,8 @@ class DoclingLayoutDetector(LayoutDetector):
229
287
  # if docling_id: id_to_detection_index[docling_id] = len(detections) - 1
230
288
 
231
289
  except Exception as conv_e:
232
- self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
233
- continue
290
+ self.logger.warning(f"Could not convert Docling element: {elem}. Error: {conv_e}")
291
+ continue
234
292
 
235
293
  return detections
236
294
 
@@ -241,7 +299,8 @@ class DoclingLayoutDetector(LayoutDetector):
241
299
  """
242
300
  # This requires caching the doc based on image/options or re-running.
243
301
  # For simplicity, let's just re-run detect if needed.
244
- self.logger.warning("get_docling_document: Re-running detection to ensure document is generated.")
245
- self.detect(image, options) # Run detect to populate internal doc
246
- return getattr(self, '_docling_document', None) # Return the stored doc
247
-
302
+ self.logger.warning(
303
+ "get_docling_document: Re-running detection to ensure document is generated."
304
+ )
305
+ self.detect(image, options) # Run detect to populate internal doc
306
+ return getattr(self, "_docling_document", None) # Return the stored doc