natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
natural_pdf/collections/pdf_collection.py CHANGED

```diff
@@ -1,39 +1,50 @@
-import
+import copy  # Added for copying options
 import glob as py_glob
 import logging
-
+import os
+import re  # Added for safe path generation
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
+
 from PIL import Image
-import re  # Added for safe path generation
-import copy  # Added for copying options
 from tqdm import tqdm
 
 # Set up logger early
 logger = logging.getLogger(__name__)
 
 from natural_pdf.core.pdf import PDF
-from natural_pdf.elements.region import Region
+from natural_pdf.elements.region import Region
 
 # --- Search Imports ---
 try:
     from natural_pdf.search.search_service_protocol import (
-
-
+        Indexable,
+        SearchOptions,
+        SearchServiceProtocol,
+    )
     from natural_pdf.search.searchable_mixin import SearchableMixin
 except ImportError as e:
     logger_init = logging.getLogger(__name__)
-    logger_init.
+    logger_init.warning(
+        f"Failed to import Haystack components. Semantic search functionality disabled.",
+    )
+
     # Dummy definitions
-    class SearchableMixin:
+    class SearchableMixin:
+        pass
+
     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
 
-from natural_pdf.search.searchable_mixin import SearchableMixin
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
 
-
-
-
-
-
+
+class PDFCollection(SearchableMixin):  # Inherit from the mixin
+    def __init__(
+        self,
+        source: Union[str, Iterable[Union[str, "PDF"]]],
+        recursive: bool = True,
+        **pdf_options: Any,
+    ):
         """
         Initializes a collection of PDF documents from various sources.
 
@@ -46,27 +57,29 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
                 whether to search recursively (default: True).
             **pdf_options: Keyword arguments passed to the PDF constructor.
         """
-        self._pdfs: List[
-        self._pdf_options = pdf_options
-        self._recursive = recursive
+        self._pdfs: List["PDF"] = []
+        self._pdf_options = pdf_options  # Store options for potential slicing later
+        self._recursive = recursive  # Store setting for potential slicing
 
         # Dynamically import PDF class within methods to avoid circular import at module load time
        PDF = self._get_pdf_class()
 
-        if hasattr(source,
-
-
-
-
-
-
-
-
-
+        if hasattr(source, "__iter__") and not isinstance(source, str):
+            source_list = list(source)
+            if not source_list:
+                return  # Empty list source
+            if isinstance(source_list[0], PDF):
+                if all(isinstance(item, PDF) for item in source_list):
+                    self._pdfs = source_list  # Direct assignment
+                    # Don't adopt search context anymore
+                    return
+                else:
+                    raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+            # If it's an iterable but not PDFs, fall through to resolve sources
 
         # Resolve string, iterable of strings, or single string source to paths/URLs
         resolved_paths_or_urls = self._resolve_sources_to_paths(source)
-        self._initialize_pdfs(resolved_paths_or_urls, PDF)
+        self._initialize_pdfs(resolved_paths_or_urls, PDF)  # Pass PDF class
 
         self._iter_index = 0
 
@@ -79,15 +92,21 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
         try:
             # Import needs to resolve path correctly
             from natural_pdf.core.pdf import PDF
+
             return PDF
         except ImportError as e:
-            logger.error(
+            logger.error(
+                "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
+            )
             raise ImportError("PDF class is required but could not be imported.") from e
 
     # --- Internal Helpers ---
 
-    def _is_url(self, s: str) -> bool:
-
+    def _is_url(self, s: str) -> bool:
+        return s.startswith(("http://", "https://"))
+
+    def _has_glob_magic(self, s: str) -> bool:
+        return py_glob.has_magic(s)
 
     def _execute_glob(self, pattern: str) -> Set[str]:
         """Glob for paths and return a set of valid PDF paths."""
@@ -96,10 +115,10 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
             # Use iglob for potentially large directories/matches
             paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
             for path_str in paths_iter:
-
-
-
-
+                # Use Path object for easier checking
+                p = Path(path_str)
+                if p.is_file() and p.suffix.lower() == ".pdf":
+                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
         except Exception as e:
             logger.error(f"Error processing glob pattern '{pattern}': {e}")
         return found_paths
@@ -111,33 +130,37 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
 
         if isinstance(source, str):
             sources_to_process.append(source)
-        elif hasattr(source,
+        elif hasattr(source, "__iter__"):
             sources_to_process.extend(list(source))
-        else:
-
+        else:  # Should not happen based on __init__ checks, but safeguard
+            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
 
         for item in sources_to_process:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if not isinstance(item, str):
+                logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                continue
+
+            item_path = Path(item)
+
+            if self._is_url(item):
+                final_paths.add(item)  # Add URL directly
+            elif self._has_glob_magic(item):
+                glob_results = self._execute_glob(item)
+                final_paths.update(glob_results)
+            elif item_path.is_dir():
+                # Use glob to find PDFs in directory, respecting recursive flag
+                dir_pattern = (
+                    str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                )
+                dir_glob_results = self._execute_glob(dir_pattern)
+                final_paths.update(dir_glob_results)
+            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                final_paths.add(str(item_path.resolve()))  # Add resolved file path
+            else:
+                logger.warning(
+                    f"Source item ignored (not a valid URL, directory, file, or glob): {item}"
+                )
+
         return sorted(list(final_paths))
 
     def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
@@ -149,32 +172,38 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
                 pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
                 self._pdfs.append(pdf_instance)
             except Exception as e:
-
-
+                logger.error(
+                    f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False
+                )  # Keep log concise
+                failed_count += 1
         logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
 
     # --- Public Factory Class Methods (Simplified) ---
 
     @classmethod
-    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) ->
+    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a list of file paths or URLs."""
         # __init__ can handle List[str] directly now
         return cls(paths_or_urls, **pdf_options)
 
     @classmethod
-    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) ->
+    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a single glob pattern."""
         # __init__ can handle single glob string directly
         return cls(pattern, recursive=recursive, **pdf_options)
 
     @classmethod
-    def from_globs(
+    def from_globs(
+        cls, patterns: List[str], recursive: bool = True, **pdf_options: Any
+    ) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a list of glob patterns."""
-
+        # __init__ can handle List[str] containing globs directly
         return cls(patterns, recursive=recursive, **pdf_options)
 
     @classmethod
-    def from_directory(
+    def from_directory(
+        cls, directory_path: str, recursive: bool = True, **pdf_options: Any
+    ) -> "PDFCollection":
         """Creates a PDFCollection explicitly from PDF files within a directory."""
         # __init__ can handle single directory string directly
         return cls(directory_path, recursive=recursive, **pdf_options)
@@ -183,12 +212,12 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
     def __len__(self) -> int:
         return len(self._pdfs)
 
-    def __getitem__(self, key) -> Union[
+    def __getitem__(self, key) -> Union["PDF", "PDFCollection"]:
         # Use dynamic import here as well
         PDF = self._get_pdf_class()
         if isinstance(key, slice):
             # Create a new collection with the sliced PDFs and original options
-            new_collection = PDFCollection.__new__(PDFCollection)
+            new_collection = PDFCollection.__new__(PDFCollection)  # Create blank instance
             new_collection._pdfs = self._pdfs[key]
             new_collection._pdf_options = self._pdf_options
             new_collection._recursive = self._recursive
@@ -199,9 +228,9 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
             if 0 <= key < len(self._pdfs):
                 return self._pdfs[key]
             else:
-
+                raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
         else:
-
+            raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
 
     def __iter__(self):
         return iter(self._pdfs)
@@ -211,24 +240,23 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
         return f"<PDFCollection(count={len(self)})>"
 
     @property
-    def pdfs(self) -> List[
-
-
+    def pdfs(self) -> List["PDF"]:
+        """Returns the list of PDF objects held by the collection."""
+        return self._pdfs
 
-    # --- Other Methods (e.g., apply_ocr - could leverage service in future?) ---
     def apply_ocr(self, *args, **kwargs):
         PDF = self._get_pdf_class()
         # Delegate to individual PDF objects
         logger.info("Applying OCR to relevant PDFs in collection...")
         results = []
         for pdf in self._pdfs:
-
-
-
-
-
-
-
+            # We need to figure out which pages belong to which PDF if batching here
+            # For now, simpler to call on each PDF
+            try:
+                # Assume apply_ocr exists on PDF and accepts similar args
+                pdf.apply_ocr(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
         return self
 
     # --- Advanced Method Placeholders ---
@@ -237,23 +265,44 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
     def categorize(self, categories: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
-        raise NotImplementedError("categorize requires classification implementation.")
+        raise NotImplementedError("categorize requires classification implementation.")
+
+    def export_ocr_correction_task(self, output_zip_path: str, **kwargs):
+        """
+        Exports OCR results from all PDFs in this collection into a single
+        correction task package (zip file).
+
+        Args:
+            output_zip_path: The path to save the output zip file.
+            **kwargs: Additional arguments passed to create_correction_task_package
+                      (e.g., image_render_scale, overwrite).
+        """
+        try:
+            from natural_pdf.utils.packaging import create_correction_task_package
+            # Pass the collection itself (self) as the source
+            create_correction_task_package(source=self, output_zip_path=output_zip_path, **kwargs)
+        except ImportError:
+            logger.error("Failed to import 'create_correction_task_package'. Packaging utility might be missing.")
+            # Or raise
+        except Exception as e:
+            logger.error(f"Failed to export correction task for collection: {e}", exc_info=True)
+            raise  # Re-raise the exception from the utility function
 
-    # --- Mixin Required Implementation ---
+    # --- Mixin Required Implementation ---
     def get_indexable_items(self) -> Iterable[Indexable]:
         """Yields Page objects from the collection, conforming to Indexable."""
         if not self._pdfs:
-
-
+            return  # Return empty iterator if no PDFs
+
         for pdf in self._pdfs:
-
-
-
-
-
-
-
-
-
-
-
+            if not pdf.pages:  # Handle case where a PDF might have 0 pages after loading
+                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                continue
+            for page in pdf.pages:
+                # Optional: Add filtering here if needed (e.g., skip empty pages)
+                # Assuming Page object conforms to Indexable
+                # We might still want the empty page check here for efficiency
+                # if not page.extract_text(use_exclusions=False).strip():
+                #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                #     continue
+                yield page
```
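Taken together, the `pdf_collection.py` changes rework how a `PDFCollection` is constructed (single paths, URLs, glob patterns, directories, or an iterable of `PDF` objects), keep the explicit factory classmethods, and add an `export_ocr_correction_task()` helper backed by the new `natural_pdf.utils.packaging` module. The following is a minimal usage sketch based only on the signatures visible in this diff; the file paths are placeholders, and actual behavior should be checked against the released package.

```python
from natural_pdf.collections.pdf_collection import PDFCollection

# Construct from a recursive glob pattern (URLs, directories, or explicit
# lists of paths/PDF objects are also accepted by __init__).
collection = PDFCollection("reports/**/*.pdf", recursive=True)

# Equivalent explicit factories:
from_dir = PDFCollection.from_directory("reports/", recursive=True)
from_list = PDFCollection.from_paths(["a.pdf", "https://example.com/b.pdf"])

# The collection behaves like a sequence of PDF objects.
print(len(collection))     # number of successfully loaded PDFs
first_pdf = collection[0]  # integer indexing returns a single PDF
subset = collection[0:2]   # slicing returns a new PDFCollection

# OCR is delegated to each PDF; failures are logged rather than raised.
collection.apply_ocr()

# New in 0.1.6: bundle OCR results into a single correction-task zip.
collection.export_ocr_correction_task("correction_task.zip")
```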
natural_pdf/core/__init__.py CHANGED