natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,62 +1,86 @@
|
|
1
|
-
import
|
1
|
+
import copy # Add import for deepcopy
|
2
2
|
import logging
|
3
|
-
import tempfile
|
4
3
|
import os
|
5
4
|
import re
|
5
|
+
import tempfile
|
6
6
|
import urllib.request
|
7
|
-
from
|
8
|
-
from
|
9
|
-
|
7
|
+
from pathlib import Path # Added Path
|
8
|
+
from typing import ( # Added Iterable and TYPE_CHECKING
|
9
|
+
TYPE_CHECKING,
|
10
|
+
Any,
|
11
|
+
Callable,
|
12
|
+
Dict,
|
13
|
+
Iterable,
|
14
|
+
List,
|
15
|
+
Optional,
|
16
|
+
Tuple,
|
17
|
+
Type,
|
18
|
+
Union,
|
19
|
+
)
|
20
|
+
|
21
|
+
import pdfplumber
|
10
22
|
from PIL import Image
|
11
23
|
|
24
|
+
from natural_pdf.analyzers.layout.layout_manager import ( # Import the new LayoutManager
|
25
|
+
LayoutManager,
|
26
|
+
)
|
27
|
+
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
12
28
|
from natural_pdf.core.page import Page
|
13
|
-
from natural_pdf.selectors.parser import parse_selector
|
14
29
|
from natural_pdf.elements.collections import ElementCollection
|
15
30
|
from natural_pdf.elements.region import Region
|
16
31
|
from natural_pdf.ocr import OCRManager, OCROptions
|
17
|
-
from natural_pdf.
|
18
|
-
from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
|
32
|
+
from natural_pdf.selectors.parser import parse_selector
|
19
33
|
|
20
34
|
# Import the flag directly - this should always work
|
21
35
|
|
22
36
|
# --- Add Search Service Imports (needed for new methods) ---
|
23
37
|
try:
|
38
|
+
from typing import Any as TypingAny # Import Any if not already
|
39
|
+
|
40
|
+
from natural_pdf.search import TextSearchOptions # Keep for ask default
|
24
41
|
from natural_pdf.search import (
|
25
|
-
|
26
|
-
SearchServiceProtocol,
|
42
|
+
BaseSearchOptions,
|
27
43
|
SearchOptions,
|
28
|
-
|
29
|
-
|
44
|
+
SearchServiceProtocol,
|
45
|
+
get_search_service,
|
30
46
|
)
|
31
|
-
from typing import Any as TypingAny # Import Any if not already
|
32
47
|
except ImportError:
|
33
48
|
# Define dummies if needed for type hints within the class
|
34
49
|
SearchServiceProtocol = object
|
35
50
|
SearchOptions, TextSearchOptions, BaseSearchOptions = object, object, object
|
36
51
|
TypingAny = object
|
52
|
+
|
37
53
|
# Dummy factory needed for default arg in methods
|
38
54
|
def get_search_service(**kwargs) -> SearchServiceProtocol:
|
39
|
-
raise ImportError(
|
55
|
+
raise ImportError(
|
56
|
+
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
57
|
+
)
|
58
|
+
|
40
59
|
|
41
60
|
# --- End Search Service Imports ---
|
42
61
|
|
43
62
|
# Set up logger early
|
44
63
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
45
64
|
|
65
|
+
|
46
66
|
class PDF:
|
47
67
|
"""
|
48
68
|
Enhanced PDF wrapper built on top of pdfplumber.
|
49
|
-
|
69
|
+
|
50
70
|
This class provides a fluent interface for working with PDF documents,
|
51
71
|
with improved selection, navigation, and extraction capabilities.
|
52
72
|
"""
|
53
|
-
|
54
|
-
def __init__(
|
55
|
-
|
56
|
-
|
73
|
+
|
74
|
+
def __init__(
|
75
|
+
self,
|
76
|
+
path_or_url: str,
|
77
|
+
reading_order: bool = True,
|
78
|
+
font_attrs: Optional[List[str]] = None,
|
79
|
+
keep_spaces: bool = True,
|
80
|
+
):
|
57
81
|
"""
|
58
82
|
Initialize the enhanced PDF object.
|
59
|
-
|
83
|
+
|
60
84
|
Args:
|
61
85
|
path_or_url: Path to the PDF file or a URL to a PDF
|
62
86
|
reading_order: Whether to use natural reading order
|
@@ -69,30 +93,30 @@ class PDF:
|
|
69
93
|
False: Break text at spaces, each word is separate (legacy behavior)
|
70
94
|
"""
|
71
95
|
# Check if the input is a URL
|
72
|
-
is_url = path_or_url.startswith(
|
73
|
-
|
96
|
+
is_url = path_or_url.startswith("http://") or path_or_url.startswith("https://")
|
97
|
+
|
74
98
|
# Initialize path-related attributes
|
75
99
|
self._original_path = path_or_url
|
76
100
|
self._temp_file = None
|
77
|
-
self._resolved_path = None
|
101
|
+
self._resolved_path = None # Store the actual path used by pdfplumber
|
78
102
|
|
79
103
|
if is_url:
|
80
104
|
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
81
105
|
try:
|
82
106
|
# Create a temporary file to store the downloaded PDF
|
83
|
-
self._temp_file = tempfile.NamedTemporaryFile(suffix=
|
84
|
-
|
107
|
+
self._temp_file = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
|
108
|
+
|
85
109
|
# Download the PDF
|
86
110
|
with urllib.request.urlopen(path_or_url) as response:
|
87
111
|
self._temp_file.write(response.read())
|
88
112
|
self._temp_file.flush()
|
89
113
|
self._temp_file.close()
|
90
|
-
|
114
|
+
|
91
115
|
# Use the temporary file path
|
92
116
|
self._resolved_path = self._temp_file.name
|
93
117
|
logger.info(f"PDF downloaded to temporary file: {self._resolved_path}")
|
94
118
|
except Exception as e:
|
95
|
-
if self._temp_file and hasattr(self._temp_file,
|
119
|
+
if self._temp_file and hasattr(self._temp_file, "name"):
|
96
120
|
try:
|
97
121
|
os.unlink(self._temp_file.name)
|
98
122
|
except:
|
@@ -104,40 +128,46 @@ class PDF:
|
|
104
128
|
self._resolved_path = path_or_url
|
105
129
|
|
106
130
|
logger.info(f"Initializing PDF from {self._resolved_path}")
|
107
|
-
logger.debug(
|
108
|
-
|
131
|
+
logger.debug(
|
132
|
+
f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}"
|
133
|
+
)
|
134
|
+
|
109
135
|
try:
|
110
136
|
self._pdf = pdfplumber.open(self._resolved_path)
|
111
137
|
except Exception as e:
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
138
|
+
logger.error(
|
139
|
+
f"Failed to open PDF with pdfplumber: {self._resolved_path}. Error: {e}",
|
140
|
+
exc_info=True,
|
141
|
+
)
|
142
|
+
# Clean up temp file if creation failed
|
143
|
+
self.close()
|
144
|
+
raise IOError(f"Failed to open PDF file/URL: {path_or_url}") from e
|
116
145
|
|
117
|
-
self._path = self._resolved_path
|
118
|
-
self.path = self._resolved_path
|
119
|
-
self.source_path = self._original_path
|
146
|
+
self._path = self._resolved_path # Keep original path too?
|
147
|
+
self.path = self._resolved_path # Public attribute for the resolved path
|
148
|
+
self.source_path = self._original_path # Public attribute for the user-provided path/URL
|
120
149
|
|
121
150
|
self._reading_order = reading_order
|
122
|
-
self._config = {
|
123
|
-
'keep_spaces': keep_spaces
|
124
|
-
}
|
151
|
+
self._config = {"keep_spaces": keep_spaces}
|
125
152
|
|
126
153
|
self._font_attrs = font_attrs # Store the font attribute configuration
|
127
154
|
|
128
155
|
# Initialize Managers and Services (conditionally available)
|
129
156
|
self._ocr_manager = OCRManager() if OCRManager else None
|
130
157
|
self._layout_manager = LayoutManager() if LayoutManager else None
|
131
|
-
self.highlighter = HighlightingService(self)
|
158
|
+
self.highlighter = HighlightingService(self)
|
132
159
|
|
133
160
|
# Initialize pages last, passing necessary refs
|
134
|
-
self._pages = [
|
161
|
+
self._pages = [
|
162
|
+
Page(p, parent=self, index=i, font_attrs=font_attrs)
|
163
|
+
for i, p in enumerate(self._pdf.pages)
|
164
|
+
]
|
135
165
|
|
136
166
|
# Other state
|
137
167
|
self._element_cache = {}
|
138
168
|
self._exclusions = [] # List to store exclusion functions/regions
|
139
169
|
self._regions = [] # List to store region functions/definitions
|
140
|
-
|
170
|
+
|
141
171
|
logger.info("Initialized HighlightingService.")
|
142
172
|
logger.info(f"PDF '{self.source_path}' initialized with {len(self._pages)} pages.")
|
143
173
|
|
@@ -147,45 +177,48 @@ class PDF:
|
|
147
177
|
return self._pdf.metadata
|
148
178
|
|
149
179
|
@property
|
150
|
-
def pages(self) ->
|
180
|
+
def pages(self) -> "PageCollection":
|
151
181
|
"""Access pages as a PageCollection object."""
|
152
182
|
from natural_pdf.elements.collections import PageCollection
|
183
|
+
|
153
184
|
# Ensure _pages is initialized
|
154
|
-
if not hasattr(self,
|
155
|
-
|
185
|
+
if not hasattr(self, "_pages"):
|
186
|
+
raise AttributeError("PDF pages not yet initialized.")
|
156
187
|
return PageCollection(self._pages)
|
157
|
-
|
158
|
-
def clear_exclusions(self) ->
|
188
|
+
|
189
|
+
def clear_exclusions(self) -> "PDF":
|
159
190
|
"""
|
160
191
|
Clear all exclusion functions from the PDF.
|
161
|
-
|
192
|
+
|
162
193
|
Returns:
|
163
194
|
Self for method chaining
|
164
195
|
"""
|
165
196
|
# Ensure _pages is initialized
|
166
|
-
if not hasattr(self,
|
167
|
-
|
197
|
+
if not hasattr(self, "_pages"):
|
198
|
+
raise AttributeError("PDF pages not yet initialized.")
|
168
199
|
|
169
200
|
self._exclusions = []
|
170
201
|
# Also clear from pages
|
171
202
|
for page in self._pages:
|
172
203
|
page.clear_exclusions()
|
173
204
|
return self
|
174
|
-
|
175
|
-
def add_exclusion(
|
205
|
+
|
206
|
+
def add_exclusion(
|
207
|
+
self, exclusion_func: Callable[["Page"], Optional[Region]], label: str = None
|
208
|
+
) -> "PDF":
|
176
209
|
"""
|
177
210
|
Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
|
178
|
-
|
211
|
+
|
179
212
|
Args:
|
180
213
|
exclusion_func: A function that takes a Page and returns a Region to exclude, or None.
|
181
214
|
label: Optional label for this exclusion
|
182
|
-
|
215
|
+
|
183
216
|
Returns:
|
184
217
|
Self for method chaining
|
185
218
|
"""
|
186
219
|
# Ensure _pages is initialized
|
187
|
-
if not hasattr(self,
|
188
|
-
|
220
|
+
if not hasattr(self, "_pages"):
|
221
|
+
raise AttributeError("PDF pages not yet initialized.")
|
189
222
|
|
190
223
|
# Store exclusion with its label at PDF level
|
191
224
|
exclusion_data = (exclusion_func, label)
|
@@ -198,16 +231,16 @@ class PDF:
|
|
198
231
|
|
199
232
|
return self
|
200
233
|
|
201
|
-
def
|
234
|
+
def apply_ocr(
|
202
235
|
self,
|
203
236
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
204
237
|
engine: Optional[str] = None,
|
205
|
-
options: Optional[
|
238
|
+
options: Optional["OCROptions"] = None,
|
206
239
|
languages: Optional[List[str]] = None,
|
207
240
|
min_confidence: Optional[float] = None,
|
208
241
|
device: Optional[str] = None,
|
209
242
|
# Add other simple mode args if needed
|
210
|
-
) ->
|
243
|
+
) -> "PDF":
|
211
244
|
"""
|
212
245
|
Applies OCR to specified pages (or all pages) of the PDF using batch processing.
|
213
246
|
|
@@ -234,9 +267,9 @@ class PDF:
|
|
234
267
|
RuntimeError: If the OCRManager or selected engine is not available.
|
235
268
|
"""
|
236
269
|
if not self._ocr_manager:
|
237
|
-
|
238
|
-
|
239
|
-
|
270
|
+
logger.error("OCRManager not available. Cannot apply OCR.")
|
271
|
+
# Or raise RuntimeError("OCRManager not initialized.")
|
272
|
+
return self
|
240
273
|
|
241
274
|
# --- Determine Target Pages ---
|
242
275
|
target_pages: List[Page] = []
|
@@ -244,15 +277,17 @@ class PDF:
|
|
244
277
|
target_pages = self._pages
|
245
278
|
elif isinstance(pages, slice):
|
246
279
|
target_pages = self._pages[pages]
|
247
|
-
elif hasattr(pages,
|
280
|
+
elif hasattr(pages, "__iter__"): # Check if it's iterable (list, range, tuple, etc.)
|
248
281
|
try:
|
249
282
|
target_pages = [self._pages[i] for i in pages]
|
250
283
|
except IndexError:
|
251
284
|
raise ValueError("Invalid page index provided in 'pages' iterable.")
|
252
285
|
except TypeError:
|
253
|
-
|
286
|
+
raise TypeError(
|
287
|
+
"'pages' must be None, a slice, or an iterable of page indices (int)."
|
288
|
+
)
|
254
289
|
else:
|
255
|
-
|
290
|
+
raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")
|
256
291
|
|
257
292
|
if not target_pages:
|
258
293
|
logger.warning("No pages selected for OCR processing.")
|
@@ -263,33 +298,36 @@ class PDF:
|
|
263
298
|
|
264
299
|
# --- Render Images for Batch ---
|
265
300
|
images_pil: List[Image.Image] = []
|
266
|
-
page_image_map: List[Tuple[Page, Image.Image]] = []
|
301
|
+
page_image_map: List[Tuple[Page, Image.Image]] = [] # Store page and its image
|
267
302
|
logger.info(f"Rendering {len(target_pages)} pages to images...")
|
268
|
-
failed_page_num =
|
303
|
+
failed_page_num = "unknown" # Keep track of potentially failing page
|
269
304
|
try:
|
270
|
-
ocr_scale = getattr(self,
|
305
|
+
ocr_scale = getattr(self, "_config", {}).get("ocr_image_scale", 2.0)
|
271
306
|
for i, page in enumerate(target_pages):
|
272
|
-
failed_page_num = page.number
|
307
|
+
failed_page_num = page.number # Update current page number in case of error
|
273
308
|
logger.debug(f" Rendering page {page.number} (index {page.index})...")
|
274
309
|
# Use page.to_image but ensure highlights are off for OCR base image
|
275
310
|
img = page.to_image(scale=ocr_scale, include_highlights=False)
|
276
311
|
images_pil.append(img)
|
277
|
-
page_image_map.append((page, img))
|
312
|
+
page_image_map.append((page, img)) # Store pair
|
278
313
|
except Exception as e:
|
279
314
|
logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
|
280
315
|
raise RuntimeError(f"Failed to render page {failed_page_num} for OCR.") from e
|
281
316
|
|
282
317
|
if not images_pil:
|
283
|
-
|
284
|
-
|
318
|
+
logger.error("No images were successfully rendered for batch OCR.")
|
319
|
+
return self
|
285
320
|
|
286
321
|
# --- Prepare Arguments for Manager ---
|
287
|
-
manager_args = {
|
322
|
+
manager_args = {"images": images_pil, "options": options, "engine": engine}
|
288
323
|
simple_args = {}
|
289
|
-
if languages is not None:
|
290
|
-
|
291
|
-
if
|
292
|
-
|
324
|
+
if languages is not None:
|
325
|
+
simple_args["languages"] = languages
|
326
|
+
if min_confidence is not None:
|
327
|
+
simple_args["min_confidence"] = min_confidence
|
328
|
+
if device is not None:
|
329
|
+
simple_args["device"] = device
|
330
|
+
manager_args.update(simple_args) # Add simple args if options not provided
|
293
331
|
|
294
332
|
# --- Call OCR Manager for Batch Processing ---
|
295
333
|
logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
|
@@ -298,17 +336,19 @@ class PDF:
|
|
298
336
|
batch_results = self._ocr_manager.apply_ocr(**manager_args)
|
299
337
|
|
300
338
|
if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
|
301
|
-
logger.error(
|
302
|
-
|
303
|
-
|
339
|
+
logger.error(
|
340
|
+
f"OCR Manager returned unexpected result format or length for batch processing. "
|
341
|
+
f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
|
342
|
+
f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}."
|
343
|
+
)
|
304
344
|
# Handle error - maybe return early or try processing valid parts?
|
305
|
-
return self
|
345
|
+
return self # Return self without adding elements
|
306
346
|
|
307
347
|
logger.info("OCR Manager batch processing complete.")
|
308
348
|
|
309
349
|
except Exception as e:
|
310
|
-
|
311
|
-
|
350
|
+
logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
|
351
|
+
return self # Return self without adding elements
|
312
352
|
|
313
353
|
# --- Distribute Results and Add Elements to Pages ---
|
314
354
|
logger.info("Adding OCR results to respective pages...")
|
@@ -316,45 +356,55 @@ class PDF:
|
|
316
356
|
for i, (page, img) in enumerate(page_image_map):
|
317
357
|
results_for_page = batch_results[i]
|
318
358
|
if not isinstance(results_for_page, list):
|
319
|
-
|
320
|
-
|
359
|
+
logger.warning(
|
360
|
+
f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}"
|
361
|
+
)
|
362
|
+
continue
|
321
363
|
|
322
364
|
logger.debug(f" Processing {len(results_for_page)} results for page {page.number}...")
|
323
365
|
# Use the page's element manager to create elements from its results
|
324
366
|
# Changed from page._create_text_elements_from_ocr to use element_mgr
|
325
367
|
try:
|
326
368
|
# Calculate scale factors based on rendered image vs page dims
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
369
|
+
img_scale_x = page.width / img.width if img.width > 0 else 1
|
370
|
+
img_scale_y = page.height / img.height if img.height > 0 else 1
|
371
|
+
elements = page._element_mgr.create_text_elements_from_ocr(
|
372
|
+
results_for_page, img_scale_x, img_scale_y
|
373
|
+
)
|
374
|
+
|
375
|
+
if elements:
|
376
|
+
# Note: element_mgr.create_text_elements_from_ocr already adds them
|
377
|
+
total_elements_added += len(elements)
|
378
|
+
logger.debug(f" Added {len(elements)} OCR TextElements to page {page.number}.")
|
379
|
+
else:
|
380
|
+
logger.debug(f" No valid TextElements created for page {page.number}.")
|
337
381
|
except Exception as e:
|
338
|
-
|
339
|
-
|
382
|
+
logger.error(
|
383
|
+
f" Error adding OCR elements to page {page.number}: {e}", exc_info=True
|
384
|
+
)
|
385
|
+
# Continue to next page
|
340
386
|
|
341
|
-
logger.info(
|
387
|
+
logger.info(
|
388
|
+
f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}"
|
389
|
+
)
|
342
390
|
return self
|
343
|
-
|
344
|
-
def add_region(
|
391
|
+
|
392
|
+
def add_region(
|
393
|
+
self, region_func: Callable[["Page"], Optional[Region]], name: str = None
|
394
|
+
) -> "PDF":
|
345
395
|
"""
|
346
396
|
Add a region function to the PDF. This creates regions on all pages using the provided function.
|
347
|
-
|
397
|
+
|
348
398
|
Args:
|
349
399
|
region_func: A function that takes a Page and returns a Region, or None.
|
350
400
|
name: Optional name for the region
|
351
|
-
|
401
|
+
|
352
402
|
Returns:
|
353
403
|
Self for method chaining
|
354
404
|
"""
|
355
405
|
# Ensure _pages is initialized
|
356
|
-
if not hasattr(self,
|
357
|
-
|
406
|
+
if not hasattr(self, "_pages"):
|
407
|
+
raise AttributeError("PDF pages not yet initialized.")
|
358
408
|
|
359
409
|
# Store region with its name at PDF level
|
360
410
|
region_data = (region_func, name)
|
@@ -367,93 +417,108 @@ class PDF:
|
|
367
417
|
region_instance = region_func(page)
|
368
418
|
if region_instance and isinstance(region_instance, Region):
|
369
419
|
# If a valid region is returned, add it to the page
|
370
|
-
page.add_region(region_instance, name=name, source=
|
420
|
+
page.add_region(region_instance, name=name, source="named")
|
371
421
|
elif region_instance is not None:
|
372
|
-
|
422
|
+
logger.warning(
|
423
|
+
f"Region function did not return a valid Region object for page {page.number}. Got: {type(region_instance)}"
|
424
|
+
)
|
373
425
|
except Exception as e:
|
374
|
-
logger.error(
|
426
|
+
logger.error(
|
427
|
+
f"Error executing or adding region function for page {page.number}: {e}",
|
428
|
+
exc_info=True,
|
429
|
+
)
|
375
430
|
|
376
431
|
return self
|
377
|
-
|
378
|
-
def find(
|
432
|
+
|
433
|
+
def find(
|
434
|
+
self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
|
435
|
+
) -> Optional[Any]:
|
379
436
|
"""
|
380
437
|
Find the first element matching the selector.
|
381
|
-
|
438
|
+
|
382
439
|
Args:
|
383
440
|
selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
|
384
441
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
385
442
|
regex: Whether to use regex for text search in :contains (default: False)
|
386
443
|
case: Whether to do case-sensitive text search (default: True)
|
387
444
|
**kwargs: Additional filter parameters
|
388
|
-
|
445
|
+
|
389
446
|
Returns:
|
390
447
|
Element object or None if not found
|
391
448
|
"""
|
392
449
|
# Ensure _pages is initialized
|
393
|
-
if not hasattr(self,
|
394
|
-
|
450
|
+
if not hasattr(self, "_pages"):
|
451
|
+
raise AttributeError("PDF pages not yet initialized.")
|
395
452
|
|
396
453
|
selector_obj = parse_selector(selector)
|
397
|
-
|
454
|
+
|
398
455
|
# Pass regex and case flags to selector function
|
399
|
-
kwargs[
|
400
|
-
kwargs[
|
401
|
-
|
402
|
-
results = self._apply_selector(
|
456
|
+
kwargs["regex"] = regex
|
457
|
+
kwargs["case"] = case
|
458
|
+
|
459
|
+
results = self._apply_selector(
|
460
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=True, **kwargs
|
461
|
+
)
|
403
462
|
return results.first if results else None
|
404
|
-
|
405
|
-
def find_all(
|
463
|
+
|
464
|
+
def find_all(
|
465
|
+
self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs
|
466
|
+
) -> ElementCollection:
|
406
467
|
"""
|
407
468
|
Find all elements matching the selector.
|
408
|
-
|
469
|
+
|
409
470
|
Args:
|
410
471
|
selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
|
411
472
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
412
473
|
regex: Whether to use regex for text search in :contains (default: False)
|
413
474
|
case: Whether to do case-sensitive text search (default: True)
|
414
475
|
**kwargs: Additional filter parameters
|
415
|
-
|
476
|
+
|
416
477
|
Returns:
|
417
478
|
ElementCollection with matching elements
|
418
479
|
"""
|
419
480
|
# Ensure _pages is initialized
|
420
|
-
if not hasattr(self,
|
421
|
-
|
481
|
+
if not hasattr(self, "_pages"):
|
482
|
+
raise AttributeError("PDF pages not yet initialized.")
|
422
483
|
|
423
484
|
selector_obj = parse_selector(selector)
|
424
|
-
|
485
|
+
|
425
486
|
# Pass regex and case flags to selector function
|
426
|
-
kwargs[
|
427
|
-
kwargs[
|
428
|
-
|
429
|
-
results = self._apply_selector(
|
487
|
+
kwargs["regex"] = regex
|
488
|
+
kwargs["case"] = case
|
489
|
+
|
490
|
+
results = self._apply_selector(
|
491
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=False, **kwargs
|
492
|
+
)
|
430
493
|
return results
|
431
|
-
|
432
|
-
def _apply_selector(
|
494
|
+
|
495
|
+
def _apply_selector(
|
496
|
+
self, selector_obj: Dict, apply_exclusions=True, first_only=False, **kwargs
|
497
|
+
) -> ElementCollection:
|
433
498
|
"""
|
434
499
|
Apply selector to PDF elements across all pages.
|
435
|
-
|
500
|
+
|
436
501
|
Args:
|
437
502
|
selector_obj: Parsed selector dictionary
|
438
503
|
apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
|
439
504
|
first_only: If True, stop searching after the first match is found.
|
440
505
|
**kwargs: Additional filter parameters
|
441
|
-
|
506
|
+
|
442
507
|
Returns:
|
443
508
|
ElementCollection of matching elements
|
444
509
|
"""
|
445
510
|
from natural_pdf.elements.collections import ElementCollection
|
446
|
-
|
511
|
+
|
447
512
|
# Determine page range to search
|
448
|
-
page_indices = kwargs.get(
|
513
|
+
page_indices = kwargs.get("pages", range(len(self._pages)))
|
449
514
|
if isinstance(page_indices, int):
|
450
515
|
page_indices = [page_indices]
|
451
516
|
elif isinstance(page_indices, slice):
|
452
517
|
page_indices = range(*page_indices.indices(len(self._pages)))
|
453
518
|
|
454
519
|
# Check for cross-page pseudo-classes (currently not supported)
|
455
|
-
for pseudo in selector_obj.get(
|
456
|
-
if pseudo.get(
|
520
|
+
for pseudo in selector_obj.get("pseudo_classes", []):
|
521
|
+
if pseudo.get("name") in ("spans", "continues"):
|
457
522
|
logger.warning("Cross-page selectors ('spans', 'continues') are not yet supported.")
|
458
523
|
return ElementCollection([])
|
459
524
|
|
@@ -464,141 +529,155 @@ class PDF:
|
|
464
529
|
page = self._pages[page_idx]
|
465
530
|
# Pass first_only down to page._apply_selector
|
466
531
|
page_elements_collection = page._apply_selector(
|
467
|
-
selector_obj,
|
468
|
-
apply_exclusions=apply_exclusions,
|
469
|
-
first_only=first_only,
|
470
|
-
**kwargs
|
532
|
+
selector_obj, apply_exclusions=apply_exclusions, first_only=first_only, **kwargs
|
471
533
|
)
|
472
534
|
if page_elements_collection:
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
535
|
+
page_elements = page_elements_collection.elements
|
536
|
+
all_elements.extend(page_elements)
|
537
|
+
# If we only need the first match overall, and we found one on this page, stop
|
538
|
+
if first_only and page_elements:
|
539
|
+
break # Stop iterating through pages
|
478
540
|
else:
|
479
|
-
|
541
|
+
logger.warning(f"Page index {page_idx} out of range (0-{len(self._pages)-1}).")
|
480
542
|
|
481
543
|
# Create a combined collection
|
482
544
|
combined = ElementCollection(all_elements)
|
483
545
|
|
484
546
|
# Sort in document order if requested and not first_only (already sorted by page)
|
485
|
-
if not first_only and kwargs.get(
|
547
|
+
if not first_only and kwargs.get("document_order", True):
|
486
548
|
# Check if elements have page, top, x0 before sorting
|
487
|
-
if all(
|
488
|
-
|
549
|
+
if all(
|
550
|
+
hasattr(el, "page") and hasattr(el, "top") and hasattr(el, "x0")
|
551
|
+
for el in combined.elements
|
552
|
+
):
|
553
|
+
combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
|
489
554
|
else:
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
555
|
+
# Elements might be Regions without inherent sorting order yet
|
556
|
+
# Attempt sorting by page index if possible
|
557
|
+
try:
|
558
|
+
combined.sort(key=lambda el: el.page.index)
|
559
|
+
except AttributeError:
|
560
|
+
logger.warning(
|
561
|
+
"Cannot sort elements in document order: Missing required attributes (e.g., page)."
|
562
|
+
)
|
496
563
|
|
497
564
|
return combined
|
498
|
-
|
499
|
-
def extract_text(
|
500
|
-
|
565
|
+
|
566
|
+
def extract_text(
|
567
|
+
self,
|
568
|
+
selector: Optional[str] = None,
|
569
|
+
preserve_whitespace=True,
|
570
|
+
use_exclusions=True,
|
571
|
+
debug_exclusions=False,
|
572
|
+
**kwargs,
|
573
|
+
) -> str:
|
501
574
|
"""
|
502
575
|
Extract text from the entire document or matching elements.
|
503
|
-
|
576
|
+
|
504
577
|
Args:
|
505
578
|
selector: Optional selector to filter elements
|
506
579
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
507
580
|
use_exclusions: Whether to apply exclusion regions (default: True)
|
508
581
|
debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
|
509
582
|
**kwargs: Additional extraction parameters
|
510
|
-
|
583
|
+
|
511
584
|
Returns:
|
512
585
|
Extracted text as string
|
513
586
|
"""
|
514
587
|
# Ensure _pages is initialized
|
515
|
-
if not hasattr(self,
|
516
|
-
|
588
|
+
if not hasattr(self, "_pages"):
|
589
|
+
raise AttributeError("PDF pages not yet initialized.")
|
517
590
|
|
518
591
|
# If selector is provided, find elements first
|
519
592
|
if selector:
|
520
593
|
elements = self.find_all(selector, apply_exclusions=use_exclusions, **kwargs)
|
521
594
|
return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)
|
522
|
-
|
595
|
+
|
523
596
|
# Otherwise extract from all pages
|
524
597
|
if debug_exclusions:
|
525
598
|
print(f"PDF: Extracting text with exclusions from {len(self.pages)} pages")
|
526
599
|
print(f"PDF: Found {len(self._exclusions)} document-level exclusions")
|
527
|
-
|
600
|
+
|
528
601
|
texts = []
|
529
602
|
for page in self.pages:
|
530
|
-
texts.append(
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
603
|
+
texts.append(
|
604
|
+
page.extract_text(
|
605
|
+
preserve_whitespace=preserve_whitespace,
|
606
|
+
use_exclusions=use_exclusions,
|
607
|
+
debug_exclusions=debug_exclusions,
|
608
|
+
**kwargs,
|
609
|
+
)
|
610
|
+
)
|
611
|
+
|
537
612
|
if debug_exclusions:
|
538
613
|
print(f"PDF: Combined {len(texts)} pages of text")
|
539
|
-
|
614
|
+
|
540
615
|
return "\n".join(texts)
|
541
|
-
|
616
|
+
|
542
617
|
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
|
543
618
|
"""
|
544
619
|
Shorthand for finding elements and extracting their text.
|
545
|
-
|
620
|
+
|
546
621
|
Args:
|
547
622
|
selector: CSS-like selector string
|
548
623
|
preserve_whitespace: Whether to keep blank characters (default: True)
|
549
624
|
**kwargs: Additional extraction parameters
|
550
|
-
|
625
|
+
|
551
626
|
Returns:
|
552
627
|
Extracted text from matching elements
|
553
628
|
"""
|
554
629
|
# Ensure _pages is initialized
|
555
|
-
if not hasattr(self,
|
556
|
-
|
557
|
-
return self.extract_text(
|
558
|
-
|
559
|
-
|
630
|
+
if not hasattr(self, "_pages"):
|
631
|
+
raise AttributeError("PDF pages not yet initialized.")
|
632
|
+
return self.extract_text(
|
633
|
+
selector, preserve_whitespace=preserve_whitespace, use_exclusions=True, **kwargs
|
634
|
+
) # apply_exclusions is handled by find_all in extract_text
|
635
|
+
|
636
|
+
def extract_tables(
|
637
|
+
self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs
|
638
|
+
) -> List[Any]:
|
560
639
|
"""
|
561
640
|
Extract tables from the document or matching elements.
|
562
|
-
|
641
|
+
|
563
642
|
Args:
|
564
643
|
selector: Optional selector to filter tables
|
565
644
|
merge_across_pages: Whether to merge tables that span across pages
|
566
645
|
**kwargs: Additional extraction parameters
|
567
|
-
|
646
|
+
|
568
647
|
Returns:
|
569
648
|
List of extracted tables
|
570
649
|
"""
|
571
650
|
# Ensure _pages is initialized
|
572
|
-
if not hasattr(self,
|
573
|
-
|
651
|
+
if not hasattr(self, "_pages"):
|
652
|
+
raise AttributeError("PDF pages not yet initialized.")
|
574
653
|
# TODO: Implement table extraction
|
575
654
|
logger.warning("PDF.extract_tables is not fully implemented yet.")
|
576
655
|
all_tables = []
|
577
656
|
for page in self.pages:
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
657
|
+
# Assuming page.extract_tables(**kwargs) exists or is added
|
658
|
+
if hasattr(page, "extract_tables"):
|
659
|
+
all_tables.extend(page.extract_tables(**kwargs))
|
660
|
+
else:
|
661
|
+
logger.debug(f"Page {page.number} does not have extract_tables method.")
|
583
662
|
# Placeholder filtering
|
584
663
|
if selector:
|
585
664
|
logger.warning("Filtering extracted tables by selector is not implemented.")
|
586
665
|
# Would need to parse selector and filter the list `all_tables`
|
587
666
|
# Placeholder merging
|
588
667
|
if merge_across_pages:
|
589
|
-
|
590
|
-
|
668
|
+
logger.warning("Merging tables across pages is not implemented.")
|
669
|
+
# Would need logic to detect and merge related tables
|
591
670
|
return all_tables
|
592
|
-
|
671
|
+
|
593
672
|
# --- New Method: save_searchable ---
|
594
|
-
def save_searchable(self, output_path: Union[str,
|
673
|
+
def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
|
595
674
|
"""
|
596
675
|
Saves the PDF with an OCR text layer, making content searchable.
|
597
676
|
|
598
677
|
Requires optional dependencies. Install with: pip install "natural-pdf[ocr-save]"
|
599
678
|
|
600
679
|
Note: OCR must have been applied to the pages beforehand
|
601
|
-
(e.g., using pdf.
|
680
|
+
(e.g., using pdf.apply_ocr()).
|
602
681
|
|
603
682
|
Args:
|
604
683
|
output_path: Path to save the searchable PDF.
|
@@ -608,15 +687,6 @@ class PDF:
|
|
608
687
|
# Import moved here, assuming it's always available now
|
609
688
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
610
689
|
|
611
|
-
# TODO: Need a reliable way for Page to signal if it has OCR elements.
|
612
|
-
# This requires adding a method/attribute to the Page class, e.g., page.has_ocr_elements()
|
613
|
-
# or checking if page.get_elements(source='ocr') returns anything.
|
614
|
-
# For now, we pass through and let the exporter handle pages without OCR elements.
|
615
|
-
# if not any(page.get_elements(source='ocr') for page in self.pages):
|
616
|
-
# logger.warning("No OCR elements found on pages. "
|
617
|
-
# "Ensure apply_ocr_to_pages() was called. "
|
618
|
-
# "Output PDF might not be searchable.")
|
619
|
-
|
620
690
|
# Convert pathlib.Path to string if necessary
|
621
691
|
output_path_str = str(output_path)
|
622
692
|
|
@@ -625,15 +695,18 @@ class PDF:
|
|
625
695
|
|
626
696
|
# --- End New Method ---
|
627
697
|
|
628
|
-
def ask(
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
698
|
+
def ask(
|
699
|
+
self,
|
700
|
+
question: str,
|
701
|
+
mode: str = "extractive",
|
702
|
+
pages: Union[int, List[int], range] = None,
|
703
|
+
min_confidence: float = 0.1,
|
704
|
+
model: str = None,
|
705
|
+
**kwargs,
|
706
|
+
) -> Dict[str, Any]:
|
634
707
|
"""
|
635
708
|
Ask a question about the document content.
|
636
|
-
|
709
|
+
|
637
710
|
Args:
|
638
711
|
question: Question to ask about the document
|
639
712
|
mode: "extractive" to extract answer from document, "generative" to generate
|
@@ -641,16 +714,16 @@ class PDF:
|
|
641
714
|
min_confidence: Minimum confidence threshold for answers
|
642
715
|
model: Optional model name for question answering
|
643
716
|
**kwargs: Additional parameters passed to the QA engine
|
644
|
-
|
717
|
+
|
645
718
|
Returns:
|
646
719
|
A dictionary containing the answer, confidence, and other metadata.
|
647
720
|
Result will have an 'answer' key containing the answer text.
|
648
721
|
"""
|
649
722
|
from natural_pdf.qa import get_qa_engine
|
650
|
-
|
723
|
+
|
651
724
|
# Initialize or get QA engine
|
652
725
|
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
653
|
-
|
726
|
+
|
654
727
|
# Determine which pages to query
|
655
728
|
if pages is None:
|
656
729
|
target_pages = list(range(len(self.pages)))
|
@@ -662,43 +735,40 @@ class PDF:
|
|
662
735
|
target_pages = pages
|
663
736
|
else:
|
664
737
|
raise ValueError(f"Invalid pages parameter: {pages}")
|
665
|
-
|
738
|
+
|
666
739
|
# Actually query each page and gather results
|
667
740
|
results = []
|
668
741
|
for page_idx in target_pages:
|
669
742
|
if 0 <= page_idx < len(self.pages):
|
670
743
|
page = self.pages[page_idx]
|
671
744
|
page_result = qa_engine.ask_pdf_page(
|
672
|
-
page=page,
|
673
|
-
question=question,
|
674
|
-
min_confidence=min_confidence,
|
675
|
-
**kwargs
|
745
|
+
page=page, question=question, min_confidence=min_confidence, **kwargs
|
676
746
|
)
|
677
|
-
|
747
|
+
|
678
748
|
# Add to results if it found an answer
|
679
749
|
if page_result and page_result.get("found", False):
|
680
750
|
results.append(page_result)
|
681
|
-
|
751
|
+
|
682
752
|
# Sort results by confidence
|
683
753
|
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
684
|
-
|
754
|
+
|
685
755
|
# Return the best result, or a default result if none found
|
686
756
|
if results:
|
687
757
|
return results[0]
|
688
758
|
else:
|
689
759
|
# Return a structure indicating no answer found
|
690
760
|
return {
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
761
|
+
"answer": None,
|
762
|
+
"confidence": 0.0,
|
763
|
+
"found": False,
|
764
|
+
"page_num": None, # Or maybe the pages searched?
|
765
|
+
"source_elements": [],
|
696
766
|
}
|
697
767
|
|
698
768
|
def search_within_index(
|
699
769
|
self,
|
700
770
|
query: Union[str, Path, Image.Image, Region],
|
701
|
-
search_service: SearchServiceProtocol,
|
771
|
+
search_service: SearchServiceProtocol, # Now required
|
702
772
|
options: Optional[SearchOptions] = None,
|
703
773
|
) -> List[Dict[str, Any]]:
|
704
774
|
"""
|
@@ -730,14 +800,16 @@ class PDF:
|
|
730
800
|
RuntimeError: For other search failures.
|
731
801
|
"""
|
732
802
|
if not search_service:
|
733
|
-
|
803
|
+
raise ValueError("A configured SearchServiceProtocol instance must be provided.")
|
734
804
|
# Optional stricter check:
|
735
805
|
# if not isinstance(search_service, SearchServiceProtocol):
|
736
806
|
# raise TypeError("Provided search_service does not conform to SearchServiceProtocol.")
|
737
807
|
|
738
808
|
# Get collection name from service for logging
|
739
|
-
collection_name = getattr(search_service,
|
740
|
-
logger.info(
|
809
|
+
collection_name = getattr(search_service, "collection_name", "<Unknown Collection>")
|
810
|
+
logger.info(
|
811
|
+
f"Searching within index '{collection_name}' (via provided service) for content from PDF '{self.path}'. Query type: {type(query).__name__}."
|
812
|
+
)
|
741
813
|
|
742
814
|
# --- 1. Get Search Service Instance --- (REMOVED - provided directly)
|
743
815
|
# service: SearchServiceProtocol
|
@@ -748,7 +820,7 @@ class PDF:
|
|
748
820
|
# factory_args = {**kwargs, 'collection_name': collection_name, 'persist': persist}
|
749
821
|
# # TODO: Pass embedding model from options/pdf config if needed?
|
750
822
|
# service = get_search_service(**factory_args)
|
751
|
-
service = search_service
|
823
|
+
service = search_service # Use validated provided service
|
752
824
|
|
753
825
|
# --- 2. Prepare Query and Options ---
|
754
826
|
query_input = query
|
@@ -757,119 +829,145 @@ class PDF:
|
|
757
829
|
|
758
830
|
# Handle Region query - extract text for now
|
759
831
|
if isinstance(query, Region):
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
832
|
+
logger.debug("Query is a Region object. Extracting text.")
|
833
|
+
if not isinstance(effective_options, TextSearchOptions):
|
834
|
+
logger.warning(
|
835
|
+
"Querying with Region image requires MultiModalSearchOptions (Not fully implemented). Falling back to text extraction."
|
836
|
+
)
|
837
|
+
query_input = query.extract_text()
|
838
|
+
if not query_input or query_input.isspace():
|
839
|
+
logger.error("Region has no extractable text for query.")
|
840
|
+
return []
|
767
841
|
|
768
842
|
# --- 3. Add Filter to Scope Search to THIS PDF ---
|
769
843
|
# Assume metadata field 'pdf_path' stores the resolved path used during indexing
|
770
844
|
pdf_scope_filter = {
|
771
|
-
|
772
|
-
|
773
|
-
|
845
|
+
"field": "pdf_path", # Or potentially "source_path" depending on indexing metadata
|
846
|
+
"operator": "eq",
|
847
|
+
"value": self.path, # Use the resolved path of this PDF instance
|
774
848
|
}
|
775
849
|
logger.debug(f"Applying filter to scope search to PDF: {pdf_scope_filter}")
|
776
850
|
|
777
851
|
# Combine with existing filters in options (if any)
|
778
852
|
if effective_options.filters:
|
779
|
-
logger.debug(
|
853
|
+
logger.debug(
|
854
|
+
f"Combining PDF scope filter with existing filters: {effective_options.filters}"
|
855
|
+
)
|
780
856
|
# Assume filters are compatible with the underlying search service
|
781
857
|
# If existing filters aren't already in an AND block, wrap them
|
782
|
-
if
|
783
|
-
|
784
|
-
|
858
|
+
if (
|
859
|
+
isinstance(effective_options.filters, dict)
|
860
|
+
and effective_options.filters.get("operator") == "AND"
|
861
|
+
):
|
862
|
+
# Already an AND block, just append the condition
|
863
|
+
effective_options.filters["conditions"].append(pdf_scope_filter)
|
785
864
|
elif isinstance(effective_options.filters, list):
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
865
|
+
# Assume list represents implicit AND conditions
|
866
|
+
effective_options.filters = {
|
867
|
+
"operator": "AND",
|
868
|
+
"conditions": effective_options.filters + [pdf_scope_filter],
|
869
|
+
}
|
870
|
+
elif isinstance(effective_options.filters, dict): # Single filter dict
|
871
|
+
effective_options.filters = {
|
872
|
+
"operator": "AND",
|
873
|
+
"conditions": [effective_options.filters, pdf_scope_filter],
|
874
|
+
}
|
790
875
|
else:
|
791
|
-
|
792
|
-
|
876
|
+
logger.warning(
|
877
|
+
f"Unsupported format for existing filters: {type(effective_options.filters)}. Overwriting with PDF scope filter."
|
878
|
+
)
|
879
|
+
effective_options.filters = pdf_scope_filter
|
793
880
|
else:
|
794
881
|
effective_options.filters = pdf_scope_filter
|
795
882
|
|
796
883
|
logger.debug(f"Final filters for service search: {effective_options.filters}")
|
797
884
|
|
798
|
-
# --- 4. Call SearchService ---
|
885
|
+
# --- 4. Call SearchService ---
|
799
886
|
try:
|
800
887
|
# Call the service's search method (no collection_name needed)
|
801
888
|
results = service.search(
|
802
889
|
query=query_input,
|
803
890
|
options=effective_options,
|
804
891
|
)
|
805
|
-
logger.info(
|
892
|
+
logger.info(
|
893
|
+
f"SearchService returned {len(results)} results scoped to PDF '{self.path}' within collection '{collection_name}'."
|
894
|
+
)
|
806
895
|
return results
|
807
896
|
except FileNotFoundError as fnf:
|
808
|
-
|
809
|
-
|
897
|
+
logger.error(
|
898
|
+
f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
|
899
|
+
)
|
900
|
+
raise # Re-raise specific error
|
810
901
|
except Exception as e:
|
811
|
-
|
812
|
-
|
902
|
+
logger.error(
|
903
|
+
f"SearchService search failed for PDF '{self.path}' in collection '{collection_name}': {e}",
|
904
|
+
exc_info=True,
|
905
|
+
)
|
906
|
+
raise RuntimeError(
|
907
|
+
f"Search within index failed for PDF '{self.path}'. See logs for details."
|
908
|
+
) from e
|
813
909
|
|
814
910
|
def __len__(self) -> int:
|
815
911
|
"""Return the number of pages in the PDF."""
|
816
912
|
# Ensure _pages is initialized
|
817
|
-
if not hasattr(self,
|
913
|
+
if not hasattr(self, "_pages"):
|
818
914
|
# Return 0 or raise error if not fully initialized? Let's return 0.
|
819
|
-
|
915
|
+
return 0
|
820
916
|
return len(self._pages)
|
821
|
-
|
822
|
-
def __getitem__(self, key) -> Union[Page,
|
917
|
+
|
918
|
+
def __getitem__(self, key) -> Union[Page, "PageCollection"]: # Return PageCollection for slice
|
823
919
|
"""Access pages by index or slice."""
|
824
920
|
# Check if self._pages has been initialized
|
825
|
-
if not hasattr(self,
|
826
|
-
|
921
|
+
if not hasattr(self, "_pages"):
|
922
|
+
raise AttributeError("PDF pages not initialized yet.")
|
827
923
|
if isinstance(key, slice):
|
828
|
-
|
829
|
-
|
830
|
-
|
924
|
+
# Return a PageCollection slice
|
925
|
+
from natural_pdf.elements.collections import PageCollection
|
926
|
+
|
927
|
+
return PageCollection(self._pages[key])
|
831
928
|
# Check index bounds before accessing
|
832
929
|
if isinstance(key, int):
|
833
930
|
if 0 <= key < len(self._pages):
|
834
|
-
|
931
|
+
return self._pages[key]
|
835
932
|
else:
|
836
|
-
|
933
|
+
raise IndexError(f"Page index {key} out of range (0-{len(self._pages)-1}).")
|
837
934
|
else:
|
838
|
-
|
839
|
-
|
935
|
+
raise TypeError(f"Page indices must be integers or slices, not {type(key)}.")
|
936
|
+
|
840
937
|
def close(self):
|
841
938
|
"""Close the underlying PDF file and clean up any temporary files."""
|
842
|
-
if hasattr(self,
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
939
|
+
if hasattr(self, "_pdf") and self._pdf is not None:
|
940
|
+
try:
|
941
|
+
self._pdf.close()
|
942
|
+
logger.debug(f"Closed underlying pdfplumber PDF object for {self.source_path}")
|
943
|
+
except Exception as e:
|
944
|
+
logger.warning(f"Error closing pdfplumber object: {e}")
|
945
|
+
finally:
|
946
|
+
self._pdf = None
|
850
947
|
|
851
948
|
# Clean up temporary file if it exists
|
852
|
-
if hasattr(self,
|
949
|
+
if hasattr(self, "_temp_file") and self._temp_file is not None:
|
853
950
|
temp_file_path = None
|
854
951
|
try:
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
952
|
+
if hasattr(self._temp_file, "name") and self._temp_file.name:
|
953
|
+
temp_file_path = self._temp_file.name
|
954
|
+
if os.path.exists(temp_file_path):
|
955
|
+
os.unlink(temp_file_path)
|
956
|
+
logger.debug(f"Removed temporary PDF file: {temp_file_path}")
|
860
957
|
except Exception as e:
|
861
|
-
|
958
|
+
logger.warning(f"Failed to clean up temporary PDF file '{temp_file_path}': {e}")
|
862
959
|
finally:
|
863
|
-
|
960
|
+
self._temp_file = None
|
864
961
|
|
865
962
|
def __enter__(self):
|
866
963
|
"""Context manager entry."""
|
867
964
|
return self
|
868
|
-
|
965
|
+
|
869
966
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
870
967
|
"""Context manager exit."""
|
871
968
|
self.close()
|
872
969
|
|
970
|
+
|
873
971
|
# --- Added TYPE_CHECKING import (if not already present) ---
|
874
972
|
if TYPE_CHECKING:
|
875
|
-
from pathlib import Path
|
973
|
+
from pathlib import Path # Assuming Path is used for type hint
|