natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +126 -98
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +910 -516
- natural_pdf/core/pdf.py +387 -289
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +714 -514
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.3.dist-info/RECORD +0 -61
- natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/qa/document_qa.py
CHANGED
@@ -1,10 +1,12 @@
+import json
 import logging
-from typing import List, Dict, Any, Optional, Union, Tuple
-import numpy as np
-from PIL import Image, ImageDraw
 import os
 import tempfile
-import json
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from PIL import Image, ImageDraw
+
 from natural_pdf.elements.collections import ElementCollection
 
 logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -12,41 +14,42 @@ logger = logging.getLogger("natural_pdf.qa.document_qa")
 # Global QA engine instance
 _QA_ENGINE_INSTANCE = None
 
+
 def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
     """
     Get or create a global QA engine instance.
-    
+
     Args:
         model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
         **kwargs: Additional parameters to pass to the DocumentQA constructor
-    
+
     Returns:
         DocumentQA instance
     """
     global _QA_ENGINE_INSTANCE
-    
+
     if _QA_ENGINE_INSTANCE is None:
         try:
             _QA_ENGINE_INSTANCE = DocumentQA(model_name=model_name, **kwargs)
         except Exception as e:
             logger.error(f"Failed to initialize QA engine: {e}")
             raise
-    
+
     return _QA_ENGINE_INSTANCE
 
 
 class DocumentQA:
     """
     Document Question Answering using LayoutLM.
-    
+
     This class provides the ability to ask natural language questions about document content,
     leveraging the spatial layout information from PDF pages.
     """
-    
+
     def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: str = None):
         """
         Initialize the Document QA engine.
-        
+
         Args:
             model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
             device: Device to run the model on ('cuda' or 'cpu'). If None, will use cuda if available.
@@ -54,20 +57,20 @@ class DocumentQA:
         try:
             import torch
             from transformers import pipeline
-            
+
             # Determine device
             if device is None:
-                device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+
             logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
-            
+
             # Initialize the pipeline
             self.pipe = pipeline("document-question-answering", model=model_name, device=device)
-            
+
             self.model_name = model_name
             self.device = device
             self._is_initialized = True
-            
+
         except ImportError as e:
             logger.error(f"Failed to import required packages: {e}")
             self._is_initialized = False
@@ -79,56 +82,55 @@ class DocumentQA:
             logger.error(f"Failed to initialize DocumentQA: {e}")
             self._is_initialized = False
             raise
-    
+
     def is_available(self) -> bool:
         """Check if the QA engine is properly initialized."""
         return self._is_initialized
-    
+
     def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
         """
         Extract word boxes from text elements.
-        
+
         Args:
             elements: List of TextElement objects
            offset_x: X-coordinate offset to subtract (for region cropping)
            offset_y: Y-coordinate offset to subtract (for region cropping)
-        
+
         Returns:
             List of [text, [x0, top, x1, bottom]] entries
         """
         word_boxes = []
-        
+
         for element in elements:
-            if hasattr(element, 'text') and element.text.strip():
+            if hasattr(element, "text") and element.text.strip():
                 # Apply offset for cropped regions
                 x0 = int(element.x0) - offset_x
                 top = int(element.top) - offset_y
                 x1 = int(element.x1) - offset_x
                 bottom = int(element.bottom) - offset_y
-                
+
                 # Ensure coordinates are valid (non-negative)
                 x0 = max(0, x0)
                 top = max(0, top)
                 x1 = max(0, x1)
                 bottom = max(0, bottom)
-                
-                word_boxes.append([
-                    element.text,
-                    [x0, top, x1, bottom]
-                ])
-        
+
+                word_boxes.append([element.text, [x0, top, x1, bottom]])
+
         return word_boxes
-    
-    def ask(self,
-            image: Union[str, Image.Image, np.ndarray],
-            question: str,
-            word_boxes: List = None,
-            min_confidence: float = 0.1,
-            debug: bool = False,
-            debug_output_dir: str = "output") -> Dict[str, Any]:
+
+    def ask(
+        self,
+        image: Union[str, Image.Image, np.ndarray],
+        question: str,
+        word_boxes: List = None,
+        min_confidence: float = 0.1,
+        debug: bool = False,
+        debug_output_dir: str = "output",
+    ) -> Dict[str, Any]:
         """
         Ask a question about document content.
-        
+
         Args:
             image: PIL Image, numpy array, or path to image file
             question: Question to ask about the document
@@ -136,7 +138,7 @@ class DocumentQA:
             min_confidence: Minimum confidence threshold for answers
             debug: Whether to save debug information
             debug_output_dir: Directory to save debug files
-        
+
         Returns:
             Dictionary with answer details: {
                 "answer": extracted text,
@@ -147,7 +149,7 @@ class DocumentQA:
         """
         if not self._is_initialized:
             raise RuntimeError("DocumentQA is not properly initialized")
-        
+
         # Process the image
         if isinstance(image, str):
             # It's a file path
@@ -162,65 +164,68 @@ class DocumentQA:
             image_obj = image
         else:
             raise TypeError("Image must be a PIL Image, numpy array, or file path")
-        
+
         # Prepare the query
-        query = {
-            "image": image_obj,
-            "question": question
-        }
-        
+        query = {"image": image_obj, "question": question}
+
         # Add word boxes if provided
         if word_boxes:
             query["word_boxes"] = word_boxes
-        
+
         # Save debug information if requested
-        if debug: 
+        if debug:
             # Create debug directory
             os.makedirs(debug_output_dir, exist_ok=True)
-            
+
             # Save the image
             image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
             image_obj.save(image_debug_path)
-            
+
             # Save word boxes
             if word_boxes:
                 word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
-                with open(word_boxes_path, 'w') as f:
+                with open(word_boxes_path, "w") as f:
                     json.dump(word_boxes, f, indent=2)
-            
+
             # Generate a visualization of the boxes on the image
             vis_image = image_obj.copy()
             draw = ImageDraw.Draw(vis_image)
-            
+
             for i, (text, box) in enumerate(word_boxes):
                 x0, y0, x1, y1 = box
                 draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
                 # Add text index for reference
                 draw.text((x0, y0), str(i), fill=(255, 0, 0))
-            
+
             vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
             vis_image.save(vis_path)
-            
+
             logger.info(f"Saved debug files to {debug_output_dir}")
             logger.info(f"Question: {question}")
             logger.info(f"Image: {image_debug_path}")
             logger.info(f"Word boxes: {word_boxes_path}")
             logger.info(f"Visualization: {vis_path}")
-        
+
         # Run the query through the pipeline
         logger.info(f"Running document QA pipeline with question: {question}")
         result = self.pipe(query)[0]
         logger.info(f"Raw result: {result}")
-        
+
         # Save the result if debugging
         if debug:
             result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
-            with open(result_path, 'w') as f:
+            with open(result_path, "w") as f:
                 # Convert any non-serializable data
-                serializable_result = {
-                    k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v for k, v in result.items()}
+                serializable_result = {
+                    k: (
+                        str(v)
+                        if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
+                        else v
+                    )
+                    for k, v in result.items()
+                }
                 json.dump(serializable_result, f, indent=2)
-        
+
         # Check confidence against threshold
         if result["score"] < min_confidence:
             logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
@@ -229,48 +234,49 @@ class DocumentQA:
                 "confidence": result["score"],
                 "start": result.get("start", -1),
                 "end": result.get("end", -1),
-                "found": False
+                "found": False,
             }
-        
+
         return {
             "answer": result["answer"],
             "confidence": result["score"],
             "start": result.get("start", 0),
             "end": result.get("end", 0),
-            "found": True
+            "found": True,
         }
-    
-    def ask_pdf_page(self, page, question: str,
-                     min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
+
+    def ask_pdf_page(
+        self, page, question: str, min_confidence: float = 0.1, debug: bool = False
+    ) -> Dict[str, Any]:
         """
         Ask a question about a specific PDF page.
-        
+
         Args:
             page: natural_pdf.core.page.Page object
             question: Question to ask about the page
             min_confidence: Minimum confidence threshold for answers
-        
+
         Returns:
             Dictionary with answer details
         """
         # Ensure we have text elements on the page
-        if not page.find_all('text'):
+        if not page.find_all("text"):
             # Apply OCR if no text is available
             logger.info(f"No text elements found on page {page.index}, applying OCR")
             page.apply_ocr()
-        
+
         # Extract word boxes
-        elements = page.find_all('text')
+        elements = page.find_all("text")
         word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
-        
+
         # Generate a high-resolution image of the page
-        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             temp_path = temp_file.name
-        
+
         # Save a high resolution image (300 DPI)
         page_image = page.to_image(resolution=300, include_highlights=False)
         page_image.save(temp_path)
-        
+
         try:
             # Ask the question
             result = self.ask(
@@ -278,79 +284,81 @@ class DocumentQA:
                 question=question,
                 word_boxes=word_boxes,
                 min_confidence=min_confidence,
-                debug=debug
+                debug=debug,
             )
-            
+
             # Add page reference to the result
             result["page_num"] = page.index
-            
+
             # Add element references if possible
             if result.get("found", False) and "start" in result and "end" in result:
                 start_idx = result["start"]
                 end_idx = result["end"]
-                
+
                 # Make sure we have valid indices and elements to work with
                 if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                     # Find the actual source elements in the original list
                     # Since word_boxes may have filtered out some elements, we need to map indices
-                    
+
                     # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
-                    
+                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+
                     # Find corresponding elements in the full element list
                     source_elements = []
                     for element in elements:
-                        if hasattr(element, 'text') and element.text in matched_texts:
+                        if hasattr(element, "text") and element.text in matched_texts:
                             source_elements.append(element)
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
 
                     result["source_elements"] = ElementCollection(source_elements)
-            
+
             return result
-        
+
         finally:
             # Clean up temporary file
             if os.path.exists(temp_path):
                 os.remove(temp_path)
-    
-    def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
+
+    def ask_pdf_region(
+        self, region, question: str, min_confidence: float = 0.1, debug: bool = False
+    ) -> Dict[str, Any]:
         """
         Ask a question about a specific region of a PDF page.
-        
+
         Args:
             region: natural_pdf.elements.region.Region object
             question: Question to ask about the region
             min_confidence: Minimum confidence threshold for answers
-        
+
         Returns:
             Dictionary with answer details
         """
         # Get all text elements within the region
-        elements = region.find_all('text')
-        
+        elements = region.find_all("text")
+
         # Apply OCR if needed
         if not elements:
             logger.info(f"No text elements found in region, applying OCR")
             elements = region.apply_ocr()
-        
+
         # Extract word boxes adjusted for the cropped region
         x0, top = int(region.x0), int(region.top)
         word_boxes = self._get_word_boxes_from_elements(elements, offset_x=x0, offset_y=top)
-        
+
         # Generate a cropped image of the region
-        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
             temp_path = temp_file.name
-        
+
         # Get page image at high resolution - this returns a PIL Image directly
         page_image = region.page.to_image(resolution=300, include_highlights=False)
-        
+
         # Crop to region
         x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
         region_image = page_image.crop((x0, top, x1, bottom))
         region_image.save(temp_path)
-        
+
         try:
             # Ask the question
             result = self.ask(
@@ -358,40 +366,40 @@ class DocumentQA:
                 question=question,
                 word_boxes=word_boxes,
                 min_confidence=min_confidence,
-                debug=debug
+                debug=debug,
            )
-            
+
             # Add region reference to the result
             result["region"] = region
             result["page_num"] = region.page.index
-            
+
             # Add element references if possible
             if result.get("found", False) and "start" in result and "end" in result:
                 start_idx = result["start"]
                 end_idx = result["end"]
-                
+
                 # Make sure we have valid indices and elements to work with
                 if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                     # Find the actual source elements in the original list
                     # Since word_boxes may have filtered out some elements, we need to map indices
-                    
+
                     # Get the text from result word boxes
-                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
-                    
+                    matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
+
                     # Find corresponding elements in the full element list
                     source_elements = []
                     for element in elements:
-                        if hasattr(element, 'text') and element.text in matched_texts:
+                        if hasattr(element, "text") and element.text in matched_texts:
                             source_elements.append(element)
                             # Remove from matched texts to avoid duplicates
                             if element.text in matched_texts:
                                 matched_texts.remove(element.text)
-            
+
                     result["source_elements"] = ElementCollection(source_elements)
-            
+
             return result
-        
+
         finally:
             # Clean up temporary file
             if os.path.exists(temp_path):
-                os.remove(temp_path)
+                os.remove(temp_path)
natural_pdf/search/__init__.py
CHANGED
@@ -7,32 +7,29 @@ from typing import Optional
 # Import the concrete implementation
 from .haystack_search_service import HaystackSearchService
 
-# --- Protocol Import ---
-# Import the protocol for type hinting
-from .search_service_protocol import (
-    SearchServiceProtocol,
-    IndexConfigurationError,
-    Indexable
+# --- Utils Import ---
+from .haystack_utils import (  # Re-export flag and helper
+    HAS_HAYSTACK_EXTRAS,
+    check_haystack_availability,
 )
 
 # --- Option Imports (for convenience) ---
 # Make options easily available via `from natural_pdf.search import ...`
-from .search_options import (
-    SearchOptions,
-    TextSearchOptions,
-    MultiModalSearchOptions,
-    BaseSearchOptions,
-)
-
-# --- Utils Import ---
-from .haystack_utils import HAS_HAYSTACK_EXTRAS, check_haystack_availability  # Re-export flag and helper
+from .search_options import SearchOptions  # Alias for TextSearchOptions for simplicity?
+from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+
+# --- Protocol Import ---
+# Import the protocol for type hinting
+from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
 logger = logging.getLogger(__name__)
 
 # --- Factory Function ---
 
+
 def get_search_service(
-    collection_name: str,
-    persist: bool = False,
+    collection_name: str,  # Add collection_name as a required argument
+    persist: bool = False,  # Default to In-Memory
     # Configuration for the service itself
     default_persist_path: Optional[str] = None,
     default_embedding_model: Optional[str] = None,
@@ -56,39 +53,48 @@ def get_search_service(
     Returns:
         An instance conforming to the SearchServiceProtocol for the specified collection.
     """
-    logger.debug(f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})...")
-    
+    logger.debug(
+        f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})..."
+    )
+
     # For now, we only have one implementation
     # Collect arguments relevant to HaystackSearchService.__init__
     service_args = {}
-    service_args['collection_name'] = collection_name
-    service_args['persist'] = persist
+    service_args["collection_name"] = collection_name  # Pass collection_name
+    service_args["persist"] = persist  # Pass persist flag to service constructor
     if default_persist_path is not None:
-        service_args['default_persist_path'] = default_persist_path
+        service_args["default_persist_path"] = default_persist_path
     if default_embedding_model is not None:
-        service_args['default_embedding_model'] = default_embedding_model
-    
+        service_args["default_embedding_model"] = default_embedding_model
+
     # TODO: Implement caching/registry if needed to return the same instance
     # for the same configuration instead of always creating a new one.
     # cache_key = tuple(sorted(service_args.items()))
     # if cache_key in _service_instance_cache:
     #     return _service_instance_cache[cache_key]
-    
+
     try:
         service_instance = HaystackSearchService(**service_args)
         # _service_instance_cache[cache_key] = service_instance
-        logger.info(f"Created new HaystackSearchService instance for collection '{collection_name}'.")
+        logger.info(
+            f"Created new HaystackSearchService instance for collection '{collection_name}'."
+        )
         return service_instance
     except ImportError as e:
-        logger.error(f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True)
-        raise ImportError("Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]") from e
+        logger.error(
+            f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+        )
+        raise ImportError(
+            "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
+        ) from e
     except Exception as e:
-        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
-        raise RuntimeError("Could not create Search Service instance.") from e
+        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
+        raise RuntimeError("Could not create Search Service instance.") from e
+
 
 # --- Optional: Define a default instance for extreme ease of use? ---
 # try:
 #     default_search_service = get_search_service()
 # except Exception:
-#    default_search_service = None
-#    logger.warning("Could not create default search service instance on import.")
+#     default_search_service = None
+#     logger.warning("Could not create default search service instance on import.")