natural_pdf-25.3.16-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
natural_pdf/qa/document_qa.py
@@ -0,0 +1,405 @@
import logging
from typing import List, Dict, Any, Optional, Union, Tuple
import numpy as np
from PIL import Image, ImageDraw
import os
import tempfile
import json

logger = logging.getLogger("natural_pdf.qa.document_qa")

# Global QA engine instance
_QA_ENGINE_INSTANCE = None

def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
    """
    Get or create a global QA engine instance.

    Args:
        model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
        **kwargs: Additional parameters to pass to the DocumentQA constructor

    Returns:
        DocumentQA instance
    """
    global _QA_ENGINE_INSTANCE

    if _QA_ENGINE_INSTANCE is None:
        try:
            _QA_ENGINE_INSTANCE = DocumentQA(model_name=model_name, **kwargs)
        except Exception as e:
            logger.error(f"Failed to initialize QA engine: {e}")
            raise

    return _QA_ENGINE_INSTANCE


class DocumentQA:
    """
    Document Question Answering using LayoutLM.

    This class provides the ability to ask natural language questions about document content,
    leveraging the spatial layout information from PDF pages.
    """

    def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: str = None):
        """
        Initialize the Document QA engine.

        Args:
            model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
            device: Device to run the model on ('cuda' or 'cpu'). If None, uses cuda if available.
        """
        try:
            import torch
            from transformers import pipeline

            # Determine device
            if device is None:
                device = 'cuda' if torch.cuda.is_available() else 'cpu'

            logger.info(f"Initializing DocumentQA with model {model_name} on {device}")

            # Initialize the pipeline
            self.pipe = pipeline("document-question-answering", model=model_name, device=device)

            self.model_name = model_name
            self.device = device
            self._is_initialized = True

        except ImportError as e:
            logger.error(f"Failed to import required packages: {e}")
            self._is_initialized = False
            raise ImportError(
                "DocumentQA requires transformers and torch to be installed. "
                "Install with pip install transformers torch"
            )
        except Exception as e:
            logger.error(f"Failed to initialize DocumentQA: {e}")
            self._is_initialized = False
            raise

    def is_available(self) -> bool:
        """Check if the QA engine is properly initialized."""
        return self._is_initialized

    def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
        """
        Extract word boxes from text elements.

        Args:
            elements: List of TextElement objects
            offset_x: X-coordinate offset to subtract (for region cropping)
            offset_y: Y-coordinate offset to subtract (for region cropping)

        Returns:
            List of [text, [x0, top, x1, bottom]] entries
        """
        word_boxes = []

        for element in elements:
            if hasattr(element, 'text') and element.text.strip():
                # Apply offset for cropped regions
                x0 = int(element.x0) - offset_x
                top = int(element.top) - offset_y
                x1 = int(element.x1) - offset_x
                bottom = int(element.bottom) - offset_y

                # Ensure coordinates are valid (non-negative)
                x0 = max(0, x0)
                top = max(0, top)
                x1 = max(0, x1)
                bottom = max(0, bottom)

                word_boxes.append([element.text, [x0, top, x1, bottom]])

        return word_boxes

    def ask(self,
            image: Union[str, Image.Image, np.ndarray],
            question: str,
            word_boxes: List = None,
            min_confidence: float = 0.1,
            debug: bool = False,
            debug_output_dir: str = "output") -> Dict[str, Any]:
        """
        Ask a question about document content.

        Args:
            image: PIL Image, numpy array, or path to image file
            question: Question to ask about the document
            word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information
            debug_output_dir: Directory to save debug files

        Returns:
            Dictionary with answer details: {
                "answer": extracted text,
                "confidence": confidence score,
                "start": start word index,
                "end": end word index,
                "found": whether an answer cleared min_confidence
            }
        """
        if not self._is_initialized:
            raise RuntimeError("DocumentQA is not properly initialized")

        # Process the image
        if isinstance(image, str):
            # It's a file path
            if not os.path.exists(image):
                raise FileNotFoundError(f"Image file not found: {image}")
            image_obj = Image.open(image)
        elif isinstance(image, np.ndarray):
            # Convert numpy array to PIL Image
            image_obj = Image.fromarray(image)
        elif isinstance(image, Image.Image):
            # Already a PIL Image
            image_obj = image
        else:
            raise TypeError("Image must be a PIL Image, numpy array, or file path")

        # Prepare the query
        query = {
            "image": image_obj,
            "question": question
        }

        # Add word boxes if provided
        if word_boxes:
            query["word_boxes"] = word_boxes

        # Save debug information if requested
        if debug:
            # Create debug directory
            os.makedirs(debug_output_dir, exist_ok=True)

            # Save the image
            image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
            image_obj.save(image_debug_path)

            logger.info(f"Saved debug files to {debug_output_dir}")
            logger.info(f"Question: {question}")
            logger.info(f"Image: {image_debug_path}")

            # Save word boxes
            if word_boxes:
                word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
                with open(word_boxes_path, 'w') as f:
                    json.dump(word_boxes, f, indent=2)

                # Generate a visualization of the boxes on the image
                vis_image = image_obj.copy()
                draw = ImageDraw.Draw(vis_image)

                for i, (text, box) in enumerate(word_boxes):
                    x0, y0, x1, y1 = box
                    draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
                    # Add text index for reference
                    draw.text((x0, y0), str(i), fill=(255, 0, 0))

                vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
                vis_image.save(vis_path)

                logger.info(f"Word boxes: {word_boxes_path}")
                logger.info(f"Visualization: {vis_path}")

        # Run the query through the pipeline
        try:
            logger.info(f"Running document QA pipeline with question: {question}")
            result = self.pipe(query)[0]
            logger.info(f"Raw result: {result}")

            # Save the result if debugging
            if debug:
                result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
                with open(result_path, 'w') as f:
                    # Convert any non-serializable data
                    serializable_result = {
                        k: v if isinstance(v, (str, int, float, bool, list, dict, type(None))) else str(v)
                        for k, v in result.items()
                    }
                    json.dump(serializable_result, f, indent=2)

            # Check confidence against threshold
            if result["score"] < min_confidence:
                logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
                return {
                    "answer": "",
                    "confidence": result["score"],
                    "start": result.get("start", -1),
                    "end": result.get("end", -1),
                    "found": False
                }

            return {
                "answer": result["answer"],
                "confidence": result["score"],
                "start": result.get("start", 0),
                "end": result.get("end", 0),
                "found": True
            }

        except Exception as e:
            logger.error(f"Error in document QA: {e}")
            return {
                "answer": "",
                "confidence": 0.0,
                "error": str(e),
                "found": False
            }

    def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific PDF page.

        Args:
            page: natural_pdf.core.page.Page object
            question: Question to ask about the page
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information

        Returns:
            Dictionary with answer details
        """
        # Ensure we have text elements on the page
        if not page.find_all('text'):
            # Apply OCR if no text is available
            logger.info(f"No text elements found on page {page.index}, applying OCR")
            page.apply_ocr()

        # Extract word boxes
        elements = page.find_all('text')
        word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)

        # Generate a high-resolution image of the page
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
            temp_path = temp_file.name

            # Save a high-resolution image (300 DPI)
            page_image = page.to_image(resolution=300, include_highlights=False)
            page_image.save(temp_path)

        try:
            # Ask the question
            result = self.ask(
                image=temp_path,
                question=question,
                word_boxes=word_boxes,
                min_confidence=min_confidence,
                debug=debug
            )

            # Add page reference to the result
            result["page_num"] = page.index

            # Add element references if possible
            if result.get("found", False) and "start" in result and "end" in result:
                start_idx = result["start"]
                end_idx = result["end"]

                # Make sure we have valid indices and elements to work with
                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                    # Get the text from the matched word boxes
                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx + 1]]

                    # Find corresponding elements in the full element list.
                    # word_boxes may have filtered out some elements, so match by text.
                    source_elements = []
                    for element in elements:
                        if hasattr(element, 'text') and element.text in matched_texts:
                            source_elements.append(element)
                            # Remove from matched texts to avoid duplicates
                            matched_texts.remove(element.text)

                    result["source_elements"] = source_elements

            return result

        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.remove(temp_path)

    def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific region of a PDF page.

        Args:
            region: natural_pdf.elements.region.Region object
            question: Question to ask about the region
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information

        Returns:
            Dictionary with answer details
        """
        # Get all text elements within the region
        elements = region.find_all('text')

        # Apply OCR if needed
        if not elements:
            logger.info("No text elements found in region, applying OCR")
            elements = region.apply_ocr()

        # Extract word boxes adjusted for the cropped region
        x0, top = int(region.x0), int(region.top)
        word_boxes = self._get_word_boxes_from_elements(elements, offset_x=x0, offset_y=top)

        # Generate a cropped image of the region
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
            temp_path = temp_file.name

            # Get the page image at high resolution - this returns a PIL Image directly
            page_image = region.page.to_image(resolution=300, include_highlights=False)

            # Crop to the region
            x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
            region_image = page_image.crop((x0, top, x1, bottom))
            region_image.save(temp_path)

        try:
            # Ask the question
            result = self.ask(
                image=temp_path,
                question=question,
                word_boxes=word_boxes,
                min_confidence=min_confidence,
                debug=debug
            )

            # Add region reference to the result
            result["region"] = region
            result["page_num"] = region.page.index

            # Add element references if possible
            if result.get("found", False) and "start" in result and "end" in result:
                start_idx = result["start"]
                end_idx = result["end"]

                # Make sure we have valid indices and elements to work with
                if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
                    # Get the text from the matched word boxes
                    matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx + 1]]

                    # Find corresponding elements in the full element list.
                    # word_boxes may have filtered out some elements, so match by text.
                    source_elements = []
                    for element in elements:
                        if hasattr(element, 'text') and element.text in matched_texts:
                            source_elements.append(element)
                            # Remove from matched texts to avoid duplicates
                            matched_texts.remove(element.text)

                    result["source_elements"] = source_elements

            return result

        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.remove(temp_path)
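For orientation, a minimal usage sketch of the QA API this module adds. It assumes a PDF class is exported from the package's top level (natural_pdf/__init__.py above) and that pages are reachable via a pages list, as is common in this kind of API; "invoice.pdf" and the question are placeholders.

    # Hedged sketch: PDF, pdf.pages, and "invoice.pdf" are assumptions,
    # not confirmed by this diff; get_qa_engine and ask_pdf_page are from
    # natural_pdf/qa/document_qa.py as shown above.
    from natural_pdf import PDF
    from natural_pdf.qa.document_qa import get_qa_engine

    pdf = PDF("invoice.pdf")
    page = pdf.pages[0]

    qa = get_qa_engine()  # lazily builds the shared DocumentQA instance
    result = qa.ask_pdf_page(page, "What is the total amount due?")

    if result["found"]:
        # "answer" is the extracted span; "confidence" is the model score;
        # "source_elements" (when present) maps back to the page's text elements
        print(result["answer"], result["confidence"])

ask_pdf_region works the same way on a natural_pdf.elements.region.Region, and the lower-level ask method accepts a raw image plus optional pre-extracted word boxes for callers that manage their own layout extraction.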