natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,3 @@
1
+ from natural_pdf.qa.document_qa import DocumentQA, get_qa_engine
2
+
3
+ __all__ = ["DocumentQA", "get_qa_engine"]
@@ -0,0 +1,396 @@
1
+ import logging
2
+ from typing import List, Dict, Any, Optional, Union, Tuple
3
+ import numpy as np
4
+ from PIL import Image, ImageDraw
5
+ import os
6
+ import tempfile
7
+ import json
8
+
9
+ logger = logging.getLogger("natural_pdf.qa.document_qa")
10
+
11
+ # Module-level cache for the DocumentQA instance managed by get_qa_engine(); stays None until the first successful call.
12
+ _QA_ENGINE_INSTANCE = None
13
+
14
def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
    """
    Return the shared DocumentQA engine, building it on first use.

    The engine is cached in the module-level ``_QA_ENGINE_INSTANCE``. Only the
    call that actually creates the engine uses ``model_name`` and ``kwargs``;
    once an instance is cached, later calls return it unchanged and their
    arguments are ignored.

    Args:
        model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
        **kwargs: Additional parameters to pass to the DocumentQA constructor

    Returns:
        DocumentQA instance

    Raises:
        Exception: Whatever the DocumentQA constructor raises; the failure is
            logged and re-raised, and nothing is cached.
    """
    global _QA_ENGINE_INSTANCE

    # Fast path: an engine already exists, hand it back as-is.
    if _QA_ENGINE_INSTANCE is not None:
        return _QA_ENGINE_INSTANCE

    try:
        engine = DocumentQA(model_name=model_name, **kwargs)
    except Exception as e:
        logger.error(f"Failed to initialize QA engine: {e}")
        raise

    # Publish the instance only after construction succeeded.
    _QA_ENGINE_INSTANCE = engine
    return engine
35
+
36
+
37
class DocumentQA:
    """
    Document Question Answering using LayoutLM.

    This class provides the ability to ask natural language questions about document content,
    leveraging the spatial layout information from PDF pages.

    The heavy lifting is delegated to a HuggingFace ``document-question-answering``
    pipeline stored on ``self.pipe``. ``ask`` works on a raw image (plus optional
    word boxes); ``ask_pdf_page`` and ``ask_pdf_region`` build those inputs from
    page/region objects and map the answer back to the originating elements.
    """

    # DPI requested from ``to_image`` when rendering a page for the pipeline.
    _RENDER_RESOLUTION = 300

    def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: Optional[str] = None):
        """
        Initialize the Document QA engine.

        Args:
            model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
            device: Device to run the model on ('cuda' or 'cpu'). If None, will use cuda if available.

        Raises:
            ImportError: If torch/transformers (or something they import) is missing.
            Exception: Any other error raised while building the pipeline is
                logged and re-raised unchanged.
        """
        # Set up-front so is_available() is well-defined on every code path.
        self._is_initialized = False

        try:
            # Imported lazily so the rest of the package works without the
            # optional QA dependencies installed.
            import torch
            from transformers import pipeline

            # Determine device
            if device is None:
                device = 'cuda' if torch.cuda.is_available() else 'cpu'

            logger.info(f"Initializing DocumentQA with model {model_name} on {device}")

            # Initialize the pipeline
            self.pipe = pipeline("document-question-answering", model=model_name, device=device)

            self.model_name = model_name
            self.device = device
            self._is_initialized = True

        except ImportError as e:
            logger.error(f"Failed to import required packages: {e}")
            # Chain the original error so the missing module stays visible.
            raise ImportError(
                "DocumentQA requires transformers and torch to be installed. "
                "Install with pip install transformers torch"
            ) from e
        except Exception as e:
            logger.error(f"Failed to initialize DocumentQA: {e}")
            raise

    def is_available(self) -> bool:
        """Check if the QA engine is properly initialized."""
        return self._is_initialized

    @staticmethod
    def _has_text(element) -> bool:
        """Return True when ``element`` carries a non-blank string ``text``."""
        text = getattr(element, 'text', None)
        return isinstance(text, str) and bool(text.strip())

    def _elements_with_text(self, elements) -> List[Any]:
        """
        Return the elements that contribute a word box, in their original order.

        Uses the same filter as ``_get_word_boxes_from_elements`` so that index
        ``i`` of the returned list corresponds to index ``i`` of the word boxes
        built from it.
        """
        if elements is None:
            return []
        return [element for element in elements if self._has_text(element)]

    def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
        """
        Extract word boxes from text elements.

        Elements without a non-blank string ``text`` are skipped. Coordinates are
        truncated to integers, shifted by the offsets and clamped to be
        non-negative; they are otherwise passed through in the elements' own units.

        NOTE(review): the transformers document-question-answering pipeline
        documents ``word_boxes`` as normalized to 0-1000; confirm whether the
        boxes produced here should be normalized before being sent to the model.

        Args:
            elements: List of TextElement objects
            offset_x: X-coordinate offset to subtract (for region cropping)
            offset_y: Y-coordinate offset to subtract (for region cropping)

        Returns:
            List of [text, [x0, top, x1, bottom]] entries
        """
        word_boxes = []

        for element in elements:
            if not self._has_text(element):
                continue

            # Apply offset for cropped regions and keep coordinates non-negative.
            x0 = max(0, int(element.x0) - offset_x)
            top = max(0, int(element.top) - offset_y)
            x1 = max(0, int(element.x1) - offset_x)
            bottom = max(0, int(element.bottom) - offset_y)

            word_boxes.append([
                element.text,
                [x0, top, x1, bottom]
            ])

        return word_boxes

    @staticmethod
    def _load_image(image):
        """
        Normalize the ``image`` argument of ``ask`` into a PIL Image.

        Raises:
            FileNotFoundError: If ``image`` is a path that does not exist.
            TypeError: If ``image`` is not a path, numpy array, or PIL Image.
        """
        if isinstance(image, str):
            # It's a file path
            if not os.path.exists(image):
                raise FileNotFoundError(f"Image file not found: {image}")
            # Decode fully and release the file handle immediately; otherwise the
            # lazily-loaded image keeps the file open, which leaks the handle and
            # can prevent callers from deleting the (temporary) file afterwards.
            with Image.open(image) as opened:
                return opened.copy()
        if isinstance(image, np.ndarray):
            # Convert numpy array to PIL Image
            return Image.fromarray(image)
        if isinstance(image, Image.Image):
            # Already a PIL Image
            return image
        raise TypeError("Image must be a PIL Image, numpy array, or file path")

    @staticmethod
    def _save_debug_inputs(image_obj, question, word_boxes, debug_output_dir):
        """Write the query image, word boxes and a box overlay to ``debug_output_dir``."""
        # Create debug directory
        os.makedirs(debug_output_dir, exist_ok=True)

        # Save the image
        image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
        image_obj.save(image_debug_path)

        word_boxes_path = None
        vis_path = None

        if word_boxes:
            # Save word boxes
            word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
            with open(word_boxes_path, 'w', encoding='utf-8') as f:
                json.dump(word_boxes, f, indent=2)

            # Generate a visualization of the boxes on the image
            vis_image = image_obj.copy()
            draw = ImageDraw.Draw(vis_image)

            for i, (_text, box) in enumerate(word_boxes):
                x0, y0, x1, y1 = box
                draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
                # Add text index for reference
                draw.text((x0, y0), str(i), fill=(255, 0, 0))

            vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
            vis_image.save(vis_path)

        logger.info(f"Saved debug files to {debug_output_dir}")
        logger.info(f"Question: {question}")
        logger.info(f"Image: {image_debug_path}")
        # Only mention the word-box artifacts when they were actually written.
        if word_boxes_path is not None:
            logger.info(f"Word boxes: {word_boxes_path}")
            logger.info(f"Visualization: {vis_path}")

    def ask(self,
            image: Union[str, Image.Image, np.ndarray],
            question: str,
            word_boxes: List = None,
            min_confidence: float = 0.1,
            debug: bool = False,
            debug_output_dir: str = "output") -> Dict[str, Any]:
        """
        Ask a question about document content.

        Args:
            image: PIL Image, numpy array, or path to image file
            question: Question to ask about the document
            word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information
            debug_output_dir: Directory to save debug files

        Returns:
            Dictionary with answer details: {
                "answer": extracted text ("" when below threshold or no answer),
                "confidence": confidence score,
                "start": start word index,
                "end": end word index,
                "found": True when the answer met ``min_confidence``
            }

        Raises:
            RuntimeError: If the engine was not initialized.
            FileNotFoundError: If ``image`` is a path that does not exist.
            TypeError: If ``image`` has an unsupported type.
        """
        if not self._is_initialized:
            raise RuntimeError("DocumentQA is not properly initialized")

        # Process the image
        image_obj = self._load_image(image)

        # Prepare the query
        query = {
            "image": image_obj,
            "question": question
        }

        # Add word boxes if provided
        if word_boxes:
            query["word_boxes"] = word_boxes

        # Save debug information if requested
        if debug:
            self._save_debug_inputs(image_obj, question, word_boxes, debug_output_dir)

        # Run the query through the pipeline
        logger.info(f"Running document QA pipeline with question: {question}")
        raw_output = self.pipe(query)

        # The pipeline normally yields a list of candidate answers; tolerate a
        # single dict and an empty list instead of failing on ``[0]``.
        if isinstance(raw_output, dict):
            result = raw_output
        elif raw_output:
            result = raw_output[0]
        else:
            logger.info("Document QA pipeline returned no answers")
            return {
                "answer": "",
                "confidence": 0.0,
                "start": -1,
                "end": -1,
                "found": False
            }
        logger.info(f"Raw result: {result}")

        # Save the result if debugging
        if debug:
            result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
            with open(result_path, 'w', encoding='utf-8') as f:
                # Convert any non-serializable data
                serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
                                       for k, v in result.items()}
                json.dump(serializable_result, f, indent=2)

        # Check confidence against threshold
        if result["score"] < min_confidence:
            logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
            return {
                "answer": "",
                "confidence": result["score"],
                "start": result.get("start", -1),
                "end": result.get("end", -1),
                "found": False
            }

        return {
            "answer": result["answer"],
            "confidence": result["score"],
            "start": result.get("start", 0),
            "end": result.get("end", 0),
            "found": True
        }

    def _ask_via_temp_file(self, image, question, word_boxes, min_confidence, debug) -> Dict[str, Any]:
        """
        Save ``image`` to a temporary PNG, run ``ask`` on it, and always clean up.

        The temporary file is removed even when saving the image or running the
        pipeline raises.
        """
        # delete=False: only the name is needed here; the file is re-opened by
        # path and removed explicitly in the ``finally`` below.
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
            temp_path = temp_file.name

        try:
            image.save(temp_path)
            return self.ask(
                image=temp_path,
                question=question,
                word_boxes=word_boxes,
                min_confidence=min_confidence,
                debug=debug
            )
        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.remove(temp_path)

    @staticmethod
    def _attach_source_elements(result, text_elements) -> None:
        """
        Add ``result["source_elements"]`` for a found answer, when indices are valid.

        ``text_elements`` must be the exact list the word boxes were built from,
        so the answer's start/end word indices can be applied to it directly.
        Matching by position (not by text) keeps the mapping correct when the
        same word occurs more than once.
        """
        if not result.get("found", False) or "start" not in result or "end" not in result:
            return

        start_idx = result["start"]
        end_idx = result["end"]
        count = len(text_elements)

        # Make sure we have valid indices and elements to work with
        if count and 0 <= start_idx < count and 0 <= end_idx < count:
            result["source_elements"] = list(text_elements[start_idx:end_idx + 1])

    @classmethod
    def _render_scale(cls, page, page_image) -> Tuple[float, float]:
        """
        Return (scale_x, scale_y) converting page coordinates to image pixels.

        NOTE(review): assumes ``page.width``/``page.height``, when present, are
        the page size in the same units as element/region coordinates - confirm
        against the Page class. If they are unavailable the scale falls back to
        the requested render DPI over 72 (PDF points per inch).
        """
        try:
            page_width = float(getattr(page, 'width', 0) or 0)
            page_height = float(getattr(page, 'height', 0) or 0)
        except (TypeError, ValueError):
            page_width = page_height = 0.0

        if page_width > 0 and page_height > 0:
            return page_image.width / page_width, page_image.height / page_height

        fallback = cls._RENDER_RESOLUTION / 72.0
        return fallback, fallback

    def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific PDF page.

        Args:
            page: natural_pdf.core.page.Page object
            question: Question to ask about the page
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information (forwarded to ``ask``)

        Returns:
            Dictionary with answer details (see ``ask``), plus "page_num" and,
            for a found answer with valid indices, "source_elements".
        """
        # Ensure we have text elements on the page
        elements = page.find_all('text')
        if not elements:
            # Apply OCR if no text is available
            logger.info(f"No text elements found on page {page.index}, applying OCR")
            page.apply_ocr()
            elements = page.find_all('text')

        # Extract word boxes from exactly the elements that carry text, so word
        # indices returned by the pipeline index straight into text_elements.
        text_elements = self._elements_with_text(elements)
        word_boxes = self._get_word_boxes_from_elements(text_elements, offset_x=0, offset_y=0)

        # Generate a high-resolution image of the page (300 DPI)
        page_image = page.to_image(resolution=self._RENDER_RESOLUTION, include_highlights=False)

        # Ask the question
        result = self._ask_via_temp_file(page_image, question, word_boxes, min_confidence, debug)

        # Add page reference to the result
        result["page_num"] = page.index

        # Add element references if possible
        self._attach_source_elements(result, text_elements)

        return result

    def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific region of a PDF page.

        Args:
            region: natural_pdf.elements.region.Region object
            question: Question to ask about the region
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information (forwarded to ``ask``)

        Returns:
            Dictionary with answer details (see ``ask``), plus "region",
            "page_num" and, for a found answer with valid indices,
            "source_elements".
        """
        # Get all text elements within the region
        elements = region.find_all('text')

        # Apply OCR if needed
        if not elements:
            logger.info(f"No text elements found in region, applying OCR")
            elements = region.apply_ocr()

        # Extract word boxes adjusted for the cropped region
        text_elements = self._elements_with_text(elements)
        word_boxes = self._get_word_boxes_from_elements(
            text_elements, offset_x=int(region.x0), offset_y=int(region.top)
        )

        # Get page image at high resolution - this returns a PIL Image directly
        page_image = region.page.to_image(resolution=self._RENDER_RESOLUTION, include_highlights=False)

        # Crop to region. The rendered image is in pixels while the region is in
        # page coordinates, so the bounds must be scaled before cropping;
        # cropping with unscaled coordinates selects the wrong area.
        scale_x, scale_y = self._render_scale(region.page, page_image)
        crop_box = (
            int(round(region.x0 * scale_x)),
            int(round(region.top * scale_y)),
            int(round(region.x1 * scale_x)),
            int(round(region.bottom * scale_y)),
        )
        region_image = page_image.crop(crop_box)

        # Ask the question
        result = self._ask_via_temp_file(region_image, question, word_boxes, min_confidence, debug)

        # Add region reference to the result
        result["region"] = region
        result["page_num"] = region.page.index

        # Add element references if possible
        self._attach_source_elements(result, text_elements)

        return result
@@ -0,0 +1,4 @@
1
+ """
2
+ Selector module for natural-pdf.
3
+ """
4
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func