natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,405 @@
1
+ import logging
2
+ from typing import List, Dict, Any, Optional, Union, Tuple
3
+ import numpy as np
4
+ from PIL import Image
5
+ import os
6
+ import tempfile
7
+ import json
8
+
9
+ logger = logging.getLogger("natural_pdf.qa.document_qa")
10
+
11
+ # Global QA engine instance
12
+ _QA_ENGINE_INSTANCE = None
13
+
14
def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
    """
    Return the process-wide DocumentQA engine, building it on first use.

    The engine is cached in the module-level ``_QA_ENGINE_INSTANCE``. Once an
    instance exists it is returned as-is, so ``model_name`` and ``kwargs`` only
    take effect on the call that actually constructs the engine.

    Args:
        model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
        **kwargs: Additional parameters forwarded to the DocumentQA constructor

    Returns:
        DocumentQA instance

    Raises:
        Exception: Whatever the DocumentQA constructor raises; the failure is
            logged and re-raised, and nothing is cached.
    """
    global _QA_ENGINE_INSTANCE

    # Fast path: an engine was already created by an earlier call.
    if _QA_ENGINE_INSTANCE is not None:
        return _QA_ENGINE_INSTANCE

    try:
        engine = DocumentQA(model_name=model_name, **kwargs)
    except Exception as e:
        logger.error(f"Failed to initialize QA engine: {e}")
        raise

    _QA_ENGINE_INSTANCE = engine
    return engine
35
+
36
+
37
class DocumentQA:
    """
    Document Question Answering using LayoutLM.

    This class provides the ability to ask natural language questions about document content,
    leveraging the spatial layout information from PDF pages.
    """

    def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: Optional[str] = None):
        """
        Initialize the Document QA engine.

        Args:
            model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
            device: Device to run the model on ('cuda' or 'cpu'). If None, will use cuda if available.

        Raises:
            ImportError: If torch or transformers cannot be imported.
            Exception: Any error raised while building the pipeline is logged
                and re-raised unchanged.
        """
        # Set the flag first so is_available() never hits a missing attribute,
        # even on a partially constructed instance.
        self._is_initialized = False

        try:
            import torch
            from transformers import pipeline
        except ImportError as e:
            logger.error(f"Failed to import required packages: {e}")
            # Chain the original error so the missing module stays visible.
            raise ImportError(
                "DocumentQA requires transformers and torch to be installed. "
                "Install with pip install transformers torch"
            ) from e

        try:
            # Determine device
            if device is None:
                device = 'cuda' if torch.cuda.is_available() else 'cpu'

            logger.info(f"Initializing DocumentQA with model {model_name} on {device}")

            # Initialize the pipeline
            self.pipe = pipeline("document-question-answering", model=model_name, device=device)
        except Exception as e:
            logger.error(f"Failed to initialize DocumentQA: {e}")
            raise

        self.model_name = model_name
        self.device = device
        self._is_initialized = True

    def is_available(self) -> bool:
        """Check if the QA engine is properly initialized."""
        return self._is_initialized

    @staticmethod
    def _text_elements(elements) -> List[Any]:
        """Return, in order, the elements that carry non-blank string text.

        This is the single filter used both to build word boxes and to map
        answer indices back to elements, so the two always stay aligned.
        """
        kept = []
        for element in (elements or []):
            text = getattr(element, 'text', None)
            # Elements without text, with None text, or with whitespace-only
            # text contribute no word box.
            if isinstance(text, str) and text.strip():
                kept.append(element)
        return kept

    def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
        """
        Extract word boxes from text elements.

        Args:
            elements: Iterable of objects exposing text, x0, top, x1 and bottom
            offset_x: X-coordinate offset to subtract (for region cropping)
            offset_y: Y-coordinate offset to subtract (for region cropping)

        Returns:
            List of [text, [x0, top, x1, bottom]] entries, one per element with
            non-blank text, in the same order as the input.
        """
        word_boxes = []

        for element in self._text_elements(elements):
            # Truncate to int, apply the crop offset, then clamp at zero so a
            # box that starts outside the crop never gets a negative coordinate.
            x0 = max(0, int(element.x0) - offset_x)
            top = max(0, int(element.top) - offset_y)
            x1 = max(0, int(element.x1) - offset_x)
            bottom = max(0, int(element.bottom) - offset_y)

            word_boxes.append([element.text, [x0, top, x1, bottom]])

        return word_boxes

    def _source_elements_for_span(self, elements, start, end) -> Optional[List[Any]]:
        """Map an inclusive [start, end] word-box index span back to elements.

        Word boxes are built from ``_text_elements(elements)`` in order, so
        word-box index i corresponds to the i-th kept element. Slicing by index
        avoids picking the wrong element when the same text occurs repeatedly.

        Returns:
            The matching elements, or None when either index is not an integer
            or falls outside the kept elements.
        """
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            return None

        kept = self._text_elements(elements)
        if not (0 <= start < len(kept) and 0 <= end < len(kept)):
            return None
        return kept[start:end + 1]

    def _attach_source_elements(self, result: Dict[str, Any], elements) -> None:
        """Add a "source_elements" entry to a successful result, when resolvable."""
        if not result.get("found", False) or "start" not in result or "end" not in result:
            return

        matched = self._source_elements_for_span(elements, result["start"], result["end"])
        if matched is not None:
            result["source_elements"] = matched

    def _save_debug_inputs(self, image_obj, question: str, word_boxes, debug_output_dir: str) -> None:
        """Write the query image, word boxes and a box overlay to debug_output_dir."""
        # Create debug directory
        os.makedirs(debug_output_dir, exist_ok=True)

        # Save the image
        image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
        image_obj.save(image_debug_path)

        # These stay None when no word boxes were supplied, so the log lines
        # below never reference an unbound name.
        word_boxes_path = None
        vis_path = None

        if word_boxes:
            # Local import: the module only imports PIL.Image at top level.
            from PIL import ImageDraw

            word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
            with open(word_boxes_path, 'w', encoding='utf-8') as f:
                json.dump(word_boxes, f, indent=2)

            # Draw on an RGB copy so the red outline works for any source mode.
            vis_image = image_obj.convert("RGB")
            draw = ImageDraw.Draw(vis_image)

            for i, (_text, box) in enumerate(word_boxes):
                x0, y0, x1, y1 = box
                draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
                # Add text index for reference
                draw.text((x0, y0), str(i), fill=(255, 0, 0))

            vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
            vis_image.save(vis_path)

        logger.info(f"Saved debug files to {debug_output_dir}")
        logger.info(f"Question: {question}")
        logger.info(f"Image: {image_debug_path}")
        logger.info(f"Word boxes: {word_boxes_path}")
        logger.info(f"Visualization: {vis_path}")

    def ask(self,
            image: "Union[str, Image.Image, np.ndarray]",
            question: str,
            word_boxes: Optional[List] = None,
            min_confidence: float = 0.1,
            debug: bool = False,
            debug_output_dir: str = "output") -> Dict[str, Any]:
        """
        Ask a question about document content.

        Args:
            image: PIL Image, numpy array, or path to image file
            question: Question to ask about the document
            word_boxes: Optional pre-extracted word boxes [[text, [x0, y0, x1, y1]], ...]
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information
            debug_output_dir: Directory to save debug files

        Returns:
            Dictionary with answer details: {
                "answer": extracted text ("" when below threshold or on error),
                "confidence": confidence score,
                "start": start word index,
                "end": end word index,
                "found": True only when the score reached min_confidence
            }
            If the pipeline raises, the dictionary instead carries "answer",
            "confidence" (0.0), "error" (message) and "found" (False).

        Raises:
            RuntimeError: If the engine is not initialized.
            FileNotFoundError: If image is a path that does not exist.
            TypeError: If image is not a path, numpy array or PIL Image.
        """
        if not self._is_initialized:
            raise RuntimeError("DocumentQA is not properly initialized")

        # Process the image
        if isinstance(image, (str, os.PathLike)):
            if not os.path.exists(image):
                raise FileNotFoundError(f"Image file not found: {image}")
            # Image.open is lazy; copy the pixels and close the file right away
            # so callers can delete the file as soon as this method returns.
            with Image.open(image) as opened:
                image_obj = opened.copy()
        elif isinstance(image, np.ndarray):
            # Convert numpy array to PIL Image
            image_obj = Image.fromarray(image)
        elif isinstance(image, Image.Image):
            # Already a PIL Image
            image_obj = image
        else:
            raise TypeError("Image must be a PIL Image, numpy array, or file path")

        # Prepare the query
        query = {
            "image": image_obj,
            "question": question
        }

        # Add word boxes if provided
        if word_boxes:
            query["word_boxes"] = word_boxes

        # Save debug information if requested
        if debug:
            self._save_debug_inputs(image_obj, question, word_boxes, debug_output_dir)

        # Run the query through the pipeline. Any failure here is reported in
        # the returned dictionary rather than raised (deliberate best effort).
        try:
            logger.info(f"Running document QA pipeline with question: {question}")
            raw_result = self.pipe(query)
            # The pipeline may hand back a single dict or a list of candidates.
            result = raw_result if isinstance(raw_result, dict) else raw_result[0]
            logger.info(f"Raw result: {result}")

            # Save the result if debugging
            if debug:
                result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
                json_safe = (str, int, float, bool, list, dict, type(None))
                # Convert any non-serializable data
                serializable_result = {
                    k: v if isinstance(v, json_safe) else str(v)
                    for k, v in result.items()
                }
                with open(result_path, 'w', encoding='utf-8') as f:
                    json.dump(serializable_result, f, indent=2)

            score = result["score"]

            # Check confidence against threshold
            if score < min_confidence:
                logger.info(f"Answer confidence {score:.4f} below threshold {min_confidence}")
                return {
                    "answer": "",
                    "confidence": score,
                    "start": result.get("start", -1),
                    "end": result.get("end", -1),
                    "found": False
                }

            return {
                "answer": result["answer"],
                "confidence": score,
                "start": result.get("start", 0),
                "end": result.get("end", 0),
                "found": True
            }

        except Exception as e:
            logger.error(f"Error in document QA: {e}")
            return {
                "answer": "",
                "confidence": 0.0,
                "error": str(e),
                "found": False
            }

    def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific PDF page.

        Args:
            page: natural_pdf.core.page.Page object
            question: Question to ask about the page
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information (forwarded to ask)

        Returns:
            Dictionary with answer details from ask, plus "page_num" and, when
            an answer was found and its span is valid, "source_elements".
        """
        # Ensure we have text elements on the page
        if not page.find_all('text'):
            # Apply OCR if no text is available
            logger.info(f"No text elements found on page {page.index}, applying OCR")
            page.apply_ocr()

        # Extract word boxes
        elements = page.find_all('text')
        # NOTE(review): boxes are passed in the elements' own coordinates while
        # the image below is rendered with resolution=300 — confirm the
        # pipeline's expected word_boxes coordinate convention.
        word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)

        temp_path = None
        try:
            # Render inside the try so the temp file is removed even if
            # rendering or saving fails.
            page_image = page.to_image(resolution=300, include_highlights=False)

            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name
            page_image.save(temp_path)

            # Ask the question
            result = self.ask(
                image=temp_path,
                question=question,
                word_boxes=word_boxes,
                min_confidence=min_confidence,
                debug=debug
            )

            # Add page reference to the result
            result["page_num"] = page.index

            # Add element references if possible
            self._attach_source_elements(result, elements)

            return result

        finally:
            # Clean up temporary file
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

    def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
        """
        Ask a question about a specific region of a PDF page.

        Args:
            region: natural_pdf.elements.region.Region object
            question: Question to ask about the region
            min_confidence: Minimum confidence threshold for answers
            debug: Whether to save debug information (forwarded to ask)

        Returns:
            Dictionary with answer details from ask, plus "region", "page_num"
            and, when an answer was found and its span is valid,
            "source_elements".
        """
        # Get all text elements within the region
        elements = region.find_all('text')

        # Apply OCR if needed
        if not elements:
            logger.info("No text elements found in region, applying OCR")
            elements = region.apply_ocr()

        # Extract word boxes adjusted for the cropped region
        x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
        word_boxes = self._get_word_boxes_from_elements(elements, offset_x=x0, offset_y=top)

        temp_path = None
        try:
            # Render inside the try so the temp file is removed even if
            # rendering, cropping or saving fails.
            page_image = region.page.to_image(resolution=300, include_highlights=False)

            # NOTE(review): the crop box uses the region's raw coordinates on an
            # image rendered with resolution=300 — confirm both are in the same
            # coordinate space.
            region_image = page_image.crop((x0, top, x1, bottom))

            with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
                temp_path = temp_file.name
            region_image.save(temp_path)

            # Ask the question
            result = self.ask(
                image=temp_path,
                question=question,
                word_boxes=word_boxes,
                min_confidence=min_confidence,
                debug=debug
            )

            # Add region reference to the result
            result["region"] = region
            result["page_num"] = region.page.index

            # Add element references if possible
            self._attach_source_elements(result, elements)

            return result

        finally:
            # Clean up temporary file
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)
@@ -0,0 +1,4 @@
1
+ """
2
+ Selector module for natural-pdf.
3
+ """
4
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func