natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -1,10 +1,12 @@
1
+ import json
1
2
  import logging
2
- from typing import List, Dict, Any, Optional, Union, Tuple
3
- import numpy as np
4
- from PIL import Image, ImageDraw
5
3
  import os
6
4
  import tempfile
7
- import json
5
+ from typing import Any, Dict, List, Optional, Tuple, Union
6
+
7
+ import numpy as np
8
+ from PIL import Image, ImageDraw
9
+
8
10
  from natural_pdf.elements.collections import ElementCollection
9
11
 
10
12
  logger = logging.getLogger("natural_pdf.qa.document_qa")
@@ -12,41 +14,42 @@ logger = logging.getLogger("natural_pdf.qa.document_qa")
12
14
  # Global QA engine instance
13
15
  _QA_ENGINE_INSTANCE = None
14
16
 
17
+
15
18
  def get_qa_engine(model_name: str = "impira/layoutlm-document-qa", **kwargs):
16
19
  """
17
20
  Get or create a global QA engine instance.
18
-
21
+
19
22
  Args:
20
23
  model_name: Name of the model to use (default: "impira/layoutlm-document-qa")
21
24
  **kwargs: Additional parameters to pass to the DocumentQA constructor
22
-
25
+
23
26
  Returns:
24
27
  DocumentQA instance
25
28
  """
26
29
  global _QA_ENGINE_INSTANCE
27
-
30
+
28
31
  if _QA_ENGINE_INSTANCE is None:
29
32
  try:
30
33
  _QA_ENGINE_INSTANCE = DocumentQA(model_name=model_name, **kwargs)
31
34
  except Exception as e:
32
35
  logger.error(f"Failed to initialize QA engine: {e}")
33
36
  raise
34
-
37
+
35
38
  return _QA_ENGINE_INSTANCE
36
39
 
37
40
 
38
41
  class DocumentQA:
39
42
  """
40
43
  Document Question Answering using LayoutLM.
41
-
44
+
42
45
  This class provides the ability to ask natural language questions about document content,
43
46
  leveraging the spatial layout information from PDF pages.
44
47
  """
45
-
48
+
46
49
  def __init__(self, model_name: str = "impira/layoutlm-document-qa", device: str = None):
47
50
  """
48
51
  Initialize the Document QA engine.
49
-
52
+
50
53
  Args:
51
54
  model_name: HuggingFace model name to use (default: "impira/layoutlm-document-qa")
52
55
  device: Device to run the model on ('cuda' or 'cpu'). If None, will use cuda if available.
@@ -54,20 +57,20 @@ class DocumentQA:
54
57
  try:
55
58
  import torch
56
59
  from transformers import pipeline
57
-
60
+
58
61
  # Determine device
59
62
  if device is None:
60
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
61
-
63
+ device = "cuda" if torch.cuda.is_available() else "cpu"
64
+
62
65
  logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
63
-
66
+
64
67
  # Initialize the pipeline
65
68
  self.pipe = pipeline("document-question-answering", model=model_name, device=device)
66
-
69
+
67
70
  self.model_name = model_name
68
71
  self.device = device
69
72
  self._is_initialized = True
70
-
73
+
71
74
  except ImportError as e:
72
75
  logger.error(f"Failed to import required packages: {e}")
73
76
  self._is_initialized = False
@@ -79,56 +82,55 @@ class DocumentQA:
79
82
  logger.error(f"Failed to initialize DocumentQA: {e}")
80
83
  self._is_initialized = False
81
84
  raise
82
-
85
+
83
86
  def is_available(self) -> bool:
84
87
  """Check if the QA engine is properly initialized."""
85
88
  return self._is_initialized
86
-
89
+
87
90
  def _get_word_boxes_from_elements(self, elements, offset_x=0, offset_y=0) -> List[List]:
88
91
  """
89
92
  Extract word boxes from text elements.
90
-
93
+
91
94
  Args:
92
95
  elements: List of TextElement objects
93
96
  offset_x: X-coordinate offset to subtract (for region cropping)
94
97
  offset_y: Y-coordinate offset to subtract (for region cropping)
95
-
98
+
96
99
  Returns:
97
100
  List of [text, [x0, top, x1, bottom]] entries
98
101
  """
99
102
  word_boxes = []
100
-
103
+
101
104
  for element in elements:
102
- if hasattr(element, 'text') and element.text.strip():
105
+ if hasattr(element, "text") and element.text.strip():
103
106
  # Apply offset for cropped regions
104
107
  x0 = int(element.x0) - offset_x
105
108
  top = int(element.top) - offset_y
106
109
  x1 = int(element.x1) - offset_x
107
110
  bottom = int(element.bottom) - offset_y
108
-
111
+
109
112
  # Ensure coordinates are valid (non-negative)
110
113
  x0 = max(0, x0)
111
114
  top = max(0, top)
112
115
  x1 = max(0, x1)
113
116
  bottom = max(0, bottom)
114
-
115
- word_boxes.append([
116
- element.text,
117
- [x0, top, x1, bottom]
118
- ])
119
-
117
+
118
+ word_boxes.append([element.text, [x0, top, x1, bottom]])
119
+
120
120
  return word_boxes
121
-
122
- def ask(self,
123
- image: Union[str, Image.Image, np.ndarray],
124
- question: str,
125
- word_boxes: List = None,
126
- min_confidence: float = 0.1,
127
- debug: bool = False,
128
- debug_output_dir: str = "output") -> Dict[str, Any]:
121
+
122
+ def ask(
123
+ self,
124
+ image: Union[str, Image.Image, np.ndarray],
125
+ question: str,
126
+ word_boxes: List = None,
127
+ min_confidence: float = 0.1,
128
+ debug: bool = False,
129
+ debug_output_dir: str = "output",
130
+ ) -> Dict[str, Any]:
129
131
  """
130
132
  Ask a question about document content.
131
-
133
+
132
134
  Args:
133
135
  image: PIL Image, numpy array, or path to image file
134
136
  question: Question to ask about the document
@@ -136,7 +138,7 @@ class DocumentQA:
136
138
  min_confidence: Minimum confidence threshold for answers
137
139
  debug: Whether to save debug information
138
140
  debug_output_dir: Directory to save debug files
139
-
141
+
140
142
  Returns:
141
143
  Dictionary with answer details: {
142
144
  "answer": extracted text,
@@ -147,7 +149,7 @@ class DocumentQA:
147
149
  """
148
150
  if not self._is_initialized:
149
151
  raise RuntimeError("DocumentQA is not properly initialized")
150
-
152
+
151
153
  # Process the image
152
154
  if isinstance(image, str):
153
155
  # It's a file path
@@ -162,65 +164,68 @@ class DocumentQA:
162
164
  image_obj = image
163
165
  else:
164
166
  raise TypeError("Image must be a PIL Image, numpy array, or file path")
165
-
167
+
166
168
  # Prepare the query
167
- query = {
168
- "image": image_obj,
169
- "question": question
170
- }
171
-
169
+ query = {"image": image_obj, "question": question}
170
+
172
171
  # Add word boxes if provided
173
172
  if word_boxes:
174
173
  query["word_boxes"] = word_boxes
175
-
174
+
176
175
  # Save debug information if requested
177
- if debug:
176
+ if debug:
178
177
  # Create debug directory
179
178
  os.makedirs(debug_output_dir, exist_ok=True)
180
-
179
+
181
180
  # Save the image
182
181
  image_debug_path = os.path.join(debug_output_dir, "debug_qa_image.png")
183
182
  image_obj.save(image_debug_path)
184
-
183
+
185
184
  # Save word boxes
186
185
  if word_boxes:
187
186
  word_boxes_path = os.path.join(debug_output_dir, "debug_qa_word_boxes.json")
188
- with open(word_boxes_path, 'w') as f:
187
+ with open(word_boxes_path, "w") as f:
189
188
  json.dump(word_boxes, f, indent=2)
190
-
189
+
191
190
  # Generate a visualization of the boxes on the image
192
191
  vis_image = image_obj.copy()
193
192
  draw = ImageDraw.Draw(vis_image)
194
-
193
+
195
194
  for i, (text, box) in enumerate(word_boxes):
196
195
  x0, y0, x1, y1 = box
197
196
  draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
198
197
  # Add text index for reference
199
198
  draw.text((x0, y0), str(i), fill=(255, 0, 0))
200
-
199
+
201
200
  vis_path = os.path.join(debug_output_dir, "debug_qa_boxes_vis.png")
202
201
  vis_image.save(vis_path)
203
-
202
+
204
203
  logger.info(f"Saved debug files to {debug_output_dir}")
205
204
  logger.info(f"Question: {question}")
206
205
  logger.info(f"Image: {image_debug_path}")
207
206
  logger.info(f"Word boxes: {word_boxes_path}")
208
207
  logger.info(f"Visualization: {vis_path}")
209
-
208
+
210
209
  # Run the query through the pipeline
211
210
  logger.info(f"Running document QA pipeline with question: {question}")
212
211
  result = self.pipe(query)[0]
213
212
  logger.info(f"Raw result: {result}")
214
-
213
+
215
214
  # Save the result if debugging
216
215
  if debug:
217
216
  result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
218
- with open(result_path, 'w') as f:
217
+ with open(result_path, "w") as f:
219
218
  # Convert any non-serializable data
220
- serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
221
- for k, v in result.items()}
219
+ serializable_result = {
220
+ k: (
221
+ str(v)
222
+ if not isinstance(v, (str, int, float, bool, list, dict, type(None)))
223
+ else v
224
+ )
225
+ for k, v in result.items()
226
+ }
222
227
  json.dump(serializable_result, f, indent=2)
223
-
228
+
224
229
  # Check confidence against threshold
225
230
  if result["score"] < min_confidence:
226
231
  logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
@@ -229,48 +234,49 @@ class DocumentQA:
229
234
  "confidence": result["score"],
230
235
  "start": result.get("start", -1),
231
236
  "end": result.get("end", -1),
232
- "found": False
237
+ "found": False,
233
238
  }
234
-
239
+
235
240
  return {
236
241
  "answer": result["answer"],
237
242
  "confidence": result["score"],
238
243
  "start": result.get("start", 0),
239
244
  "end": result.get("end", 0),
240
- "found": True
245
+ "found": True,
241
246
  }
242
-
243
-
244
- def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
247
+
248
+ def ask_pdf_page(
249
+ self, page, question: str, min_confidence: float = 0.1, debug: bool = False
250
+ ) -> Dict[str, Any]:
245
251
  """
246
252
  Ask a question about a specific PDF page.
247
-
253
+
248
254
  Args:
249
255
  page: natural_pdf.core.page.Page object
250
256
  question: Question to ask about the page
251
257
  min_confidence: Minimum confidence threshold for answers
252
-
258
+
253
259
  Returns:
254
260
  Dictionary with answer details
255
261
  """
256
262
  # Ensure we have text elements on the page
257
- if not page.find_all('text'):
263
+ if not page.find_all("text"):
258
264
  # Apply OCR if no text is available
259
265
  logger.info(f"No text elements found on page {page.index}, applying OCR")
260
266
  page.apply_ocr()
261
-
267
+
262
268
  # Extract word boxes
263
- elements = page.find_all('text')
269
+ elements = page.find_all("text")
264
270
  word_boxes = self._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
265
-
271
+
266
272
  # Generate a high-resolution image of the page
267
- with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
273
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
268
274
  temp_path = temp_file.name
269
-
275
+
270
276
  # Save a high resolution image (300 DPI)
271
277
  page_image = page.to_image(resolution=300, include_highlights=False)
272
278
  page_image.save(temp_path)
273
-
279
+
274
280
  try:
275
281
  # Ask the question
276
282
  result = self.ask(
@@ -278,79 +284,81 @@ class DocumentQA:
278
284
  question=question,
279
285
  word_boxes=word_boxes,
280
286
  min_confidence=min_confidence,
281
- debug=debug
287
+ debug=debug,
282
288
  )
283
-
289
+
284
290
  # Add page reference to the result
285
291
  result["page_num"] = page.index
286
-
292
+
287
293
  # Add element references if possible
288
294
  if result.get("found", False) and "start" in result and "end" in result:
289
295
  start_idx = result["start"]
290
296
  end_idx = result["end"]
291
-
297
+
292
298
  # Make sure we have valid indices and elements to work with
293
299
  if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
294
300
  # Find the actual source elements in the original list
295
301
  # Since word_boxes may have filtered out some elements, we need to map indices
296
-
302
+
297
303
  # Get the text from result word boxes
298
- matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
299
-
304
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
305
+
300
306
  # Find corresponding elements in the full element list
301
307
  source_elements = []
302
308
  for element in elements:
303
- if hasattr(element, 'text') and element.text in matched_texts:
309
+ if hasattr(element, "text") and element.text in matched_texts:
304
310
  source_elements.append(element)
305
311
  # Remove from matched texts to avoid duplicates
306
312
  if element.text in matched_texts:
307
313
  matched_texts.remove(element.text)
308
314
 
309
315
  result["source_elements"] = ElementCollection(source_elements)
310
-
316
+
311
317
  return result
312
-
318
+
313
319
  finally:
314
320
  # Clean up temporary file
315
321
  if os.path.exists(temp_path):
316
322
  os.remove(temp_path)
317
-
318
- def ask_pdf_region(self, region, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
323
+
324
+ def ask_pdf_region(
325
+ self, region, question: str, min_confidence: float = 0.1, debug: bool = False
326
+ ) -> Dict[str, Any]:
319
327
  """
320
328
  Ask a question about a specific region of a PDF page.
321
-
329
+
322
330
  Args:
323
331
  region: natural_pdf.elements.region.Region object
324
332
  question: Question to ask about the region
325
333
  min_confidence: Minimum confidence threshold for answers
326
-
334
+
327
335
  Returns:
328
336
  Dictionary with answer details
329
337
  """
330
338
  # Get all text elements within the region
331
- elements = region.find_all('text')
332
-
339
+ elements = region.find_all("text")
340
+
333
341
  # Apply OCR if needed
334
342
  if not elements:
335
343
  logger.info(f"No text elements found in region, applying OCR")
336
344
  elements = region.apply_ocr()
337
-
345
+
338
346
  # Extract word boxes adjusted for the cropped region
339
347
  x0, top = int(region.x0), int(region.top)
340
348
  word_boxes = self._get_word_boxes_from_elements(elements, offset_x=x0, offset_y=top)
341
-
349
+
342
350
  # Generate a cropped image of the region
343
- with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_file:
351
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
344
352
  temp_path = temp_file.name
345
-
353
+
346
354
  # Get page image at high resolution - this returns a PIL Image directly
347
355
  page_image = region.page.to_image(resolution=300, include_highlights=False)
348
-
356
+
349
357
  # Crop to region
350
358
  x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
351
359
  region_image = page_image.crop((x0, top, x1, bottom))
352
360
  region_image.save(temp_path)
353
-
361
+
354
362
  try:
355
363
  # Ask the question
356
364
  result = self.ask(
@@ -358,40 +366,40 @@ class DocumentQA:
358
366
  question=question,
359
367
  word_boxes=word_boxes,
360
368
  min_confidence=min_confidence,
361
- debug=debug
369
+ debug=debug,
362
370
  )
363
-
371
+
364
372
  # Add region reference to the result
365
373
  result["region"] = region
366
374
  result["page_num"] = region.page.index
367
-
375
+
368
376
  # Add element references if possible
369
377
  if result.get("found", False) and "start" in result and "end" in result:
370
378
  start_idx = result["start"]
371
379
  end_idx = result["end"]
372
-
380
+
373
381
  # Make sure we have valid indices and elements to work with
374
382
  if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
375
383
  # Find the actual source elements in the original list
376
384
  # Since word_boxes may have filtered out some elements, we need to map indices
377
-
385
+
378
386
  # Get the text from result word boxes
379
- matched_texts = [wb[0] for wb in word_boxes[start_idx:end_idx+1]]
380
-
387
+ matched_texts = [wb[0] for wb in word_boxes[start_idx : end_idx + 1]]
388
+
381
389
  # Find corresponding elements in the full element list
382
390
  source_elements = []
383
391
  for element in elements:
384
- if hasattr(element, 'text') and element.text in matched_texts:
392
+ if hasattr(element, "text") and element.text in matched_texts:
385
393
  source_elements.append(element)
386
394
  # Remove from matched texts to avoid duplicates
387
395
  if element.text in matched_texts:
388
396
  matched_texts.remove(element.text)
389
-
397
+
390
398
  result["source_elements"] = ElementCollection(source_elements)
391
-
399
+
392
400
  return result
393
-
401
+
394
402
  finally:
395
403
  # Clean up temporary file
396
404
  if os.path.exists(temp_path):
397
- os.remove(temp_path)
405
+ os.remove(temp_path)
@@ -7,32 +7,29 @@ from typing import Optional
7
7
  # Import the concrete implementation
8
8
  from .haystack_search_service import HaystackSearchService
9
9
 
10
- # --- Protocol Import ---
11
- # Import the protocol for type hinting
12
- from .search_service_protocol import (
13
- SearchServiceProtocol,
14
- IndexConfigurationError,
15
- Indexable
10
+ # --- Utils Import ---
11
+ from .haystack_utils import ( # Re-export flag and helper
12
+ HAS_HAYSTACK_EXTRAS,
13
+ check_haystack_availability,
16
14
  )
17
15
 
18
16
  # --- Option Imports (for convenience) ---
19
17
  # Make options easily available via `from natural_pdf.search import ...`
20
- from .search_options import (
21
- BaseSearchOptions,
22
- SearchOptions, # Alias for TextSearchOptions for simplicity?
23
- TextSearchOptions,
24
- MultiModalSearchOptions
25
- )
26
- # --- Utils Import ---
27
- from .haystack_utils import HAS_HAYSTACK_EXTRAS, check_haystack_availability # Re-export flag and helper
18
+ from .search_options import SearchOptions # Alias for TextSearchOptions for simplicity?
19
+ from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
20
+
21
+ # --- Protocol Import ---
22
+ # Import the protocol for type hinting
23
+ from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
28
24
 
29
25
  logger = logging.getLogger(__name__)
30
26
 
31
27
  # --- Factory Function ---
32
28
 
29
+
33
30
  def get_search_service(
34
- collection_name: str, # Add collection_name as a required argument
35
- persist: bool = False, # Default to In-Memory
31
+ collection_name: str, # Add collection_name as a required argument
32
+ persist: bool = False, # Default to In-Memory
36
33
  # Configuration for the service itself
37
34
  default_persist_path: Optional[str] = None,
38
35
  default_embedding_model: Optional[str] = None,
@@ -56,39 +53,48 @@ def get_search_service(
56
53
  Returns:
57
54
  An instance conforming to the SearchServiceProtocol for the specified collection.
58
55
  """
59
- logger.debug(f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})...")
60
-
56
+ logger.debug(
57
+ f"Calling get_search_service factory for collection '{collection_name}' (persist={persist})..."
58
+ )
59
+
61
60
  # For now, we only have one implementation
62
61
  # Collect arguments relevant to HaystackSearchService.__init__
63
62
  service_args = {}
64
- service_args['collection_name'] = collection_name # Pass collection_name
65
- service_args['persist'] = persist # Pass persist flag to service constructor
63
+ service_args["collection_name"] = collection_name # Pass collection_name
64
+ service_args["persist"] = persist # Pass persist flag to service constructor
66
65
  if default_persist_path is not None:
67
- service_args['default_persist_path'] = default_persist_path
66
+ service_args["default_persist_path"] = default_persist_path
68
67
  if default_embedding_model is not None:
69
- service_args['default_embedding_model'] = default_embedding_model
70
-
68
+ service_args["default_embedding_model"] = default_embedding_model
69
+
71
70
  # TODO: Implement caching/registry if needed to return the same instance
72
71
  # for the same configuration instead of always creating a new one.
73
72
  # cache_key = tuple(sorted(service_args.items()))
74
73
  # if cache_key in _service_instance_cache:
75
74
  # return _service_instance_cache[cache_key]
76
-
75
+
77
76
  try:
78
77
  service_instance = HaystackSearchService(**service_args)
79
78
  # _service_instance_cache[cache_key] = service_instance
80
- logger.info(f"Created new HaystackSearchService instance for collection '{collection_name}'.")
79
+ logger.info(
80
+ f"Created new HaystackSearchService instance for collection '{collection_name}'."
81
+ )
81
82
  return service_instance
82
83
  except ImportError as e:
83
- logger.error(f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True)
84
- raise ImportError("Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]") from e
84
+ logger.error(
85
+ f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
86
+ )
87
+ raise ImportError(
88
+ "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
89
+ ) from e
85
90
  except Exception as e:
86
- logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
87
- raise RuntimeError("Could not create Search Service instance.") from e
91
+ logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
92
+ raise RuntimeError("Could not create Search Service instance.") from e
93
+
88
94
 
89
95
  # --- Optional: Define a default instance for extreme ease of use? ---
90
96
  # try:
91
97
  # default_search_service = get_search_service()
92
98
  # except Exception:
93
- # default_search_service = None
94
- # logger.warning("Could not create default search service instance on import.")
99
+ # default_search_service = None
100
+ # logger.warning("Could not create default search service instance on import.")