natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,458 @@
1
+ import logging
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
3
+
4
+ from pdfplumber.utils.geometry import objects_to_bbox # For calculating combined bbox
5
+
6
+ # For runtime image manipulation
7
+ from PIL import Image as PIL_Image_Runtime
8
+
9
+ if TYPE_CHECKING:
10
+ from PIL.Image import Image as PIL_Image # For type hints
11
+ from natural_pdf.elements.base import Element as PhysicalElement
12
+ from natural_pdf.elements.region import Region as PhysicalRegion
13
+ from natural_pdf.elements.collections import ElementCollection
14
+ from natural_pdf.core.page import Page as PhysicalPage
15
+ from .flow import Flow
16
+ from .element import FlowElement
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class FlowRegion:
    """
    Represents a selected area within a Flow, potentially composed of multiple
    physical Region objects (constituent_regions) that might span across
    different original pages or disjoint physical regions defined in the Flow.

    A FlowRegion is the result of a directional operation (e.g., .below(), .above())
    on a FlowElement.
    """

    def __init__(
        self,
        flow: "Flow",
        constituent_regions: List["PhysicalRegion"],
        source_flow_element: "FlowElement",
        boundary_element_found: Optional["PhysicalElement"] = None,
    ):
        """
        Initializes a FlowRegion.

        Args:
            flow: The Flow instance this region belongs to.
            constituent_regions: A list of physical natural_pdf.elements.region.Region
                objects that make up this FlowRegion.
            source_flow_element: The FlowElement that created this FlowRegion.
            boundary_element_found: The physical element that stopped an 'until' search,
                if applicable.
        """
        self.flow: "Flow" = flow
        self.constituent_regions: List["PhysicalRegion"] = constituent_regions
        self.source_flow_element: "FlowElement" = source_flow_element
        self.boundary_element_found: Optional["PhysicalElement"] = boundary_element_found

        # Lazily-populated caches for expensive derived values.
        self._cached_text: Optional[str] = None
        self._cached_elements: Optional["ElementCollection"] = None  # Stringized
        self._cached_bbox: Optional[Tuple[float, float, float, float]] = None

    @property
    def bbox(self) -> Optional[Tuple[float, float, float, float]]:
        """
        Calculates a conceptual bounding box that encompasses all constituent physical regions.

        This is the union of the bounding boxes of the constituent regions in their
        original physical coordinates. Returns None if there are no constituent regions.
        """
        if self._cached_bbox is not None:
            return self._cached_bbox
        if not self.constituent_regions:
            return None

        # objects_to_bbox (pdfplumber) merges bboxes of objects exposing
        # .x0/.top/.x1/.bottom attributes, which PhysicalRegion satisfies.
        self._cached_bbox = objects_to_bbox(self.constituent_regions)
        return self._cached_bbox

    @property
    def x0(self) -> Optional[float]:
        """Leftmost coordinate of the merged bounding box, or None if empty."""
        return self.bbox[0] if self.bbox else None

    @property
    def top(self) -> Optional[float]:
        """Topmost coordinate of the merged bounding box, or None if empty."""
        return self.bbox[1] if self.bbox else None

    @property
    def x1(self) -> Optional[float]:
        """Rightmost coordinate of the merged bounding box, or None if empty."""
        return self.bbox[2] if self.bbox else None

    @property
    def bottom(self) -> Optional[float]:
        """Bottommost coordinate of the merged bounding box, or None if empty."""
        return self.bbox[3] if self.bbox else None

    @property
    def width(self) -> Optional[float]:
        """Width of the merged bounding box, or None if empty."""
        return self.x1 - self.x0 if self.bbox else None

    @property
    def height(self) -> Optional[float]:
        """Height of the merged bounding box, or None if empty."""
        return self.bottom - self.top if self.bbox else None

    def extract_text(self, apply_exclusions: bool = True, **kwargs) -> str:
        """
        Extracts and concatenates text from all constituent physical regions.
        The order of concatenation respects the flow's arrangement.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                constituent physical region during text extraction.
            **kwargs: Additional arguments passed to the underlying extract_text method
                of each constituent region.

        Returns:
            The combined text content as a string.
        """
        # Only use/populate the cache for the default call shape: extra kwargs
        # may change what the underlying extract_text returns, so serving a
        # cached result for them (or caching their result) would be incorrect.
        use_cache = apply_exclusions and not kwargs
        if use_cache and self._cached_text is not None:
            return self._cached_text

        if not self.constituent_regions:
            return ""

        # Order depends on how constituent_regions were added; the
        # FlowElement._flow_direction method is responsible for ordering them.
        texts: List[str] = [
            region.extract_text(apply_exclusions=apply_exclusions, **kwargs)
            for region in self.constituent_regions
        ]

        # Join based on flow arrangement (newline for vertical, space for horizontal).
        # This is a simplification; true layout-aware joining would be more complex.
        joiner = "\n" if self.flow.arrangement == "vertical" else " "  # TODO: Make this smarter, consider segment_gap
        extracted = joiner.join(t for t in texts if t)

        if use_cache:
            self._cached_text = extracted
        return extracted

    def elements(self, apply_exclusions: bool = True) -> "ElementCollection":  # Stringized return
        """
        Collects all unique physical elements from all constituent physical regions.

        Args:
            apply_exclusions: Whether to respect PDF exclusion zones within each
                constituent physical region when gathering elements.

        Returns:
            An ElementCollection containing all unique elements, sorted in a basic
            reading order (page index, then top, then x0) when possible.
        """
        from natural_pdf.elements.collections import ElementCollection as RuntimeElementCollection  # Local import

        if self._cached_elements is not None and apply_exclusions:  # Simple cache check
            return self._cached_elements

        if not self.constituent_regions:
            return RuntimeElementCollection([])

        all_physical_elements: List["PhysicalElement"] = []
        # Dedupe elements that appear in more than one (overlapping) region definition.
        seen_elements = set()

        for region in self.constituent_regions:
            # Region.get_elements() returns a list, not ElementCollection
            elements_in_region: List["PhysicalElement"] = region.get_elements(
                apply_exclusions=apply_exclusions
            )
            for elem in elements_in_region:
                if elem not in seen_elements:
                    all_physical_elements.append(elem)
                    seen_elements.add(elem)

        # Basic reading order sort based on original page and coordinates.
        def get_sort_key(phys_elem: "PhysicalElement"):
            page_idx = -1
            if hasattr(phys_elem, "page") and hasattr(phys_elem.page, "index"):
                page_idx = phys_elem.page.index
            return (page_idx, phys_elem.top, phys_elem.x0)

        try:
            sorted_physical_elements = sorted(all_physical_elements, key=get_sort_key)
        except AttributeError:
            logger.warning(
                "Could not sort elements in FlowRegion by reading order; some elements might be missing page, top or x0 attributes."
            )
            sorted_physical_elements = all_physical_elements

        result_collection = RuntimeElementCollection(sorted_physical_elements)
        if apply_exclusions:
            self._cached_elements = result_collection
        return result_collection

    def find(
        self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
    ) -> Optional["PhysicalElement"]:  # Stringized
        """
        Finds the first physical element within this FlowRegion that matches the selector or text.
        """
        # Uses self.elements() which respects exclusions if apply_exclusions=True by default
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find(selector=selector, text=text, **kwargs)  # ElementCollection.find

    def find_all(
        self, selector: Optional[str] = None, *, text: Optional[str] = None, **kwargs
    ) -> "ElementCollection":  # Stringized
        """
        Finds all physical elements within this FlowRegion that match the selector or text.
        """
        all_elems = self.elements(apply_exclusions=kwargs.get("apply_exclusions", True))
        return all_elems.find_all(selector=selector, text=text, **kwargs)  # ElementCollection.find_all

    def highlight(
        self, label: Optional[str] = None, color: Optional[Union[Tuple, str]] = None, **kwargs
    ) -> "FlowRegion":  # Stringized
        """
        Highlights all constituent physical regions on their respective pages.

        Args:
            label: A base label for the highlights. Each constituent region might get an indexed label.
            color: Color for the highlight.
            **kwargs: Additional arguments for the underlying highlight method.

        Returns:
            Self for method chaining.
        """
        if not self.constituent_regions:
            return self

        base_label = label if label else "FlowRegionPart"
        multiple_parts = len(self.constituent_regions) > 1
        for i, region in enumerate(self.constituent_regions):
            current_label = f"{base_label}_{i+1}" if multiple_parts else base_label
            region.highlight(label=current_label, color=color, **kwargs)
        return self

    def show(
        self,
        scale: float = 2.0,
        labels: bool = True,
        legend_position: str = "right",
        color: Optional[Union[Tuple, str]] = "fuchsia",
        label_prefix: Optional[str] = "FlowPart",
        width: Optional[int] = None,
        stack_direction: str = "vertical",
        stack_gap: int = 5,
        stack_background_color: Tuple[int, int, int] = (255, 255, 255),
        **kwargs,
    ) -> Optional["PIL_Image"]:
        """
        Generates and returns a PIL Image of relevant pages with constituent regions highlighted.
        If multiple pages are involved, they are stacked into a single image.

        Args:
            scale: Rendering scale passed to the highlighter.
            labels: Whether to draw labels/legend.
            legend_position: Where to place the legend on each page image.
            color: Highlight color for the constituent regions.
            label_prefix: Base label; indexed per part when there are several.
            width: Optional fixed output width for each page image.
            stack_direction: "vertical" or "horizontal" stacking of page images.
            stack_gap: Pixel gap between stacked page images.
            stack_background_color: RGB background for the stacked image.
            **kwargs: Extra arguments forwarded to the highlighter's render_preview.

        Returns:
            A single PIL Image, or None if nothing could be rendered.

        Raises:
            ValueError: If a constituent region has no page, no highlighter service
                is available, a zero-size image would result, or stack_direction
                is invalid.
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.show() called with no constituent regions.")
            return None

        def _page_zero_index(p) -> int:
            # Prefer the 0-based .index attribute; fall back to 1-based page_number.
            return p.index if hasattr(p, "index") else getattr(p, "page_number", 1) - 1

        # 1. Group constituent regions by their physical page
        regions_by_page: Dict["PhysicalPage", List["PhysicalRegion"]] = {}
        for region in self.constituent_regions:
            if region.page:
                regions_by_page.setdefault(region.page, []).append(region)
            else:
                raise ValueError(f"Constituent region {region.bbox} has no page.")

        if not regions_by_page:
            logger.info("FlowRegion.show() found no constituent regions with associated pages.")
            return None

        # 2. Get a highlighter service (e.g., from the first page involved)
        first_page_with_regions = next(iter(regions_by_page.keys()), None)
        highlighter_service = None
        if first_page_with_regions and hasattr(first_page_with_regions, "_highlighter"):
            highlighter_service = first_page_with_regions._highlighter

        if not highlighter_service:
            raise ValueError(
                "Cannot get highlighter service for FlowRegion.show(). "
                "Ensure constituent regions' pages are initialized with a highlighter."
            )

        output_page_images: List["PIL_Image_Runtime"] = []

        # Sort pages by index for consistent output order
        sorted_pages = sorted(
            regions_by_page.keys(),
            key=lambda p: p.index if hasattr(p, "index") else getattr(p, "page_number", 0),
        )

        # 3. Render each page with its relevant constituent regions highlighted
        for page_idx, page_obj in enumerate(sorted_pages):
            constituent_regions_on_this_page = regions_by_page[page_obj]
            if not constituent_regions_on_this_page:
                continue

            temp_highlights_for_page = []
            for i, region_part in enumerate(constituent_regions_on_this_page):
                part_label = None
                if labels and label_prefix:  # Ensure labels is True for label_prefix to apply
                    # Index the label when the FlowRegion has multiple parts overall,
                    # or multiple parts on this particular page.
                    count_indicator = ""
                    if len(self.constituent_regions) > 1:
                        # Find global index of this region_part in self.constituent_regions
                        try:
                            global_idx = self.constituent_regions.index(region_part)
                            count_indicator = f"_{global_idx + 1}"
                        except ValueError:  # Should not happen if region_part is from the list
                            count_indicator = f"_p{page_idx}i{i+1}"  # fallback local index
                    elif len(constituent_regions_on_this_page) > 1:
                        count_indicator = f"_{i+1}"

                    part_label = f"{label_prefix}{count_indicator}" if label_prefix else None

                temp_highlights_for_page.append(
                    {
                        "page_index": _page_zero_index(page_obj),
                        "bbox": region_part.bbox,
                        "polygon": region_part.polygon if region_part.has_polygon else None,
                        "color": color,  # Use the passed color
                        "label": part_label,
                        "use_color_cycling": False,  # Keep specific color
                    }
                )

            if not temp_highlights_for_page:
                continue

            page_image = highlighter_service.render_preview(
                page_index=_page_zero_index(page_obj),
                temporary_highlights=temp_highlights_for_page,
                scale=scale,
                width=width,
                labels=labels,  # Pass through labels
                legend_position=legend_position,
                **kwargs,
            )
            if page_image:
                output_page_images.append(page_image)

        # 4. Stack the generated page images if multiple
        if not output_page_images:
            logger.info("FlowRegion.show() produced no page images to concatenate.")
            return None

        if len(output_page_images) == 1:
            return output_page_images[0]

        # Stacking logic (same as in FlowRegionCollection.show)
        if stack_direction == "vertical":
            final_width = max(img.width for img in output_page_images)
            final_height = (
                sum(img.height for img in output_page_images)
                + (len(output_page_images) - 1) * stack_gap
            )
            if final_width == 0 or final_height == 0:
                raise ValueError("Cannot create concatenated image with zero width or height.")

            concatenated_image = PIL_Image_Runtime.new(
                "RGB", (final_width, final_height), stack_background_color
            )
            current_y = 0
            for img in output_page_images:
                # Center each page image horizontally.
                paste_x = (final_width - img.width) // 2
                concatenated_image.paste(img, (paste_x, current_y))
                current_y += img.height + stack_gap
            return concatenated_image
        elif stack_direction == "horizontal":
            final_width = (
                sum(img.width for img in output_page_images)
                + (len(output_page_images) - 1) * stack_gap
            )
            final_height = max(img.height for img in output_page_images)
            if final_width == 0 or final_height == 0:
                raise ValueError("Cannot create concatenated image with zero width or height.")

            concatenated_image = PIL_Image_Runtime.new(
                "RGB", (final_width, final_height), stack_background_color
            )
            current_x = 0
            for img in output_page_images:
                # Center each page image vertically.
                paste_y = (final_height - img.height) // 2
                concatenated_image.paste(img, (current_x, paste_y))
                current_x += img.width + stack_gap
            return concatenated_image
        else:
            raise ValueError(
                f"Invalid stack_direction '{stack_direction}' for FlowRegion.show(). Must be 'vertical' or 'horizontal'."
            )

    def to_images(
        self,
        resolution: float = 150,
        **kwargs,
    ) -> List["PIL_Image"]:
        """
        Generates and returns a list of cropped PIL Images,
        one for each constituent physical region of this FlowRegion.

        Args:
            resolution: Rendering resolution (DPI) for each region image.
            **kwargs: Additional arguments forwarded to Region.to_image().

        Returns:
            A list of PIL Images; regions whose rendering fails are skipped
            (the error is logged).
        """
        if not self.constituent_regions:
            logger.info("FlowRegion.to_images() called on an empty FlowRegion.")
            return []

        cropped_images: List["PIL_Image"] = []
        for region_part in self.constituent_regions:
            try:
                img = region_part.to_image(
                    resolution=resolution,
                    crop_only=True,
                    include_highlights=False,
                    **kwargs,
                )
                if img:
                    cropped_images.append(img)
            except Exception as e:
                # Best-effort: one bad region shouldn't prevent rendering the rest.
                logger.error(
                    f"Error generating image for constituent region {region_part.bbox}: {e}",
                    exc_info=True,
                )

        return cropped_images

    def to_image(self, background_color=(255, 255, 255), **kwargs) -> Optional["PIL_Image"]:
        """
        Creates a single composite image by stacking the images of its constituent regions.
        Stacking direction is based on the Flow's arrangement.
        Individual region images are obtained by calling to_images(**kwargs).

        Args:
            background_color: Tuple for RGB background color of the composite image.
            **kwargs: Additional arguments passed to to_images() for rendering individual parts
                (e.g., resolution).

        Returns:
            A single PIL.Image.Image object, or None if no constituent images.
        """
        # Use PIL_Image_Runtime for creating new images at runtime
        images = self.to_images(**kwargs)
        if not images:
            return None
        if len(images) == 1:
            return images[0]

        if self.flow.arrangement == "vertical":
            # Stack vertically, left-aligned.
            composite_width = max(img.width for img in images)
            composite_height = sum(img.height for img in images)
            if composite_width == 0 or composite_height == 0:
                return None  # Avoid zero-size image

            new_image = PIL_Image_Runtime.new(
                "RGB", (composite_width, composite_height), background_color
            )
            current_y = 0
            for img in images:
                new_image.paste(img, (0, current_y))
                current_y += img.height
            return new_image

        elif self.flow.arrangement == "horizontal":
            # Stack horizontally, top-aligned.
            composite_width = sum(img.width for img in images)
            composite_height = max(img.height for img in images)
            if composite_width == 0 or composite_height == 0:
                return None

            new_image = PIL_Image_Runtime.new(
                "RGB", (composite_width, composite_height), background_color
            )
            current_x = 0
            for img in images:
                new_image.paste(img, (current_x, 0))
                current_x += img.width
            return new_image
        else:
            # Should not happen if flow.arrangement is validated
            logger.warning(f"Unknown flow arrangement: {self.flow.arrangement}. Cannot stack images.")
            return None

    def __repr__(self) -> str:
        return (
            f"<FlowRegion constituents={len(self.constituent_regions)}, flow={self.flow}, "
            f"source_bbox={self.source_flow_element.bbox if self.source_flow_element else 'N/A'}>"
        )

    @property
    def is_empty(self) -> bool:
        """Checks if the FlowRegion contains no constituent regions or if all are empty."""
        if not self.constituent_regions:
            return True
        # Checks actual content: a FlowRegion with regions can still be empty
        # (no text and no elements).
        try:
            return not bool(self.extract_text(apply_exclusions=False).strip()) and not bool(
                self.elements(apply_exclusions=False)
            )
        except Exception:
            return True  # If error during check, assume empty to be safe
@@ -3,29 +3,46 @@
3
3
  import logging
4
4
  from typing import Optional
5
5
 
6
- # --- Service Implementation Import ---
7
- # Import the concrete implementation
8
- from .haystack_search_service import HaystackSearchService
9
-
10
- # --- Utils Import ---
11
- from .haystack_utils import ( # Re-export flag and helper
12
- HAS_HAYSTACK_EXTRAS,
13
- check_haystack_availability,
14
- )
15
-
16
- # --- Option Imports (for convenience) ---
17
- # Make options easily available via `from natural_pdf.search import ...`
18
- from .search_options import SearchOptions # Alias for TextSearchOptions for simplicity?
6
+ # Import constants
7
+ from .search_options import SearchOptions
19
8
  from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
20
-
21
- # --- Protocol Import ---
22
- # Import the protocol for type hinting
23
9
  from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
24
10
 
11
# Detect which optional search backends are importable at module load time.
# SEARCH_DEPENDENCIES_AVAILABLE: core requirements (sentence-transformers + numpy).
# LANCEDB_AVAILABLE: optional persistent backend (lancedb + pyarrow).
LANCEDB_AVAILABLE = False
SEARCH_DEPENDENCIES_AVAILABLE = False

try:
    import sentence_transformers
    import numpy as np
    # Basic search dependencies are available
    SEARCH_DEPENDENCIES_AVAILABLE = True

    # Check if LanceDB is available; only import the LanceDB-backed service
    # (or the NumPy fallback) once we know which backend can actually load.
    try:
        import lancedb
        import pyarrow
        LANCEDB_AVAILABLE = True
        from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
    except ImportError:
        # LanceDB not available, we'll use NumPy fallback
        LANCEDB_AVAILABLE = False
        from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
except ImportError:
    # Basic dependencies missing: no search backend can be constructed.
    SEARCH_DEPENDENCIES_AVAILABLE = False
    LANCEDB_AVAILABLE = False
35
+
25
36
  logger = logging.getLogger(__name__)
26
37
 
38
def check_search_availability():
    """Raise ImportError unless the core search dependencies were importable."""
    if SEARCH_DEPENDENCIES_AVAILABLE:
        return
    raise ImportError(
        "Search functionality requires 'sentence-transformers' and NumPy. "
        "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
    )
27
45
 
28
- # Factory Function
29
46
  def get_search_service(
30
47
  collection_name: str,
31
48
  persist: bool = False,
@@ -34,53 +51,49 @@ def get_search_service(
34
51
  ) -> SearchServiceProtocol:
35
52
  """
36
53
  Factory function to get an instance of the configured search service.
37
-
38
- A service instance is tied to a specific index name (collection/table).
39
-
40
- Currently, only returns HaystackSearchService but is structured for future extension.
54
+
55
+ Automatically selects the best available implementation:
56
+ - LanceDB if installed (recommended for both in-memory and persistent)
57
+ - Numpy fallback for in-memory only
41
58
 
42
59
  Args:
43
- collection_name: The logical name for the index this service instance manages
44
- (used as table_name for LanceDB).
60
+ collection_name: The logical name for the index/table this service instance manages.
45
61
  persist: If True, creates a service instance configured for persistent
46
- storage (currently LanceDB). If False (default), uses InMemory.
47
- uri: Override the default path/URI for persistent storage.
62
+ storage. If False (default), uses InMemory (via temp dir for LanceDB).
63
+ uri: Override the default path for persistent storage.
48
64
  default_embedding_model: Override the default embedding model used by the service.
49
- **kwargs: Reserved for future configuration options.
50
65
 
51
66
  Returns:
52
- An instance conforming to the SearchServiceProtocol for the specified collection/table.
67
+ An instance conforming to the SearchServiceProtocol.
53
68
  """
54
69
  logger.debug(
55
- f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
70
+ f"Calling get_search_service factory for collection '{collection_name}' (persist={persist}, uri={uri})..."
56
71
  )
72
+ check_search_availability()
57
73
 
58
- # Collect arguments relevant to HaystackSearchService.__init__
59
- service_args = {}
60
- service_args["table_name"] = collection_name
61
- service_args["persist"] = persist
74
+ service_args = {
75
+ "collection_name": collection_name,
76
+ "persist": persist,
77
+ }
62
78
  if uri is not None:
63
79
  service_args["uri"] = uri
64
- if default_embedding_model is not None:
65
- service_args["embedding_model"] = default_embedding_model
66
80
 
67
- # Cache logic commented out as before
81
+ if default_embedding_model is not None:
82
+ service_args["embedding_model_name"] = default_embedding_model
68
83
 
69
- try:
70
- service_instance = HaystackSearchService(**service_args)
71
- logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
72
- return service_instance
73
- except ImportError as e:
74
- # Error message remains valid
75
- logger.error(
76
- f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
84
+ # If persistence is requested, LanceDB is required
85
+ if persist and not LANCEDB_AVAILABLE:
86
+ raise RuntimeError(
87
+ "Persistent vector search requires LanceDB. "
88
+ "Please install: pip install lancedb"
77
89
  )
78
- raise ImportError(
79
- "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
80
- ) from e
81
- except Exception as e:
82
- logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
83
- raise RuntimeError("Could not create Search Service instance.") from e
84
-
85
-
86
- # Default instance commented out as before
90
+
91
+ # Select the appropriate implementation
92
+ if LANCEDB_AVAILABLE:
93
+ logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
94
+ service_instance = LanceDBSearchService(**service_args)
95
+ else:
96
+ logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
97
+ service_instance = NumpySearchService(**service_args)
98
+
99
+ return service_instance