natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +4 -2
  8. natural_pdf/core/pdf.py +53 -38
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +203 -59
  11. natural_pdf/elements/region.py +43 -11
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +40 -61
  16. natural_pdf/exporters/hocr_font.py +7 -13
  17. natural_pdf/exporters/original_pdf.py +10 -13
  18. natural_pdf/exporters/searchable_pdf.py +0 -10
  19. natural_pdf/search/__init__.py +65 -52
  20. natural_pdf/search/lancedb_search_service.py +325 -0
  21. natural_pdf/search/numpy_search_service.py +255 -0
  22. natural_pdf/search/searchable_mixin.py +25 -71
  23. natural_pdf/widgets/viewer.py +22 -31
  24. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
  25. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
  26. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  27. natural_pdf/search/haystack_search_service.py +0 -687
  28. natural_pdf/search/haystack_utils.py +0 -474
  29. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  30. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/exporters/hocr.py

@@ -66,28 +66,28 @@ class HocrTransform:
  """
 
  box_pattern = re.compile(
- r'''
+ r"""
  bbox \s+
  (\d+) \s+ # left: uint
  (\d+) \s+ # top: uint
  (\d+) \s+ # right: uint
  (\d+) # bottom: uint
- ''',
+ """,
  re.VERBOSE,
  )
  baseline_pattern = re.compile(
- r'''
+ r"""
  baseline \s+
  ([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
  ([\-\+]?\d+) # +/- int
- ''',
+ """,
  re.VERBOSE,
  )
  textangle_pattern = re.compile(
- r'''
+ r"""
  textangle \s+
  ([\-\+]?\d*\.?\d*) # +/- decimal float
- ''',
+ """,
  re.VERBOSE,
  )
 
@@ -121,12 +121,12 @@ class HocrTransform:
 
  # if the hOCR file has a namespace, ElementTree requires its use to
  # find elements
- matches = re.match(r'({.*})html', self.hocr.getroot().tag)
- self.xmlns = ''
+ matches = re.match(r"({.*})html", self.hocr.getroot().tag)
+ self.xmlns = ""
  if matches:
  self.xmlns = matches.group(1)
 
- for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
+ for div in self.hocr.findall(self._child_xpath("div", "ocr_page")):
  coords = self.element_coordinates(div)
  if not coords:
  raise HocrTransformError("hocr file is missing page dimensions")
@@ -137,16 +137,16 @@ class HocrTransform:
 
  def _get_element_text(self, element: Element) -> str:
  """Return the textual content of the element and its children."""
- text = element.text if element.text is not None else ''
+ text = element.text if element.text is not None else ""
  for child in element:
  text += self._get_element_text(child)
- text += element.tail if element.tail is not None else ''
+ text += element.tail if element.tail is not None else ""
  return text
 
  @classmethod
  def element_coordinates(cls, element: Element) -> Rectangle | None:
  """Get coordinates of the bounding box around an element."""
- matches = cls.box_pattern.search(element.attrib.get('title', ''))
+ matches = cls.box_pattern.search(element.attrib.get("title", ""))
  if not matches:
  return None
  return Rectangle(
@@ -159,7 +159,7 @@ class HocrTransform:
  @classmethod
  def baseline(cls, element: Element) -> tuple[float, float]:
  """Get baseline's slope and intercept."""
- matches = cls.baseline_pattern.search(element.attrib.get('title', ''))
+ matches = cls.baseline_pattern.search(element.attrib.get("title", ""))
  if not matches:
  return (0.0, 0.0)
  return float(matches.group(1)), int(matches.group(2))
@@ -167,7 +167,7 @@ class HocrTransform:
  @classmethod
  def textangle(cls, element: Element) -> float:
  """Get text angle of an element."""
- matches = cls.textangle_pattern.search(element.attrib.get('title', ''))
+ matches = cls.textangle_pattern.search(element.attrib.get("title", ""))
  if not matches:
  return 0.0
  return float(matches.group(1))
@@ -220,13 +220,13 @@ class HocrTransform:
  with canvas.do.save_state(cm=page_matrix):
  self._debug_draw_paragraph_boxes(canvas)
  found_lines = False
- for par in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
+ for par in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
  for line in (
  element
- for element in par.iterfind(self._child_xpath('span'))
- if 'class' in element.attrib
- and element.attrib['class']
- in {'ocr_header', 'ocr_line', 'ocr_textfloat', 'ocr_caption'}
+ for element in par.iterfind(self._child_xpath("span"))
+ if "class" in element.attrib
+ and element.attrib["class"]
+ in {"ocr_header", "ocr_line", "ocr_textfloat", "ocr_caption"}
  ):
  found_lines = True
  direction = self._get_text_direction(par)
@@ -242,7 +242,7 @@ class HocrTransform:
 
  if not found_lines:
  # Tesseract did not report any lines (just words)
- root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
+ root = self.hocr.find(self._child_xpath("div", "ocr_page"))
  direction = self._get_text_direction(root)
  self._do_line(
  canvas,
@@ -254,27 +254,21 @@ class HocrTransform:
  )
  # put the image on the page, scaled to fill the page
  if image_filename is not None:
- canvas.do.draw_image(
- image_filename, 0, 0, width=self.width, height=self.height
- )
+ canvas.do.draw_image(image_filename, 0, 0, width=self.width, height=self.height)
 
  # finish up the page and save it
  canvas.to_pdf().save(out_filename)
 
  def _get_text_direction(self, par):
  """Get the text direction of the paragraph.
-
+
  Arabic, Hebrew, Persian, are right-to-left languages.
  When the paragraph element is None, defaults to left-to-right.
  """
  if par is None:
  return TextDirection.LTR
-
- return (
- TextDirection.RTL
- if par.attrib.get('dir', 'ltr') == 'rtl'
- else TextDirection.LTR
- )
+
+ return TextDirection.RTL if par.attrib.get("dir", "ltr") == "rtl" else TextDirection.LTR
 
  def _get_inject_word_breaks(self, par):
  """Determine whether word breaks should be injected.
@@ -283,9 +277,9 @@ class HocrTransform:
  words are usually one or two characters and separators are usually explicit.
  In all other languages, we inject word breaks to help word segmentation.
  """
- lang = par.attrib.get('lang', '')
+ lang = par.attrib.get("lang", "")
  log.debug(lang)
- if lang in {'chi_sim', 'chi_tra', 'jpn', 'kor'}:
+ if lang in {"chi_sim", "chi_tra", "jpn", "kor"}:
  return False
  return True
 
@@ -339,8 +333,7 @@ class HocrTransform:
  # size as the true bounding box of the line.
  top_left_corner = (line_min_aabb.llx, line_min_aabb.lly)
  line_size_aabb_matrix = (
- Matrix()
- .translated(*top_left_corner)
+ Matrix().translated(*top_left_corner)
  # Note: negative sign (textangle is counter-clockwise, see hOCR spec)
  .rotated(-self.textangle(line))
  )
@@ -371,12 +364,10 @@ class HocrTransform:
  text.font(self._fontname, fontsize)
  text.render_mode(3 if invisible_text else 0)
 
- self._debug_draw_baseline(
- canvas, baseline_matrix.inverse().transform(line_min_aabb), 0
- )
+ self._debug_draw_baseline(canvas, baseline_matrix.inverse().transform(line_min_aabb), 0)
 
  canvas.do.fill_color(BLACK) # text in black
- elements = line.findall(self._child_xpath('span', elemclass))
+ elements = line.findall(self._child_xpath("span", elemclass))
  for elem, next_elem in pairwise(elements + [None]):
  self._do_line_word(
  canvas,
@@ -405,7 +396,7 @@ class HocrTransform:
  if elem is None:
  return
  elemtxt = self.normalize_text(self._get_element_text(elem).strip())
- if elemtxt == '':
+ if elemtxt == "":
  return
 
  hocr_box = self.element_coordinates(elem)
@@ -430,9 +421,7 @@ class HocrTransform:
  text.show(self._font.text_encode(elemtxt))
 
  # Get coordinates of the next word (if there is one)
- hocr_next_box = (
- self.element_coordinates(next_elem) if next_elem is not None else None
- )
+ hocr_next_box = self.element_coordinates(next_elem) if next_elem is not None else None
  if hocr_next_box is None:
  return
  # Render a space between this word and the next word. The explicit space helps
@@ -447,16 +436,14 @@ class HocrTransform:
  elif text_direction == TextDirection.RTL:
  space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
  self._debug_draw_space_bbox(canvas, space_box)
- space_width = self._font.text_width(' ', fontsize)
+ space_width = self._font.text_width(" ", fontsize)
  if space_width > 0 and space_box.width > 0:
  if text_direction == TextDirection.LTR:
  text.text_transform(Matrix(1, 0, 0, -1, space_box.llx, 0))
  elif text_direction == TextDirection.RTL:
- text.text_transform(
- Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0)
- )
+ text.text_transform(Matrix(-1, 0, 0, -1, space_box.llx + space_box.width, 0))
  text.horiz_scale(100 * space_box.width / space_width)
- text.show(self._font.text_encode(' '))
+ text.show(self._font.text_encode(" "))
 
  def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
  """Draw boxes around paragraphs in the document."""
@@ -465,16 +452,14 @@ class HocrTransform:
  with canvas.do.save_state():
  # draw box around paragraph
  canvas.do.stroke_color(color).line_width(0.1)
- for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
+ for elem in self.hocr.iterfind(self._child_xpath("p", "ocr_par")):
  elemtxt = self._get_element_text(elem).strip()
  if len(elemtxt) == 0:
  continue
  ocr_par = self.element_coordinates(elem)
  if ocr_par is None:
  continue
- canvas.do.rect(
- ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False
- )
+ canvas.do.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=False)
 
  def _debug_draw_line_bbox(self, canvas: Canvas, line_box: Rectangle, color=BLUE):
  """Render the bounding box of a text line."""
@@ -485,22 +470,16 @@ class HocrTransform:
  line_box.llx, line_box.lly, line_box.width, line_box.height, fill=False
  )
 
- def _debug_draw_word_triangle(
- self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1
- ):
+ def _debug_draw_word_triangle(self, canvas: Canvas, box: Rectangle, color=RED, line_width=0.1):
  """Render a triangle that conveys word height and drawing direction."""
  if not self.render_options.render_triangle: # pragma: no cover
  return
  with canvas.do.save_state():
  canvas.do.stroke_color(color).line_width(line_width).line(
  box.llx, box.lly, box.urx, box.lly
- ).line(box.urx, box.lly, box.llx, box.ury).line(
- box.llx, box.lly, box.llx, box.ury
- )
+ ).line(box.urx, box.lly, box.llx, box.ury).line(box.llx, box.lly, box.llx, box.ury)
 
- def _debug_draw_word_bbox(
- self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1
- ):
+ def _debug_draw_word_bbox(self, canvas: Canvas, box: Rectangle, color=GREEN, line_width=0.1):
  """Render a box depicting the word."""
  if not self.render_options.render_word_bbox: # pragma: no cover
  return
@@ -537,4 +516,4 @@ class HocrTransform:
  baseline_lly,
  line_box.urx,
  baseline_lly,
- )
+ )
natural_pdf/exporters/hocr_font.py

@@ -8,11 +8,7 @@ import unicodedata
  import zlib
  from importlib.resources import files as package_files
 
- from pikepdf import (
- Dictionary,
- Name,
- Pdf,
- )
+ from pikepdf import Dictionary, Name, Pdf
  from pikepdf.canvas import Font
 
  log = logging.getLogger(__name__)
@@ -25,8 +21,8 @@ class EncodableFont(Font):
 
  class GlyphlessFont(EncodableFont):
  CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
- GLYPHLESS_FONT_NAME = 'pdf.ttf'
- GLYPHLESS_FONT_PACKAGE_PATH = 'natural_pdf.exporters.data'
+ GLYPHLESS_FONT_NAME = "pdf.ttf"
+ GLYPHLESS_FONT_PACKAGE_PATH = "natural_pdf.exporters.data"
  GLYPHLESS_FONT = (package_files(GLYPHLESS_FONT_PACKAGE_PATH) / GLYPHLESS_FONT_NAME).read_bytes()
  CHAR_ASPECT = 2
 
@@ -39,7 +35,7 @@ class GlyphlessFont(EncodableFont):
  return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
 
  def text_encode(self, text: str) -> bytes:
- return text.encode('utf-16be')
+ return text.encode("utf-16be")
 
  def register(self, pdf: Pdf):
  """Register the glyphless font.
@@ -76,9 +72,7 @@ class GlyphlessFont(EncodableFont):
  )
  )
  basefont.DescendantFonts = [cid_font_type2]
- cid_font_type2.CIDToGIDMap = pdf.make_stream(
- self.CID_TO_GID_DATA, Filter=Name.FlateDecode
- )
+ cid_font_type2.CIDToGIDMap = pdf.make_stream(self.CID_TO_GID_DATA, Filter=Name.FlateDecode)
  basefont.ToUnicode = pdf.make_stream(
  b"/CIDInit /ProcSet findresource begin\n"
  b"12 dict begin\n"
@@ -129,7 +123,7 @@ class Courier(EncodableFont):
  return len(text) * fontsize
 
  def text_encode(self, text: str) -> bytes:
- return text.encode('pdfdoc', errors='ignore')
+ return text.encode("pdfdoc", errors="ignore")
 
  def register(self, pdf: Pdf) -> Dictionary:
  """Register the font."""
@@ -139,4 +133,4 @@ class Courier(EncodableFont):
  Type=Name.Font,
  Subtype=Name.Type1,
  )
- )
+ )
natural_pdf/exporters/original_pdf.py

@@ -44,7 +44,7 @@ def create_original_pdf(
  if pikepdf is None:
  raise ImportError(
  "Saving original PDF pages requires 'pikepdf'. "
- "Install with: pip install \"natural-pdf[ocr-export]\""
+ 'Install with: pip install "natural-pdf[ocr-export]"'
  )
 
  output_path_str = str(output_path)
@@ -55,18 +55,17 @@ def create_original_pdf(
  if not source.pages:
  raise ValueError("Cannot save an empty collection/PDF.")
  pages_to_extract = source.pages
- elif hasattr(source, "page") and hasattr(source, "number"): # Single Page object
- # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
- if hasattr(source, 'pdf') and source.pdf and hasattr(source.pdf, 'path'):
- pages_to_extract = [source]
- else:
+ elif hasattr(source, "page") and hasattr(source, "number"): # Single Page object
+ # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
+ if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
+ pages_to_extract = [source]
+ else:
  raise ValueError("Input Page object does not have a valid PDF reference with a path.")
  else:
  raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
 
-
  if not pages_to_extract:
- raise ValueError("No valid pages found in the source object.")
+ raise ValueError("No valid pages found in the source object.")
 
  # Verify all pages come from the same PDF and get path
  first_page_pdf_path = None
@@ -115,16 +114,14 @@ def create_original_pdf(
  )
 
  except pikepdf.PasswordError:
- logger.error(
- f"Failed to open password-protected source PDF: {first_page_pdf_path}"
- )
+ logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
  raise RuntimeError(
  f"Source PDF '{first_page_pdf_path}' is password-protected."
- ) from None # Raise specific error without chaining the generic Exception
+ ) from None # Raise specific error without chaining the generic Exception
  except Exception as e:
  logger.error(
  f"Failed to save original pages PDF to '{output_path_str}': {e}",
  exc_info=True,
  )
  # Re-raise as RuntimeError for consistent API error handling
- raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
+ raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
natural_pdf/exporters/searchable_pdf.py

@@ -318,16 +318,6 @@ def create_searchable_pdf(
  dpi: The resolution (dots per inch) for rendering page images and hOCR.
  """
 
- # --- Ensure dependencies are loaded (they should be if installed) ---
- if Image is None or pikepdf is None or HocrTransform is None:
- # This should ideally not happen if dependencies are in main install,
- # but serves as a safeguard during development or if install is broken.
- raise ImportError(
- "Required dependencies (Pillow, pikepdf) are missing. "
- "Please ensure natural-pdf is installed correctly with all dependencies."
- )
- # --- End Safeguard Check ---
-
  # duck type to see if source has .pages, to populate pages =
  if hasattr(source, "pages"):
  pages = source.pages
natural_pdf/search/__init__.py

@@ -3,29 +3,46 @@
  import logging
  from typing import Optional
 
- # --- Service Implementation Import ---
- # Import the concrete implementation
- from .haystack_search_service import HaystackSearchService
-
- # --- Utils Import ---
- from .haystack_utils import ( # Re-export flag and helper
- HAS_HAYSTACK_EXTRAS,
- check_haystack_availability,
- )
-
- # --- Option Imports (for convenience) ---
- # Make options easily available via `from natural_pdf.search import ...`
- from .search_options import SearchOptions # Alias for TextSearchOptions for simplicity?
+ # Import constants
+ from .search_options import SearchOptions
  from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
-
- # --- Protocol Import ---
- # Import the protocol for type hinting
  from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
+ # Check search extras availability
+ LANCEDB_AVAILABLE = False
+ SEARCH_DEPENDENCIES_AVAILABLE = False
+
+ try:
+ import sentence_transformers
+ import numpy as np
+ # Basic search dependencies are available
+ SEARCH_DEPENDENCIES_AVAILABLE = True
+
+ # Check if LanceDB is available
+ try:
+ import lancedb
+ import pyarrow
+ LANCEDB_AVAILABLE = True
+ from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
+ except ImportError:
+ # LanceDB not available, we'll use NumPy fallback
+ LANCEDB_AVAILABLE = False
+ from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
+ except ImportError:
+ # Basic dependencies missing
+ SEARCH_DEPENDENCIES_AVAILABLE = False
+ LANCEDB_AVAILABLE = False
+
  logger = logging.getLogger(__name__)
 
+ def check_search_availability():
+ """Check if required search dependencies are available."""
+ if not SEARCH_DEPENDENCIES_AVAILABLE:
+ raise ImportError(
+ "Search functionality requires 'sentence-transformers' and NumPy. "
+ "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
+ )
 
- # Factory Function
  def get_search_service(
  collection_name: str,
  persist: bool = False,
@@ -34,53 +51,49 @@ def get_search_service(
  ) -> SearchServiceProtocol:
  """
  Factory function to get an instance of the configured search service.
-
- A service instance is tied to a specific index name (collection/table).
-
- Currently, only returns HaystackSearchService but is structured for future extension.
+
+ Automatically selects the best available implementation:
+ - LanceDB if installed (recommended for both in-memory and persistent)
+ - Numpy fallback for in-memory only
 
  Args:
- collection_name: The logical name for the index this service instance manages
- (used as table_name for LanceDB).
+ collection_name: The logical name for the index/table this service instance manages.
  persist: If True, creates a service instance configured for persistent
- storage (currently LanceDB). If False (default), uses InMemory.
- uri: Override the default path/URI for persistent storage.
+ storage. If False (default), uses InMemory (via temp dir for LanceDB).
+ uri: Override the default path for persistent storage.
  default_embedding_model: Override the default embedding model used by the service.
- **kwargs: Reserved for future configuration options.
 
  Returns:
- An instance conforming to the SearchServiceProtocol for the specified collection/table.
+ An instance conforming to the SearchServiceProtocol.
  """
  logger.debug(
- f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
+ f"Calling get_search_service factory for collection '{collection_name}' (persist={persist}, uri={uri})..."
  )
+ check_search_availability()
 
- # Collect arguments relevant to HaystackSearchService.__init__
- service_args = {}
- service_args["table_name"] = collection_name
- service_args["persist"] = persist
+ service_args = {
+ "collection_name": collection_name,
+ "persist": persist,
+ }
  if uri is not None:
  service_args["uri"] = uri
- if default_embedding_model is not None:
- service_args["embedding_model"] = default_embedding_model
 
- # Cache logic commented out as before
+ if default_embedding_model is not None:
+ service_args["embedding_model_name"] = default_embedding_model
 
- try:
- service_instance = HaystackSearchService(**service_args)
- logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
- return service_instance
- except ImportError as e:
- # Error message remains valid
- logger.error(
- f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+ # If persistence is requested, LanceDB is required
+ if persist and not LANCEDB_AVAILABLE:
+ raise RuntimeError(
+ "Persistent vector search requires LanceDB. "
+ "Please install: pip install lancedb"
  )
- raise ImportError(
- "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
- ) from e
- except Exception as e:
- logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
- raise RuntimeError("Could not create Search Service instance.") from e
-
-
- # Default instance commented out as before
+
+ # Select the appropriate implementation
+ if LANCEDB_AVAILABLE:
+ logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
+ service_instance = LanceDBSearchService(**service_args)
+ else:
+ logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
+ service_instance = NumpySearchService(**service_args)
+
+ return service_instance
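
For orientation, a minimal sketch of how the reworked factory in 0.1.12 might be called, based on the signature shown in this hunk; the collection name and storage path below are illustrative assumptions, not values taken from the diff.

from natural_pdf.search import get_search_service

# In-memory index: uses LanceDB when installed, otherwise the NumPy fallback service.
service = get_search_service(collection_name="my_docs")

# Persistent index: requires LanceDB; without it the factory raises RuntimeError.
persistent = get_search_service(
    collection_name="my_docs",
    persist=True,
    uri="./my_lancedb_index",  # hypothetical path overriding the default persist location
)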