natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/search/haystack_utils.py

@@ -17,10 +17,16 @@ from natural_pdf.search.search_options import (
 # Set up logger for this module
 logger = logging.getLogger(__name__)
 
+# Import sentence-transformers for dimension calculation
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+
 # --- Define flag BEFORE trying Haystack imports ---
 HAS_HAYSTACK_EXTRAS = False # Default to False
 
-# --- Conditional Haystack Imports (Restoring Error Catching with Traceback Logging) ---
+# Conditional Haystack Imports
 try:
     import haystack
     from haystack import Document as HaystackDocument
@@ -30,8 +36,18 @@ try:
         SentenceTransformersTextEmbedder,
     )
     from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-    from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
-    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+
+    # --- REMOVED Chroma Imports ---
+    # from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
+    # from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+    # --- ADDED LanceDB Imports ---
+    try:
+        from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
+    except ImportError:
+        LanceDBDocumentStore = None
+        LanceDBEmbeddingRetriever = None
+
+    # Removed Chroma Imports
 
     # Keep try/except for optional Cohere
     try:
@@ -39,15 +55,7 @@ try:
     except ImportError:
         CohereRanker = None
 
-    # --- Add ChromaDB embedding function import ---
-    try:
-        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
-    except ImportError:
-        logger.warning(
-            "chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
-        )
-        SentenceTransformerEmbeddingFunction = None
-    # --- End ChromaDB import ---
+    # Removed ChromaDB embedding function import
 
     HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
     logger.debug("Successfully imported Haystack components.")
@@ -67,11 +75,15 @@ except ImportError as e:
     HaystackDocument = Dict # Represent as Dict if not available
     Pipeline = None
     SentenceTransformersTextEmbedder = None
-    ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+    # --- UPDATED Dummies ---
+    LanceDBEmbeddingRetriever = (
+        None # ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+    )
     CohereRanker = None
-    ChromaDocumentStore = None
+    LanceDBDocumentStore = None # ChromaDocumentStore = None
     DuplicatePolicy = None # Dummy for DuplicatePolicy
-    SentenceTransformerEmbeddingFunction = None # Dummy if kept
+    # --- REMOVED Dummies ---
+    # SentenceTransformerEmbeddingFunction = None # Dummy if kept
 
 
 # Helper function to check availability and raise error
@@ -90,34 +102,60 @@ def check_haystack_availability(feature_name: str = "Search"):
 
 
 def create_default_document_store(
-    persist_path: str = "./natural_pdf_index",
-    collection_name: str = "natural_pdf_default",
-    embedding_model: Optional[str] = None, # Allow specifying the model
+    # --- CHANGED persist_path to uri ---
+    uri: str = "./natural_pdf_index",
+    collection_name: str = "natural_pdf_default", # LanceDB calls this table_name
+    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", # Make mandatory for dim calculation
 ) -> DocumentStore:
-    """Creates a default ChromaDB DocumentStore."""
-    check_haystack_availability("create_default_document_store")
+    """Creates a default LanceDB DocumentStore."""
+    check_haystack_availability("create_default_document_store (LanceDB)")
     logger.debug(
-        f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'"
+        f"Creating default LanceDBDocumentStore at uri='{uri}' with table '{collection_name}'"
     )
-    if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
-        raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
+    if not LanceDBDocumentStore:
+        raise RuntimeError("LanceDBDocumentStore is not available despite Haystack extras check.")
+    if not SentenceTransformer:
+        raise ImportError(
+            "sentence-transformers library is required to determine embedding dimensions."
+        )
 
     try:
-        # Note: For Haystack's Chroma integration, the embedding model is typically handled
-        # by the Embedder component in the indexing/query pipeline, not set directly
-        # on the DocumentStore initialization.
-        # The `embedding_model` parameter passed here might be used later to configure that Embedder.
-        store = ChromaDocumentStore(
-            persist_path=persist_path,
-            collection_name=collection_name,
-            # embedding_function parameter removed as it caused issues with Haystack's util
+        # Calculate embedding dimension
+        try:
+            model = SentenceTransformer(embedding_model)
+            embedding_dims = model.get_sentence_embedding_dimension()
+            if not embedding_dims:
+                raise ValueError(
+                    f"Could not determine embedding dimension for model: {embedding_model}"
+                )
+            logger.debug(
+                f"Determined embedding dimension: {embedding_dims} for model '{embedding_model}'"
+            )
+        except Exception as e:
+            logger.error(
+                f"Failed to load SentenceTransformer model '{embedding_model}' to get dimensions: {e}",
+                exc_info=True,
+            )
+            raise RuntimeError(
+                f"Failed to determine embedding dimension for model '{embedding_model}'."
+            ) from e
+
+        # Create LanceDBDocumentStore
+        store = LanceDBDocumentStore(
+            database=uri, # Use uri for the database path
+            table_name=collection_name,
+            embedding_dims=embedding_dims,
+            # LanceDB might require a metadata schema, but let's try without it first for simplicity.
+            # Add `metadata_schema=...` if needed based on lancedb-haystack requirements.
+        )
+        logger.info(
+            f"Initialized LanceDBDocumentStore (Table: {collection_name}, Dims: {embedding_dims}) at uri '{uri}'"
         )
-        logger.info(f"Initialized ChromaDocumentStore (Collection: {collection_name})")
         return store
     except Exception as e:
-        logger.error(f"Failed to initialize ChromaDocumentStore: {e}", exc_info=True)
+        logger.error(f"Failed to initialize LanceDBDocumentStore: {e}", exc_info=True)
         raise RuntimeError(
-            f"Could not create ChromaDocumentStore for collection '{collection_name}'"
+            f"Could not create LanceDBDocumentStore for table '{collection_name}' at uri '{uri}'"
         ) from e
 
 
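The hunk above replaces the Chroma-backed default store with a LanceDB-backed one: `persist_path` becomes `uri`, the collection name maps onto a LanceDB table name, and `embedding_dims` is now derived from the embedding model via sentence-transformers. Below is a minimal usage sketch of the new signature, assuming natural-pdf 0.1.9 is installed with its Haystack/LanceDB search extras; the import path is taken from the file list above and the argument values are simply the defaults shown in the diff.

```python
# Sketch only: exercises the new create_default_document_store() signature shown above.
# Assumes haystack-ai, lancedb-haystack and sentence-transformers are installed.
from natural_pdf.search.haystack_utils import create_default_document_store

store = create_default_document_store(
    uri="./natural_pdf_index",  # LanceDB database path (formerly persist_path)
    collection_name="natural_pdf_default",  # used as the LanceDB table_name
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # also fixes embedding_dims
)
print(type(store).__name__)  # expected: LanceDBDocumentStore when the extras are available
```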
@@ -261,35 +299,32 @@ def create_default_document_embedder(
 def _perform_haystack_search(
     query: Union[str, Path, Image.Image],
     document_store: Any, # Use Any for simplicity now
-    collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
+    collection_name: str, # Passed for clarity, corresponds to table_name in LanceDB
     embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
     options: BaseSearchOptions,
 ) -> List[Dict[str, Any]]:
-    """Internal function to perform search using Haystack components (ChromaEmbeddingRetriever)."""
+    """Internal function to perform search using Haystack components (LanceDBEmbeddingRetriever)."""
     if not HAS_HAYSTACK_EXTRAS:
-        check_haystack_availability("_perform_haystack_search")
+        check_haystack_availability("_perform_haystack_search (LanceDB)")
         return [] # Should not be reached due to check
 
     logger.info(
-        f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})..."
+        f"Performing Haystack search in table '{collection_name}' (using store: {type(document_store).__name__})..."
     )
     logger.debug(f" Query type: {type(query).__name__}")
     logger.debug(f" Options: {options}")
 
-    # --- 1. Embed Query (using the provided text embedder) --- #
+    # Embed Query
     text_query: Optional[str] = None
     query_embedding: Optional[List[float]] = None
 
     if isinstance(query, str):
-        text_query = query # Keep text for potential reranker use
+        text_query = query
         if not embedder:
             logger.error(
                 "Text query provided, but no embedder instance was passed to _perform_haystack_search."
             )
             return []
-        # No need to check type if the type hint is enforced upstream
-        # if not isinstance(embedder, SentenceTransformersTextEmbedder):
-        #     logger.warning(f"Provided embedder is {type(embedder).__name__}, not SentenceTransformersTextEmbedder. Assuming it works like one for query embedding.")
         try:
             logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
             embedding_result = embedder.run(text=text_query)
@@ -306,24 +341,21 @@
             logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
             return []
     elif isinstance(query, Path) or isinstance(query, Image.Image):
-        # Currently, this function doesn't support multi-modal query embedding directly
         logger.error(
             f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
         )
         return []
     else:
-        # Handle other unexpected types
         logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")
+        return []
 
-    # If we didn't get an embedding (e.g., non-text query), we can't proceed
     if query_embedding is None:
         logger.error("Could not obtain query embedding. Cannot perform search.")
         return []
 
-    # --- 2. Set up Retriever --- #
-    # Assumes the document_store is ChromaDocumentStore for this utility function context
-    if not ChromaEmbeddingRetriever:
-        logger.error("ChromaEmbeddingRetriever not available.")
+    # Set up Retriever
+    if not LanceDBEmbeddingRetriever:
+        logger.error("LanceDBEmbeddingRetriever not available.")
         return []
 
     # Ensure retriever_top_k is set (should be by __post_init__)
@@ -335,28 +367,26 @@
         retriever_top_k = options.top_k
 
     # Instantiate the EMBEDDING retriever
-    retriever = ChromaEmbeddingRetriever(
+    retriever = LanceDBEmbeddingRetriever(
         document_store=document_store,
         filters=options.filters or {}, # Pass filters here
         top_k=retriever_top_k,
     )
 
     logger.debug(
-        f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
+        f"Initialized LanceDBEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
     )
 
-    # --- 3. Set up Optional Reranker --- #
+    # Set up Optional Reranker
     reranker_instance = None
-    if options.use_reranker in [True, None]: # Check specifically for True or None
+    if options.use_reranker in [True, None]:
         logger.debug("Attempting to initialize reranker...")
-        # Currently only supports default text reranker (Cohere)
        reranker_instance = create_default_text_reranker(
            api_key=options.reranker_api_key,
            model_name=options.reranker_model or "rerank-english-v2.0",
        )
        if reranker_instance:
-            # Ensure reranker top_k matches final desired top_k
-            reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
+            reranker_instance.top_k = options.top_k
            logger.info(
                f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
            )
@@ -365,7 +395,7 @@
                "Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
            )
 
-    # --- 4. Build and Run Pipeline --- #
+    # Build and Run Pipeline
     if not Pipeline:
         logger.error("Haystack Pipeline class not available.")
         return []
@@ -380,28 +410,25 @@
     if reranker_instance:
         search_pipeline.add_component("reranker", reranker_instance)
         search_pipeline.connect("retriever.documents", "reranker.documents")
-        # Reranker also needs the query text and final top_k
         if text_query is None:
             logger.error(
                 "Reranker requires text query, but it was not available (query might not have been text)."
             )
-            # Handle this case - maybe skip reranker or raise error?
-            # For now, let's skip reranker if text is missing
             logger.warning("Skipping reranker because text query is missing.")
-            reranker_instance = None # Effectively remove it from the logic below
-            last_component_name = "retriever" # Reset last component
-            # Remove reranker component if added? Less clean. Let's just not add its input.
+            reranker_instance = None
+            last_component_name = "retriever"
         else:
             pipeline_input["reranker"] = {
                 "query": text_query,
                 "top_k": options.top_k,
-            } # Pass query and final top_k
+            }
             last_component_name = "reranker"
             logger.debug("Added reranker to pipeline and configured input.")
     else:
-        # No reranker was initialized or it was skipped
-        last_component_name = "reranker"
-        logger.debug("Added reranker to pipeline.")
+        # --- Fix: last_component_name should only be 'reranker' if it was added ---
+        # if reranker_instance was initialized and added, last_component_name is 'reranker'
+        # if not, it remains 'retriever'
+        pass # No change needed here if reranker wasn't added
 
     logger.info("Running Haystack search pipeline...")
     try:
@@ -412,9 +439,8 @@
         logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
         return []
 
-    # --- 5. Process Results --- #
+    # Process Results
     final_documents: List[HaystackDocument] = []
-    # Check output based on last component in the pipeline
     if last_component_name in result and result[last_component_name].get("documents"):
         final_documents = result[last_component_name]["documents"]
         logger.debug(
@@ -428,8 +454,7 @@
 
     # Convert Haystack Documents to the desired output format
     output_results = []
-    for doc in final_documents: # Correctly loop over final_documents
-        # Check if doc is actually a Haystack Document object or potentially a dict
+    for doc in final_documents:
         doc_id = getattr(doc, "id", None)
         doc_score = getattr(doc, "score", 0.0)
         doc_content = getattr(doc, "content", None)
@@ -439,10 +464,9 @@
         output = {
             "pdf_path": meta.get("pdf_path", "Unknown"),
             "page_number": meta.get("page_number", -1),
-            "score": doc_score if doc_score is not None else 0.0, # Handle potential None score
-            "content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
+            "score": doc_score if doc_score is not None else 0.0,
+            "content_snippet": doc_content[:200] + "..." if doc_content else "",
             "metadata": meta,
-            # "haystack_document": doc # Optionally include the full Haystack doc
         }
         output_results.append(output)
 
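Across the hunks above, `_perform_haystack_search` keeps the same embed → retrieve → optionally rerank flow but swaps `ChromaEmbeddingRetriever` for `LanceDBEmbeddingRetriever` and returns `pdf_path`/`page_number`/`score`/`content_snippet` dictionaries. The sketch below mirrors that query path outside natural-pdf. Only the constructor arguments visible in the diff are taken as given; the Haystack 2.x embedder calls (`warm_up()`, `run(text=...)["embedding"]`) and the retriever's `query_embedding` input name follow the standard Haystack embedding-retriever interface and are assumptions, not something this diff confirms.

```python
# Sketch of the LanceDB-backed query path implemented in _perform_haystack_search above.
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder
from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever

store = LanceDBDocumentStore(
    database="./natural_pdf_index",
    table_name="natural_pdf_default",
    embedding_dims=384,  # all-MiniLM-L6-v2 embeddings are 384-dimensional
    # the diff notes metadata_schema=... may also be required by lancedb-haystack
)

# Embed the query text, mirroring embedder.run(text=text_query) in the diff.
embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedder.warm_up()
query_embedding = embedder.run(text="total revenue by year")["embedding"]

# Retrieve by embedding; filters/top_k match the retriever construction in the diff.
pipeline = Pipeline()
pipeline.add_component(
    "retriever", LanceDBEmbeddingRetriever(document_store=store, filters={}, top_k=10)
)
result = pipeline.run({"retriever": {"query_embedding": query_embedding}})

for doc in result["retriever"]["documents"]:
    # pdf_path / page_number metadata keys come from the result conversion in the diff
    print(doc.score, doc.meta.get("pdf_path"), doc.meta.get("page_number"))
```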
natural_pdf/search/search_service_protocol.py

@@ -68,12 +68,11 @@ class SearchServiceProtocol(Protocol):
     Protocol defining the interface for indexing and searching documents.
 
     Implementations of this protocol handle the specifics of interacting
-    with a chosen search backend (e.g., Haystack with ChromaDB, Haystack In-Memory).
-    An instance of a service implementing this protocol is tied to a specific collection name.
+    with a chosen search backend (e.g., Haystack with LanceDB, Haystack In-Memory).
+    An instance of a service implementing this protocol is tied to a specific index name (e.g., table name).
     """
 
     collection_name: str
-    # Removed internal state hints (_persist, _embedding_model) - implementation detail
 
     def index(
         self,
@@ -82,7 +81,7 @@ class SearchServiceProtocol(Protocol):
         force_reindex: bool = False,
     ) -> None:
         """
-        Indexes the provided documents into the collection managed by this service instance.
+        Indexes the provided documents into the index/table managed by this service instance.
 
         Handles store and embedder creation/retrieval, configuration checks,
         re-indexing logic (including potential deletion), embedding, and writing.
@@ -91,12 +90,12 @@ class SearchServiceProtocol(Protocol):
            documents: An iterable of objects conforming to the Indexable protocol.
            embedder_device: The device ('cpu', 'cuda', etc.) for the embedder.
                Defaults defined by the implementation.
-            force_reindex: If True, delete the entire existing collection
+            force_reindex: If True, delete the entire existing index/table
                (if configuration permits) before indexing.
 
        Raises:
            IndexConfigurationError: If `force_reindex` is False and the existing
-                collection has incompatible settings.
+                index/table has incompatible settings.
            ImportError: If required backend libraries are missing.
            RuntimeError: For other failures during indexing.
        """
@@ -104,11 +103,11 @@ class SearchServiceProtocol(Protocol):
 
     def search(
         self,
-        query: Any, # Allow any query type, service implementation handles it
+        query: Any,
        options: BaseSearchOptions,
    ) -> List[Dict[str, Any]]:
        """
-        Performs a search within the collection managed by this service instance.
+        Performs a search within the index/table managed by this service instance.
 
        Args:
            query: The search query (type depends on service capabilities).
@@ -119,7 +118,7 @@ class SearchServiceProtocol(Protocol):
                metadata, and relevance scores.
 
        Raises:
-            FileNotFoundError: If the collection managed by this service does not exist.
+            FileNotFoundError: If the index/table managed by this service does not exist or path is invalid.
            RuntimeError: For other failures during search.
            TypeError: If the query type is incompatible with the backend/options.
        """
@@ -129,10 +128,10 @@ class SearchServiceProtocol(Protocol):
        self,
    ) -> bool:
        """
-        Deletes the entire index/collection managed by this service instance.
+        Deletes the entire index/table managed by this service instance.
 
        Returns:
-            True if deletion was successful or collection didn't exist,
+            True if deletion was successful or index/table didn't exist,
            False if deletion failed.
 
        Raises:
@@ -145,7 +144,7 @@ class SearchServiceProtocol(Protocol):
        self,
    ) -> bool:
        """
-        Checks if the index/collection managed by this service instance exists.
+        Checks if the index/table managed by this service instance exists.
 
        Returns:
            True if the index exists, False otherwise.
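The protocol edits above are docstring-level renames (collection → index/table); the method signatures are unchanged. As a hedged illustration of how a caller might drive the `search()` method defined by the protocol, here is a small helper. The concrete service object, the result-dictionary keys (taken from the haystack_utils conversion shown earlier), and the `BaseSearchOptions` import path are assumptions rather than part of this diff.

```python
# Sketch: consuming a SearchServiceProtocol implementation's search() method.
from typing import Any, Dict, List

from natural_pdf.search.search_options import BaseSearchOptions  # path per the import seen above


def run_query(service: Any, query: Any, options: BaseSearchOptions) -> List[Dict[str, Any]]:
    """Run a query against an already-indexed service, tolerating a missing index."""
    try:
        hits = service.search(query, options)
    except FileNotFoundError:
        # Per the protocol docstring: raised when the index/table does not exist yet.
        return []
    for hit in hits:
        # Keys mirror the result dictionaries built in haystack_utils above.
        print(hit.get("score"), hit.get("pdf_path"), hit.get("page_number"))
    return hits
```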