natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,386 @@
1
+ # natural_pdf/search/haystack_utils.py
2
+ import logging
3
+ import os
4
+ from typing import Optional, Dict, Any, List, Union, Tuple, Type
5
+ from pathlib import Path
6
+ from PIL import Image # Ensure Image is imported unconditionally
7
+ import warnings
8
+
9
+ from natural_pdf.search.search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
10
+
11
+ # Set up logger for this module
12
+ logger = logging.getLogger(__name__)
13
+
14
# --- Define flag BEFORE trying Haystack imports ---
# Stays False unless every required Haystack import below succeeds.
HAS_HAYSTACK_EXTRAS = False

# --- Conditional Haystack imports ---
# All required components are imported inside one try block so that a single
# missing package disables search as a whole instead of failing at import time.
try:
    import haystack
    from haystack import Document as HaystackDocument, Pipeline
    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
    from haystack.document_stores.types import DuplicatePolicy, DocumentStore
    from haystack.components.embedders import (
        SentenceTransformersTextEmbedder,
        SentenceTransformersDocumentEmbedder,
    )
    from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever

    # Cohere reranking is optional: a failed import only disables the reranker.
    try:
        from haystack.components.rankers import CohereRanker
    except ImportError:
        CohereRanker = None

    # ChromaDB's own embedding function is optional as well.
    try:
        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
    except ImportError:
        logger.warning("chromadb library not found. Custom embedding models for ChromaDocumentStore may not work.")
        SentenceTransformerEmbeddingFunction = None

    HAS_HAYSTACK_EXTRAS = True  # Every required import succeeded
    logger.debug("Successfully imported Haystack components.")

except ImportError as e:
    # HAS_HAYSTACK_EXTRAS remains False.
    # Log the full error and traceback for debugging.
    logger.error(f"Failed to import Haystack components. Search functionality disabled. Error: {e}", exc_info=True)

    # Placeholders so annotations and `if not <Component>` checks elsewhere in
    # this module still resolve when the extras are not installed.
    BaseDocumentStore = object
    DocumentStore = object  # Dummy for protocol
    BaseEmbedder = object
    BaseTextEmbedder = object
    HaystackDocument = Dict  # Represent as Dict if not available
    Pipeline = None
    SentenceTransformersTextEmbedder = None
    # Previously missing: create_default_document_embedder tests this name.
    SentenceTransformersDocumentEmbedder = None
    ChromaEmbeddingRetriever = None
    CohereRanker = None
    ChromaDocumentStore = None
    DuplicatePolicy = None
    SentenceTransformerEmbeddingFunction = None
63
+
64
+
65
+ # Helper function to check availability and raise error
66
def check_haystack_availability(feature_name: str = "Search"):
    """Fail fast when the optional Haystack extras are not installed.

    Args:
        feature_name: Label of the requested feature; it is embedded in the
            error message.

    Raises:
        ImportError: If the module-level ``HAS_HAYSTACK_EXTRAS`` flag is falsy.
    """
    if HAS_HAYSTACK_EXTRAS:
        return

    message = (
        f"'{feature_name}' requires Haystack extras. "
        "Please install them using: pip install natural-pdf[haystack]"
    )
    raise ImportError(message)
73
+
74
+
75
+ # ===========================
76
+ # Default Component Creators
77
+ # ===========================
78
+
79
def create_default_document_store(
    persist_path: str = "./natural_pdf_index",
    collection_name: str = "natural_pdf_default",
    embedding_model: Optional[str] = None,  # Accepted but not used by this function
) -> DocumentStore:
    """Build a ``ChromaDocumentStore`` with the given path and collection.

    Args:
        persist_path: Passed to ``ChromaDocumentStore`` as ``persist_path``.
        collection_name: Passed to ``ChromaDocumentStore`` as ``collection_name``.
        embedding_model: Currently ignored here; no embedding function is set
            on the store itself.

    Returns:
        The newly constructed store.

    Raises:
        ImportError: If the Haystack extras are not installed.
        RuntimeError: If ``ChromaDocumentStore`` is unavailable or its
            construction fails (the original exception is chained).
    """
    check_haystack_availability("create_default_document_store")
    logger.debug(f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'")

    # Defensive: the availability check above should already have raised.
    if not ChromaDocumentStore:
        raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")

    store_kwargs = {
        "persist_path": persist_path,
        "collection_name": collection_name,
    }
    try:
        chroma_store = ChromaDocumentStore(**store_kwargs)
        logger.info(f"Initialized ChromaDocumentStore (Collection: {collection_name})")
    except Exception as err:
        logger.error(f"Failed to initialize ChromaDocumentStore: {err}", exc_info=True)
        raise RuntimeError(f"Could not create ChromaDocumentStore for collection '{collection_name}'") from err
    return chroma_store
105
+
106
def create_default_text_embedder(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    device: Optional[str] = None,
) -> SentenceTransformersTextEmbedder:
    """Build a ``SentenceTransformersTextEmbedder`` for query embedding.

    Args:
        model_name: Passed to the embedder as ``model``.
        device: Passed to the embedder as ``device``; ``None`` leaves the
            choice to the component.

    Returns:
        The constructed embedder.

    Raises:
        ImportError: If the Haystack extras are not installed.
        RuntimeError: If the component is unavailable or construction fails
            (the original exception is chained).
    """
    check_haystack_availability("create_default_text_embedder")
    logger.debug(f"Creating default SentenceTransformersTextEmbedder with model '{model_name}'")

    if not SentenceTransformersTextEmbedder:
        raise RuntimeError("SentenceTransformersTextEmbedder not available.")

    try:
        text_embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
        # The device is read back from the component rather than from the argument.
        logger.info(f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {text_embedder.device})")
    except Exception as err:
        logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {err}", exc_info=True)
        raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from err
    return text_embedder
123
+
124
def create_default_multimodal_embedder(*args, **kwargs) -> Any:
    """Placeholder for a default multimodal embedder; always raises.

    All arguments are accepted and ignored.

    Raises:
        NotImplementedError: Always, after logging an error.
    """
    logger.error("Default multimodal embedder creation is not yet implemented.")
    reason = (
        "Creating a default multimodal embedder requires a custom component or integration not yet implemented."
        " See: https://docs.haystack.deepset.ai/docs/custom-components"
    )
    raise NotImplementedError(reason)
131
+
132
def create_default_text_reranker(
    api_key: Optional[str] = None,
    model_name: str = "rerank-english-v2.0",  # Default Cohere model
) -> Optional[Any]:
    """Build a ``CohereRanker`` when the component and an API key are available.

    The reranker is optional, so every failure mode except missing Haystack
    extras results in ``None`` instead of an exception.

    Args:
        api_key: Cohere API key. When falsy, the ``COHERE_API_KEY``
            environment variable is used instead.
        model_name: Passed to ``CohereRanker`` as ``model``.

    Returns:
        A ``CohereRanker`` instance, or ``None`` if the component is not
        importable, no API key was found, or construction raised.

    Raises:
        ImportError: If the Haystack extras are not installed.
    """
    check_haystack_availability("create_default_text_reranker (optional)")

    if not CohereRanker:
        logger.debug("CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation.")
        return None

    # An explicit argument wins over the environment variable.
    cohere_api_key = api_key if api_key else os.environ.get("COHERE_API_KEY")
    if not cohere_api_key:
        logger.warning("COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker.")
        return None

    logger.debug(f"Creating CohereRanker with model '{model_name}'")
    try:
        ranker = CohereRanker(api_key=cohere_api_key, model=model_name)
        logger.info(f"Initialized CohereRanker (Model: {model_name})")
    except Exception as err:
        # Reranking is optional: report the failure and carry on without it.
        logger.error(f"Failed to initialize CohereRanker: {err}", exc_info=True)
        return None
    return ranker
165
+
166
+ # --- Default Document Embedder Creator ---
167
def create_default_document_embedder(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    device: Optional[str] = None,
    progress_bar: bool = True,
    normalize_embeddings: bool = False,
) -> Any:  # Concrete type depends on whether the extras are installed
    """Build and warm up a ``SentenceTransformersDocumentEmbedder``.

    Args:
        model_name: The Sentence Transformers model name or path.
        device: The device (e.g., 'cpu', 'cuda') to use; ``None`` is passed
            through unchanged to the component.
        progress_bar: Show a progress bar during embedding.
        normalize_embeddings: Normalize embeddings to unit length.

    Returns:
        The embedder, after ``warm_up()`` has been called on it.

    Raises:
        ImportError: If the Haystack extras or the embedder class are not
            available.
        RuntimeError: If construction or warm-up fails (the original
            exception is chained).
    """
    check_haystack_availability("SentenceTransformersDocumentEmbedder")
    if not SentenceTransformersDocumentEmbedder:
        raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")

    logger.debug(f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")

    embedder_kwargs = {
        "model": model_name,
        "device": device,
        "progress_bar": progress_bar,
        "normalize_embeddings": normalize_embeddings,
    }
    try:
        doc_embedder = SentenceTransformersDocumentEmbedder(**embedder_kwargs)
        doc_embedder.warm_up()
        # The device is read back from the component after initialisation.
        logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {doc_embedder.device}")
        return doc_embedder
    except Exception as err:
        logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {err}", exc_info=True)
        raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from err
213
+
214
+
215
+ # ===========================
216
+ # Helper Functions (Removed _determine_query_embedding)
217
+ # ===========================
218
+
219
+
220
+ # ===========================
221
+ # Central Search Logic
222
+ # ===========================
223
+
224
def _perform_haystack_search(
    query: Union[str, Path, Image.Image],
    document_store: Any,  # Use Any for simplicity now
    collection_name: str,  # Passed for clarity, but Chroma store instance is collection-specific
    embedder: SentenceTransformersTextEmbedder,  # Explicitly expect a text embedder for queries
    options: BaseSearchOptions,
) -> List[Dict[str, Any]]:
    """Run an embedding search through a Haystack pipeline and return plain dicts.

    The text query is embedded with ``embedder``, candidates are fetched with a
    ``ChromaEmbeddingRetriever`` and, when a default reranker can be created,
    passed through it. Most failures are logged and reported as an empty list
    rather than raised.

    Args:
        query: The search query. Only ``str`` is supported; ``Path``, image and
            any other types are logged as errors and produce ``[]``.
        document_store: Store handed to ``ChromaEmbeddingRetriever``.
        collection_name: Used in log messages only.
        embedder: Component whose ``run(text=...)`` returns a mapping that
            contains an ``"embedding"`` entry.
        options: Supplies ``top_k``, ``retriever_top_k``, ``filters``,
            ``use_reranker``, ``reranker_api_key`` and ``reranker_model``.

    Returns:
        One dict per retrieved document with the keys ``pdf_path``,
        ``page_number``, ``score``, ``content_snippet`` and ``metadata``;
        an empty list when nothing was found or a step failed.

    Raises:
        ImportError: If the Haystack extras are not installed.
    """
    if not HAS_HAYSTACK_EXTRAS:
        check_haystack_availability("_perform_haystack_search")
        return []  # Should not be reached due to check

    logger.info(f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})...")
    logger.debug(f" Query type: {type(query).__name__}")
    logger.debug(f" Options: {options}")

    # --- 1. Embed Query (using the provided text embedder) --- #
    text_query: Optional[str] = None
    query_embedding: Optional[List[float]] = None

    if isinstance(query, str):
        text_query = query  # Keep text for potential reranker use
        if not embedder:
            logger.error("Text query provided, but no embedder instance was passed to _perform_haystack_search.")
            return []
        try:
            logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
            embedding_result = embedder.run(text=text_query)
            query_embedding = embedding_result.get("embedding")
            if not query_embedding:
                logger.error(f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'")
                return []
            logger.debug(f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking.")
        except Exception as e:
            logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
            return []
    elif isinstance(query, (Path, Image.Image)):
        # Multi-modal query embedding is not supported by this function.
        logger.error(f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text.")
        return []
    else:
        # Any other type falls through to the missing-embedding check below.
        logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")

    # Without an embedding (e.g., non-text query) the retriever cannot run.
    if query_embedding is None:
        logger.error("Could not obtain query embedding. Cannot perform search.")
        return []

    # --- 2. Set up Retriever --- #
    # Assumes the document_store is a ChromaDocumentStore in this utility's context.
    if not ChromaEmbeddingRetriever:
        logger.error("ChromaEmbeddingRetriever not available.")
        return []

    # retriever_top_k is normally filled in by the options' __post_init__.
    retriever_top_k = options.retriever_top_k
    if retriever_top_k is None:
        logger.warning("options.retriever_top_k was None, defaulting to options.top_k for retriever.")
        retriever_top_k = options.top_k

    retriever = ChromaEmbeddingRetriever(
        document_store=document_store,
        filters=options.filters or {},
        top_k=retriever_top_k,
    )
    logger.debug(f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})")

    # --- 3. Set up Optional Reranker --- #
    reranker_instance = None
    if options.use_reranker in [True, None]:  # Both True and None request a reranker here
        logger.debug("Attempting to initialize reranker...")
        # Only the default text reranker (Cohere) is supported at the moment.
        reranker_instance = create_default_text_reranker(
            api_key=options.reranker_api_key,
            model_name=options.reranker_model or "rerank-english-v2.0",
        )
        if reranker_instance:
            # The reranker produces the final result count.
            reranker_instance.top_k = options.top_k
            logger.info(f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})")
        else:
            logger.warning("Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking.")

    # --- 4. Build and Run Pipeline --- #
    if not Pipeline:
        logger.error("Haystack Pipeline class not available.")
        return []

    search_pipeline = Pipeline()
    search_pipeline.add_component("retriever", retriever)

    # The embedding retriever only needs the query embedding as input.
    pipeline_input = {"retriever": {"query_embedding": query_embedding}}
    last_component_name = "retriever"

    # Decide BEFORE wiring: a reranker that is connected but receives no query
    # input would make the pipeline run fail.
    if reranker_instance and text_query is None:
        logger.error("Reranker requires text query, but it was not available (query might not have been text).")
        logger.warning("Skipping reranker because text query is missing.")
        reranker_instance = None

    if reranker_instance:
        search_pipeline.add_component("reranker", reranker_instance)
        search_pipeline.connect("retriever.documents", "reranker.documents")
        # The reranker needs the query text and the final top_k.
        pipeline_input["reranker"] = {"query": text_query, "top_k": options.top_k}
        last_component_name = "reranker"
        logger.debug("Added reranker to pipeline and configured input.")
    else:
        # No reranker was initialized or it was skipped: results come straight
        # from the retriever, so last_component_name must stay "retriever".
        logger.debug("No reranker in pipeline; results will be read from the retriever.")

    logger.info("Running Haystack search pipeline...")
    try:
        result = search_pipeline.run(pipeline_input)
        logger.info("Haystack search pipeline finished.")
    except Exception as e:
        logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
        return []

    # --- 5. Process Results --- #
    final_documents: List[HaystackDocument] = []
    # Read the output of whichever component ran last.
    if last_component_name in result and result[last_component_name].get("documents"):
        final_documents = result[last_component_name]["documents"]
        logger.debug(f"Processed results from '{last_component_name}' ({len(final_documents)} documents).")
    else:
        logger.warning(f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}")
        return []

    # Convert Haystack documents to the plain-dict output format.
    output_results = []
    for doc in final_documents:
        # getattr keeps this tolerant of objects lacking some attributes.
        doc_score = getattr(doc, 'score', 0.0)
        doc_content = getattr(doc, 'content', None)
        meta = getattr(doc, 'meta', {}) or {}

        output_results.append({
            "pdf_path": meta.get("pdf_path", "Unknown"),
            "page_number": meta.get("page_number", -1),
            "score": doc_score if doc_score is not None else 0.0,  # Score may be None
            "content_snippet": doc_content[:200] + "..." if doc_content else "",
            "metadata": meta,
        })

    logger.info(f"Returning {len(output_results)} relevant results.")
    return output_results
386
+
@@ -0,0 +1,72 @@
1
+ import logging
2
+ from dataclasses import dataclass, field
3
+ from typing import List, Optional, Dict, Any, Tuple, Union, Literal
4
+
5
# Plain ``object`` stands in for the external ranker/embedder base types so
# this module does not import the search backend.
BaseRanker = object
BaseEmbedder = object

logger = logging.getLogger(__name__)


# --- Base Search Options ---
@dataclass
class BaseSearchOptions:
    """Configuration shared by all search operations.

    After construction ``retriever_top_k`` is always populated: when left as
    ``None`` it becomes ``max(top_k * 2, 20)`` if ``use_reranker`` is truthy,
    otherwise ``top_k``.
    """

    # Number of results returned at the end (after retrieval and optional reranking).
    top_k: int = 10
    # Number of candidates the retriever fetches first; derived in __post_init__ when None.
    retriever_top_k: Optional[int] = None
    # Filters applied during retrieval (Haystack standard dictionary format).
    filters: Optional[Dict[str, Any]] = None

    # --- Reranking Configuration ---
    # True requests the default Cohere reranker; False disables it.
    # NOTE(review): __post_init__ below treats None as falsy, while
    # _perform_haystack_search treats None like True -- confirm the intended meaning.
    use_reranker: Optional[bool] = True
    # A specific reranker instance (documented as taking precedence over use_reranker).
    reranker_instance: Optional[BaseRanker] = None
    # Parameters for the default Cohere reranker.
    reranker_model: Optional[str] = None  # Defaults to "rerank-english-v2.0" in util
    reranker_api_key: Optional[str] = None  # Defaults to COHERE_API_KEY env var

    # --- Embedder Configuration (usually set at indexing time, not per query) ---
    # embedder_instance: Optional[BaseEmbedder] = None

    def __post_init__(self):
        """Fill in ``retriever_top_k`` or warn when it is smaller than ``top_k``."""
        if self.retriever_top_k is not None:
            # An explicit value is kept as given; a too-small one is only reported.
            if self.retriever_top_k < self.top_k:
                logger.warning(f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results.")
            return

        # Fetch extra candidates when a reranker will trim them afterwards.
        self.retriever_top_k = max(self.top_k * 2, 20) if self.use_reranker else self.top_k
45
+
46
+ # --- Text Search Specific Options ---
47
@dataclass
class TextSearchOptions(BaseSearchOptions):
    """Search options for text-based semantic search.

    Declares no fields of its own; everything is inherited from
    ``BaseSearchOptions``. Text-specific settings (for example a different
    default reranker model) can be added here later.
    """
54
+
55
+ # --- MultiModal Search Specific Options ---
56
@dataclass
class MultiModalSearchOptions(BaseSearchOptions):
    """Search options for multimodal semantic search."""

    # Request a default multimodal reranker when one is available and no
    # explicit reranker instance was supplied.
    use_multimodal_reranker: bool = True

    # Possible future fields, e.g. default multimodal embedder/reranker models:
    # default_multimodal_embedder_model: str = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
    # default_multimodal_reranker_model: str = "jinaai/jina-reranker-m0"  # Example
65
+
66
+ # --- Union Type ---
67
+ # Defines the types allowed for search configuration.
68
+ SearchOptions = Union[
69
+ TextSearchOptions,
70
+ MultiModalSearchOptions,
71
+ BaseSearchOptions # Include base for typing flexibility
72
+ ]
@@ -0,0 +1,189 @@
1
+ """Defines the protocol for a search service."""
2
+ from typing import Protocol, List, Dict, Any, Optional, Union, Iterable
3
+ from pathlib import Path
4
+ from PIL import Image
5
+
6
+ # Use typing_extensions for Python < 3.8 compatibility if needed,
7
+ # otherwise, typing.Protocol is fine for >= 3.8
8
+ # from typing_extensions import Protocol
9
+
10
+ # Forward declare SearchOptions to avoid circular import if needed,
11
+ # or import if structure allows (assuming it's safe here)
12
+ from natural_pdf.search.search_options import SearchOptions, BaseSearchOptions
13
+
14
+ # Use Dict as placeholder for external Haystack Document type
15
+ HaystackDocument = Dict[str, Any]
16
+
17
+
18
class IndexConfigurationError(RuntimeError):
    """Signals a configuration mismatch encountered during indexing."""
21
+
22
+
23
+ # Add new exception for sync/init safety
24
class IndexExistsError(RuntimeError):
    """Signals an implicit indexing attempt against an existing persistent index.

    Raised when ``force_reindex=True`` was not given.
    """
27
+
28
+
29
class Indexable(Protocol):
    """Minimal structural interface of an object that can be indexed.

    Anything handed to ``SearchServiceProtocol.index`` is expected to provide
    these four methods.
    """

    def get_id(self) -> str:
        """Return an identifier that is unique to this item."""

    def get_metadata(self) -> Dict[str, Any]:
        """Return the metadata dictionary associated with this item."""

    def get_content(self) -> Any:
        """Return the item's primary content.

        How the content is processed (e.g., calling ``.extract_text()`` or
        ``.to_image()``, or using it directly) is up to the search service
        implementation.
        """

    # Added for syncing
    def get_content_hash(self) -> str:
        """Return a hash of the indexable content.

        ``SearchableMixin.sync_index`` uses it to detect changes efficiently,
        so implementations should hash the same content that is used for
        generating embeddings.
        """
59
+
60
+
61
class SearchServiceProtocol(Protocol):
    """Structural interface for services that index and search documents.

    An implementation encapsulates one search backend (e.g., Haystack with
    ChromaDB, Haystack In-Memory). Each service instance is bound to a single
    collection, identified by ``collection_name``.
    """

    collection_name: str
    # Internal state such as persistence or embedding model is an
    # implementation detail and deliberately not part of the protocol.

    def index(
        self,
        documents: Iterable[Indexable],
        embedder_device: Optional[str] = None,
        force_reindex: bool = False,
    ) -> None:
        """Index ``documents`` into this service's collection.

        The implementation is responsible for store and embedder
        creation/retrieval, configuration checks, re-indexing logic
        (including potential deletion), embedding, and writing.

        Args:
            documents: An iterable of objects conforming to ``Indexable``.
            embedder_device: The device ('cpu', 'cuda', etc.) for the
                embedder. The default is defined by the implementation.
            force_reindex: If True, delete the entire existing collection
                (if configuration permits) before indexing.

        Raises:
            IndexConfigurationError: If ``force_reindex`` is False and the
                existing collection has incompatible settings.
            ImportError: If required backend libraries are missing.
            RuntimeError: For other failures during indexing.
        """

    def search(
        self,
        query: Any,  # Any query type; the implementation decides what it accepts
        options: BaseSearchOptions,
    ) -> List[Dict[str, Any]]:
        """Search this service's collection.

        Args:
            query: The search query (type depends on service capabilities).
            options: Search options carrying configuration such as top_k and
                filters.

        Returns:
            A list of result dictionaries, typically containing document
            content, metadata, and relevance scores.

        Raises:
            FileNotFoundError: If the collection managed by this service does
                not exist.
            RuntimeError: For other failures during search.
            TypeError: If the query type is incompatible with the
                backend/options.
        """

    def delete_index(self) -> bool:
        """Delete the entire index/collection managed by this service.

        Returns:
            True if deletion succeeded or the collection did not exist,
            False if deletion failed.

        Raises:
            ImportError: If required backend libraries are missing.
            RuntimeError: For backend errors during deletion.
        """

    def index_exists(self) -> bool:
        """Report whether the index/collection managed by this service exists.

        Returns:
            True if the index exists, False otherwise.

        Raises:
            ImportError: If required backend libraries are missing.
            RuntimeError: For backend errors during the check.
        """

    # --- Methods required for sync_index (full strategy) ---

    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
        """Retrieve documents from the index, optionally with metadata.

        Needed by the 'full' strategy of ``SearchableMixin.sync_index``.

        Args:
            include_metadata: If True, include the 'meta' field in the
                returned dicts. Metadata should include 'content_hash' if
                available.
            **kwargs: Additional backend-specific filtering or retrieval
                options.

        Returns:
            A list of dictionaries, one per document. Each must contain at
            least 'id', and also 'meta' when ``include_metadata`` is True.

        Raises:
            NotImplementedError: If the service does not support listing
                documents.
        """

    def delete_documents(self, ids: List[str]) -> None:
        """Delete the documents with the given IDs from the index.

        Needed by the 'full' strategy of ``SearchableMixin.sync_index``.

        Args:
            ids: A list of document IDs to delete.

        Raises:
            NotImplementedError: If the service does not support deleting
                documents by ID.
            RuntimeError: For backend errors during deletion.
        """

    # Optional: Add methods for getting index stats, etc.
    # def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...