natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +33 -1
- natural_pdf/analyzers/layout/layout_analyzer.py +133 -44
- natural_pdf/analyzers/layout/layout_manager.py +9 -6
- natural_pdf/analyzers/layout/layout_options.py +2 -4
- natural_pdf/analyzers/layout/surya.py +199 -91
- natural_pdf/collections/pdf_collection.py +259 -0
- natural_pdf/core/page.py +97 -69
- natural_pdf/core/pdf.py +382 -171
- natural_pdf/elements/region.py +55 -26
- natural_pdf/exporters/__init__.py +1 -0
- natural_pdf/exporters/searchable_pdf.py +252 -0
- natural_pdf/search/__init__.py +94 -0
- natural_pdf/search/haystack_search_service.py +520 -0
- natural_pdf/search/haystack_utils.py +386 -0
- natural_pdf/search/search_options.py +72 -0
- natural_pdf/search/search_service_protocol.py +189 -0
- natural_pdf/search/searchable_mixin.py +464 -0
- natural_pdf-0.1.3.dist-info/METADATA +137 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/RECORD +22 -13
- natural_pdf-0.1.1.dist-info/METADATA +0 -295
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.1.dist-info → natural_pdf-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,386 @@
|
|
1
|
+
# natural_pdf/search/haystack_utils.py
|
2
|
+
import logging
|
3
|
+
import os
|
4
|
+
from typing import Optional, Dict, Any, List, Union, Tuple, Type
|
5
|
+
from pathlib import Path
|
6
|
+
from PIL import Image # Ensure Image is imported unconditionally
|
7
|
+
import warnings
|
8
|
+
|
9
|
+
from natural_pdf.search.search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
|
10
|
+
|
11
|
+
# Set up logger for this module
logger = logging.getLogger(__name__)

# --- Define flag BEFORE trying Haystack imports ---
# The flag must exist even when the imports below fail; it is flipped to True
# only after every required import has succeeded.
HAS_HAYSTACK_EXTRAS = False

# --- Conditional Haystack imports ---
# Every name imported here is used by the helper functions in this module.
# If any required import fails, the except branch binds placeholder values so
# that annotations and availability checks keep working.
try:
    import haystack
    from haystack import Document as HaystackDocument, Pipeline
    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
    from haystack.document_stores.types import DuplicatePolicy, DocumentStore
    from haystack.components.embedders import (
        SentenceTransformersTextEmbedder,
        SentenceTransformersDocumentEmbedder
    )
    from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
    # The Cohere reranker is optional: a missing import only disables reranking.
    try:
        from haystack.components.rankers import CohereRanker
    except ImportError:
        CohereRanker = None

    # --- ChromaDB embedding function import (optional as well) ---
    try:
        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
    except ImportError:
        logger.warning("chromadb library not found. Custom embedding models for ChromaDocumentStore may not work.")
        SentenceTransformerEmbeddingFunction = None
    # --- End ChromaDB import ---

    HAS_HAYSTACK_EXTRAS = True  # Set to True if imports succeed
    logger.debug("Successfully imported Haystack components.")

except ImportError as e:
    # HAS_HAYSTACK_EXTRAS remains False.
    # Log the full error and traceback for debugging.
    logger.error(f"Failed to import Haystack components. Search functionality disabled. Error: {e}", exc_info=True)

    # Placeholder types/classes for type hinting and basic checks when the
    # extras are not installed.
    BaseDocumentStore = object
    DocumentStore = object  # Dummy for protocol
    BaseEmbedder = object  # Dummy BaseEmbedder
    BaseTextEmbedder = object
    HaystackDocument = Dict  # Represent as Dict if not available
    Pipeline = None
    SentenceTransformersTextEmbedder = None
    # Previously missing: without this placeholder the name was undefined in
    # the fallback path, unlike every other optional component above.
    SentenceTransformersDocumentEmbedder = None
    ChromaEmbeddingRetriever = None  # Dummy for Embedding Retriever
    CohereRanker = None
    ChromaDocumentStore = None
    DuplicatePolicy = None  # Dummy for DuplicatePolicy
    SentenceTransformerEmbeddingFunction = None  # Dummy if kept
|
63
|
+
|
64
|
+
|
65
|
+
# Helper function to check availability and raise error
|
66
|
+
def check_haystack_availability(feature_name: str = "Search"):
    """Guard a feature that depends on the optional Haystack extras.

    Args:
        feature_name: Label of the calling feature; it is quoted in the
            error message.

    Raises:
        ImportError: If the module-level HAS_HAYSTACK_EXTRAS flag is falsy.
    """
    if HAS_HAYSTACK_EXTRAS:
        return

    message = (
        f"'{feature_name}' requires Haystack extras. "
        "Please install them using: pip install natural-pdf[haystack]"
    )
    raise ImportError(message)
|
73
|
+
|
74
|
+
|
75
|
+
# ===========================
|
76
|
+
# Default Component Creators
|
77
|
+
# ===========================
|
78
|
+
|
79
|
+
def create_default_document_store(
    persist_path: str = "./natural_pdf_index",
    collection_name: str = "natural_pdf_default",
    embedding_model: Optional[str] = None  # Allow specifying the model
) -> DocumentStore:
    """Build a ChromaDocumentStore for the given path and collection.

    Args:
        persist_path: Forwarded to ChromaDocumentStore as ``persist_path``.
        collection_name: Forwarded to ChromaDocumentStore as
            ``collection_name``.
        embedding_model: Accepted for API symmetry but not used here; the
            store is created without an embedding function.

    Returns:
        The newly created ChromaDocumentStore.

    Raises:
        ImportError: If the Haystack extras are not installed.
        RuntimeError: If ChromaDocumentStore is unavailable or its
            construction fails (the original exception is chained).
    """
    check_haystack_availability("create_default_document_store")
    logger.debug(f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'")
    if not ChromaDocumentStore:
        # Normally unreachable after the availability check; kept as a safety net.
        raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")

    # Only location and collection are configured on the store itself.
    store_kwargs = {
        "persist_path": persist_path,
        "collection_name": collection_name,
    }
    try:
        chroma_store = ChromaDocumentStore(**store_kwargs)
        logger.info(f"Initialized ChromaDocumentStore (Collection: {collection_name})")
    except Exception as exc:
        logger.error(f"Failed to initialize ChromaDocumentStore: {exc}", exc_info=True)
        raise RuntimeError(f"Could not create ChromaDocumentStore for collection '{collection_name}'") from exc
    return chroma_store
|
105
|
+
|
106
|
+
def create_default_text_embedder(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    device: Optional[str] = None  # Add device parameter
) -> SentenceTransformersTextEmbedder:
    """Create a SentenceTransformersTextEmbedder for embedding query text.

    Args:
        model_name: Sentence Transformers model name or path.
        device: Optional device string such as 'cpu' or 'cuda'. A string is
            converted to a Haystack ComponentDevice when that class is
            available; None is passed through unchanged.

    Returns:
        The initialized embedder.

    Raises:
        ImportError: If the Haystack extras are not installed.
        RuntimeError: If the embedder class is unavailable or initialization
            fails (the original exception is chained).
    """
    check_haystack_availability("create_default_text_embedder")
    logger.debug(f"Creating default SentenceTransformersTextEmbedder with model '{model_name}'")
    if not SentenceTransformersTextEmbedder:
        raise RuntimeError("SentenceTransformersTextEmbedder not available.")
    try:
        resolved_device = device
        if isinstance(device, str):
            # Haystack 2.x embedders expect None or a ComponentDevice rather
            # than a plain string, so convert here. Fall back to the raw
            # string if ComponentDevice cannot be imported.
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                pass
            else:
                resolved_device = ComponentDevice.from_str(device)

        embedder = SentenceTransformersTextEmbedder(model=model_name, device=resolved_device)
        logger.info(f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})")
        return embedder
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
|
123
|
+
|
124
|
+
def create_default_multimodal_embedder(*args, **kwargs) -> Any:
    """Placeholder for a default multimodal embedder factory.

    All arguments are ignored. The function logs an error and always raises.

    Raises:
        NotImplementedError: Always.
    """
    logger.error("Default multimodal embedder creation is not yet implemented.")
    message = (
        "Creating a default multimodal embedder requires a custom component or integration not yet implemented."
        " See: https://docs.haystack.deepset.ai/docs/custom-components"
    )
    raise NotImplementedError(message)
|
131
|
+
|
132
|
+
def create_default_text_reranker(
    api_key: Optional[str] = None,
    model_name: str = "rerank-english-v2.0"  # Default Cohere model
) -> Optional[Any]:  # Returns CohereRanker instance or None
    """Try to build a Cohere reranker; return None when that is not possible.

    The API key is taken from ``api_key`` first and from the COHERE_API_KEY
    environment variable otherwise.

    Args:
        api_key: Cohere API key; overrides the environment variable.
        model_name: Cohere rerank model name.

    Returns:
        A CohereRanker instance, or None if the CohereRanker class is not
        importable, no API key is found, or construction fails.

    Raises:
        ImportError: If the Haystack extras are not installed.
    """
    check_haystack_availability("create_default_text_reranker (optional)")

    if not CohereRanker:
        logger.debug("CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation.")
        return None

    # An explicit argument wins over the environment variable.
    resolved_key = api_key or os.environ.get("COHERE_API_KEY")
    if not resolved_key:
        logger.warning("COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker.")
        return None

    logger.debug(f"Creating CohereRanker with model '{model_name}'")
    try:
        ranker = CohereRanker(api_key=resolved_key, model=model_name)
        logger.info(f"Initialized CohereRanker (Model: {model_name})")
    except Exception as exc:
        # Reranking is optional, so a failure here is logged rather than raised.
        logger.error(f"Failed to initialize CohereRanker: {exc}", exc_info=True)
        return None
    return ranker
|
165
|
+
|
166
|
+
# --- Default Document Embedder Creator ---
|
167
|
+
def create_default_document_embedder(
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    device: Optional[str] = None,
    progress_bar: bool = True,
    normalize_embeddings: bool = False  # Changed default based on ST documentation
) -> Any:  # Return Any as actual type depends on availability
    """Create and warm up a SentenceTransformersDocumentEmbedder.

    Args:
        model_name: The Sentence Transformers model name or path.
        device: Optional device string (e.g., 'cpu', 'cuda'). A string is
            converted to a Haystack ComponentDevice when that class is
            available; None is passed through unchanged.
        progress_bar: Show a progress bar during embedding.
        normalize_embeddings: Normalize embeddings to unit length.

    Returns:
        The embedder, after ``warm_up()`` has been called on it.

    Raises:
        ImportError: If the Haystack extras or the embedder class are not
            available.
        RuntimeError: If initialization or warm-up fails (the original
            exception is chained).
    """
    check_haystack_availability("SentenceTransformersDocumentEmbedder")
    if not SentenceTransformersDocumentEmbedder:
        raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")

    logger.debug(f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")

    try:
        resolved_device = device
        if isinstance(device, str):
            # Haystack 2.x embedders expect None or a ComponentDevice rather
            # than a plain string, so convert here. Fall back to the raw
            # string if ComponentDevice cannot be imported.
            try:
                from haystack.utils import ComponentDevice
            except ImportError:
                pass
            else:
                resolved_device = ComponentDevice.from_str(device)

        embedder = SentenceTransformersDocumentEmbedder(
            model=model_name,
            device=resolved_device,
            progress_bar=progress_bar,
            normalize_embeddings=normalize_embeddings,
            # Embedding of meta fields is not configured here; add a parameter
            # if that is ever needed.
        )
        embedder.warm_up()
        logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}")  # Use embedder.device after init
    except Exception as e:
        logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
        raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e

    return embedder
|
213
|
+
|
214
|
+
|
215
|
+
# ===========================
|
216
|
+
# Helper Functions (Removed _determine_query_embedding)
|
217
|
+
# ===========================
|
218
|
+
|
219
|
+
|
220
|
+
# ===========================
|
221
|
+
# Central Search Logic
|
222
|
+
# ===========================
|
223
|
+
|
224
|
+
def _perform_haystack_search(
    query: Union[str, Path, Image.Image],
    document_store: Any,  # Use Any for simplicity now
    collection_name: str,  # Passed for clarity, but Chroma store instance is collection-specific
    embedder: SentenceTransformersTextEmbedder,  # Explicitly expect a text embedder for queries
    options: BaseSearchOptions
) -> List[Dict[str, Any]]:
    """Run an embedding search through a Haystack pipeline.

    The query text is embedded with ``embedder``, candidates are fetched with
    a ChromaEmbeddingRetriever, and, if a Cohere reranker can be created, the
    candidates are reranked before being returned.

    Args:
        query: The search query. Only ``str`` queries are supported; any
            other type is logged as an error and yields an empty result.
        document_store: Store handed to ChromaEmbeddingRetriever.
        collection_name: Used for logging only.
        embedder: Text embedder whose ``run(text=...)`` result provides the
            query embedding under the ``"embedding"`` key.
        options: Search options; ``top_k``, ``retriever_top_k``, ``filters``,
            ``use_reranker``, ``reranker_api_key`` and ``reranker_model`` are
            read.

    Returns:
        A list of dicts with the keys ``pdf_path``, ``page_number``,
        ``score``, ``content_snippet`` and ``metadata``. Most failures after
        the availability check are logged and reported as an empty list.

    Raises:
        ImportError: If the Haystack extras are not installed.
    """
    if not HAS_HAYSTACK_EXTRAS:
        check_haystack_availability("_perform_haystack_search")
        return []  # Should not be reached due to check

    logger.info(f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})...")
    logger.debug(f"  Query type: {type(query).__name__}")
    logger.debug(f"  Options: {options}")

    # --- 1. Embed Query (using the provided text embedder) --- #
    text_query: Optional[str] = None
    query_embedding: Optional[List[float]] = None

    if isinstance(query, str):
        text_query = query  # Keep text for potential reranker use
        if not embedder:
            logger.error("Text query provided, but no embedder instance was passed to _perform_haystack_search.")
            return []
        try:
            logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
            embedding_result = embedder.run(text=text_query)
            query_embedding = embedding_result.get("embedding")
            if not query_embedding:
                logger.error(f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'")
                return []
            logger.debug(f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking.")
        except Exception as e:
            logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
            return []
    elif isinstance(query, (Path, Image.Image)):
        # Multi-modal query embedding is not supported by this function.
        logger.error(f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text.")
        return []
    else:
        # Any other unexpected type: nothing to embed, so stop here.
        logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")
        return []

    # Defensive check: without an embedding the retriever cannot run.
    if query_embedding is None:
        logger.error("Could not obtain query embedding. Cannot perform search.")
        return []

    # --- 2. Set up Retriever --- #
    # Assumes the document_store is a ChromaDocumentStore.
    if not ChromaEmbeddingRetriever:
        logger.error("ChromaEmbeddingRetriever not available.")
        return []

    # retriever_top_k is normally filled in by BaseSearchOptions.__post_init__.
    retriever_top_k = options.retriever_top_k
    if retriever_top_k is None:
        logger.warning("options.retriever_top_k was None, defaulting to options.top_k for retriever.")
        retriever_top_k = options.top_k

    retriever = ChromaEmbeddingRetriever(
        document_store=document_store,
        filters=options.filters or {},  # Pass filters here
        top_k=retriever_top_k,
    )
    logger.debug(f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})")

    # --- 3. Set up Optional Reranker --- #
    reranker_instance = None
    if options.use_reranker in [True, None]:  # Check specifically for True or None
        logger.debug("Attempting to initialize reranker...")
        # Currently only supports the default text reranker (Cohere).
        reranker_instance = create_default_text_reranker(
            api_key=options.reranker_api_key,
            model_name=options.reranker_model or "rerank-english-v2.0",
        )
        if reranker_instance:
            # The reranker produces the final result list, so it gets the final top_k.
            reranker_instance.top_k = options.top_k
            logger.info(f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})")
        else:
            logger.warning("Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking.")

    # --- 4. Build and Run Pipeline --- #
    if not Pipeline:
        logger.error("Haystack Pipeline class not available.")
        return []

    search_pipeline = Pipeline()
    search_pipeline.add_component("retriever", retriever)

    # The embedding retriever only needs the query embedding as input.
    pipeline_input = {"retriever": {"query_embedding": query_embedding}}
    # Name of the component whose output holds the final documents.
    last_component_name = "retriever"

    if reranker_instance and text_query is None:
        # Decide this BEFORE wiring the reranker in, so the pipeline never
        # contains a connected component that receives no query input.
        logger.error("Reranker requires text query, but it was not available (query might not have been text).")
        logger.warning("Skipping reranker because text query is missing.")
        reranker_instance = None

    if reranker_instance:
        search_pipeline.add_component("reranker", reranker_instance)
        search_pipeline.connect("retriever.documents", "reranker.documents")
        pipeline_input["reranker"] = {"query": text_query, "top_k": options.top_k}  # Pass query and final top_k
        last_component_name = "reranker"
        logger.debug("Added reranker to pipeline and configured input.")
    else:
        # No reranker: results must be read from the retriever output.
        # (Previously this branch pointed at "reranker", a component that is
        # not in the pipeline, so every search without a reranker came back empty.)
        logger.debug("No reranker in pipeline; results will be read from the retriever.")

    logger.info("Running Haystack search pipeline...")
    try:
        result = search_pipeline.run(pipeline_input)
        logger.info("Haystack search pipeline finished.")
    except Exception as e:
        logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
        return []

    # --- 5. Process Results --- #
    final_documents: List[HaystackDocument] = []
    # Pipeline output is keyed by component name; read the last component's documents.
    if last_component_name in result and result[last_component_name].get("documents"):
        final_documents = result[last_component_name]["documents"]
        logger.debug(f"Processed results from '{last_component_name}' ({len(final_documents)} documents).")
    else:
        logger.warning(f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}")
        return []

    # Convert Haystack Documents to plain result dictionaries.
    output_results = []
    for doc in final_documents:
        # getattr with defaults tolerates objects that lack some attributes.
        doc_score = getattr(doc, 'score', 0.0)
        doc_content = getattr(doc, 'content', None)
        meta = getattr(doc, 'meta', {}) or {}

        output_results.append({
            "pdf_path": meta.get("pdf_path", "Unknown"),
            "page_number": meta.get("page_number", -1),
            "score": doc_score if doc_score is not None else 0.0,  # Handle potential None score
            "content_snippet": doc_content[:200] + "..." if doc_content else "",  # Add snippet
            "metadata": meta,
        })

    logger.info(f"Returning {len(output_results)} relevant results.")
    return output_results
|
386
|
+
|
@@ -0,0 +1,72 @@
|
|
1
|
+
import logging
|
2
|
+
from dataclasses import dataclass, field
|
3
|
+
from typing import List, Optional, Dict, Any, Tuple, Union, Literal
|
4
|
+
|
5
|
+
# Use object placeholders for external types to avoid direct dependency
|
6
|
+
BaseRanker = object
|
7
|
+
BaseEmbedder = object
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
# --- Base Search Options ---
|
12
|
+
@dataclass
class BaseSearchOptions:
    """Settings shared by every kind of search request."""

    # Number of results handed back once retrieval (and any reranking) is done.
    top_k: int = 10
    # Number of candidates the retriever fetches first. When left as None,
    # __post_init__ fills it in: max(top_k * 2, 20) if use_reranker is truthy,
    # otherwise top_k.
    retriever_top_k: Optional[int] = None
    # Retrieval-time filters (Haystack standard dictionary format).
    filters: Optional[Dict[str, Any]] = None

    # --- Reranking Configuration ---
    # True asks for the default Cohere reranker; False disables reranking.
    # __post_init__ treats None like False when sizing retriever_top_k.
    # NOTE(review): haystack_utils._perform_haystack_search treats None as
    # "attempt reranking" - confirm which meaning is intended.
    use_reranker: Optional[bool] = True
    # A ready-made reranker, intended to take precedence over use_reranker.
    # NOTE(review): haystack_utils._perform_haystack_search does not read this
    # field - confirm where it is honoured.
    reranker_instance: Optional[BaseRanker] = None
    # Settings for the default Cohere reranker.
    reranker_model: Optional[str] = None  # The search util falls back to "rerank-english-v2.0"
    reranker_api_key: Optional[str] = None  # The util falls back to the COHERE_API_KEY env var

    # --- Embedder Configuration (usually fixed at indexing time) ---
    # embedder_instance: Optional[BaseEmbedder] = None  # Might be useful for advanced cases

    def __post_init__(self):
        """Fill in retriever_top_k when unset; warn when it is below top_k."""
        if self.retriever_top_k is not None:
            if self.retriever_top_k < self.top_k:
                logger.warning(f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results.")
            return

        # Fetch extra candidates when a reranker will narrow them down later.
        self.retriever_top_k = max(self.top_k * 2, 20) if self.use_reranker else self.top_k
|
45
|
+
|
46
|
+
# --- Text Search Specific Options ---
|
47
|
+
@dataclass
class TextSearchOptions(BaseSearchOptions):
    """Options for text-based semantic search.

    Adds no fields of its own; everything is inherited from BaseSearchOptions.
    """
    # Text-specific overrides can be added here later, for example a
    # different default reranker model name:
    # default_text_reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
54
|
+
|
55
|
+
# --- MultiModal Search Specific Options ---
|
56
|
+
@dataclass
class MultiModalSearchOptions(BaseSearchOptions):
    """Options for multimodal semantic search.

    Inherits every field from BaseSearchOptions and adds one flag.
    """
    # Whether to try a default multimodal reranker when reranking is requested
    # (use_reranker is True/None) and no reranker_instance was supplied.
    # NOTE(review): no code visible alongside this class reads the flag -
    # confirm where it takes effect.
    use_multimodal_reranker: bool = True
|
62
|
+
# e.g., specifying default multimodal embedder/reranker models
|
63
|
+
# default_multimodal_embedder_model: str = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
|
64
|
+
# default_multimodal_reranker_model: str = "jinaai/jina-reranker-m0" # Example
|
65
|
+
|
66
|
+
# --- Union Type ---
|
67
|
+
# Defines the types allowed for search configuration.
|
68
|
+
# Any of the option classes is accepted; the base class is listed as well for
# typing flexibility.
SearchOptions = Union[TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions]
|
@@ -0,0 +1,189 @@
|
|
1
|
+
"""Defines the protocol for a search service."""
|
2
|
+
from typing import Protocol, List, Dict, Any, Optional, Union, Iterable
|
3
|
+
from pathlib import Path
|
4
|
+
from PIL import Image
|
5
|
+
|
6
|
+
# Use typing_extensions for Python < 3.8 compatibility if needed,
|
7
|
+
# otherwise, typing.Protocol is fine for >= 3.8
|
8
|
+
# from typing_extensions import Protocol
|
9
|
+
|
10
|
+
# Forward declare SearchOptions to avoid circular import if needed,
|
11
|
+
# or import if structure allows (assuming it's safe here)
|
12
|
+
from natural_pdf.search.search_options import SearchOptions, BaseSearchOptions
|
13
|
+
|
14
|
+
# Use Dict as placeholder for external Haystack Document type
|
15
|
+
HaystackDocument = Dict[str, Any]
|
16
|
+
|
17
|
+
|
18
|
+
class IndexConfigurationError(RuntimeError):
    """Signals a configuration mismatch encountered during indexing."""
|
21
|
+
|
22
|
+
|
23
|
+
# Add new exception for sync/init safety
|
24
|
+
class IndexExistsError(RuntimeError):
    """Signals an implicit indexing attempt against an existing persistent
    index when force_reindex=True was not given."""
|
27
|
+
|
28
|
+
|
29
|
+
class Indexable(Protocol):
    """Structural interface for anything that can be put into a search index.

    Items handed to SearchServiceProtocol.index are expected to provide these
    four methods.
    """

    def get_id(self) -> str:
        """Give the unique identifier of this item."""
        ...

    def get_metadata(self) -> Dict[str, Any]:
        """Give the metadata dictionary that belongs to this item."""
        ...

    def get_content(self) -> Any:
        """Give the item's primary content.

        How the content is processed (e.g., calling .extract_text() or
        .to_image(), or using it directly) is left to the SearchService
        implementation.
        """
        ...

    # Added for syncing
    def get_content_hash(self) -> str:
        """Give a hash of the indexable content.

        SearchableMixin.sync_index uses it to detect changes efficiently.
        Implementations should hash the same content that embeddings are
        generated from.
        """
        ...
|
59
|
+
|
60
|
+
|
61
|
+
class SearchServiceProtocol(Protocol):
    """Structural interface for a service that indexes and searches documents.

    An implementation wraps one concrete search backend (e.g., Haystack with
    ChromaDB, Haystack In-Memory). Each service instance is bound to a single
    collection, named by ``collection_name``.
    """
    collection_name: str
    # Internal state such as persistence or embedding model is an
    # implementation detail and deliberately not part of the protocol.

    def index(
        self,
        documents: Iterable[Indexable],
        embedder_device: Optional[str] = None,
        force_reindex: bool = False,
    ) -> None:
        """Index ``documents`` into this service's collection.

        The implementation is responsible for creating or fetching the store
        and embedder, checking configuration, re-indexing (including possible
        deletion), embedding, and writing.

        Args:
            documents: Iterable of objects that satisfy the Indexable protocol.
            embedder_device: Device for the embedder ('cpu', 'cuda', etc.).
                The default is chosen by the implementation.
            force_reindex: When True, the whole existing collection is deleted
                (if configuration permits) before indexing.

        Raises:
            IndexConfigurationError: If ``force_reindex`` is False and the
                existing collection has incompatible settings.
            ImportError: If required backend libraries are missing.
            RuntimeError: For other failures during indexing.
        """
        ...

    def search(
        self,
        query: Any,  # Any query type is allowed; the implementation decides what it supports
        options: BaseSearchOptions,
    ) -> List[Dict[str, Any]]:
        """Search this service's collection.

        Args:
            query: The search query; accepted types depend on the service.
            options: SearchOptions object carrying settings such as top_k and
                filters.

        Returns:
            A list of result dictionaries, typically with document content,
            metadata, and relevance scores.

        Raises:
            FileNotFoundError: If this service's collection does not exist.
            RuntimeError: For other failures during search.
            TypeError: If the query type does not fit the backend/options.
        """
        ...

    def delete_index(
        self,
    ) -> bool:
        """Delete the whole index/collection owned by this service instance.

        Returns:
            True if deletion succeeded or the collection did not exist,
            False if deletion failed.

        Raises:
            ImportError: If required backend libraries are missing.
            RuntimeError: For backend errors during deletion.
        """
        ...

    def index_exists(
        self,
    ) -> bool:
        """Report whether this service's index/collection exists.

        Returns:
            True if the index exists, False otherwise.

        Raises:
            ImportError: If required backend libraries are missing.
            RuntimeError: For backend errors during the check.
        """
        ...

    # --- Methods required for sync_index (full strategy) ---

    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
        """Fetch documents from the index, optionally with their metadata.

        Needed by the 'full' strategy of SearchableMixin.sync_index.

        Args:
            include_metadata: When True, each returned dict carries a 'meta'
                field; that metadata should contain 'content_hash' if
                available.
            **kwargs: Extra backend-specific filtering or retrieval options.

        Returns:
            A list of dictionaries, one per document. Each has at least 'id',
            plus 'meta' when include_metadata=True.

        Raises:
            NotImplementedError: If the service cannot list documents.
        """
        ...

    def delete_documents(self, ids: List[str]) -> None:
        """Remove the documents with the given IDs from the index.

        Needed by the 'full' strategy of SearchableMixin.sync_index.

        Args:
            ids: IDs of the documents to delete.

        Raises:
            NotImplementedError: If the service cannot delete documents by ID.
            RuntimeError: For backend errors during deletion.
        """
        ...
|
187
|
+
|
188
|
+
# Optional: Add methods for getting index stats, etc.
|
189
|
+
# def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
|