natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +209 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +288 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +413 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +512 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +604 -0
- docs/tutorials/12-ocr-integration.md +175 -0
- docs/tutorials/13-semantic-search.ipynb +1328 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +50 -33
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/gemini.py +264 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +125 -58
- natural_pdf/analyzers/layout/layout_options.py +43 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +89 -45
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +146 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +419 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +1044 -521
- natural_pdf/core/pdf.py +516 -313
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +307 -225
- natural_pdf/elements/collections.py +805 -543
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +889 -879
- natural_pdf/elements/text.py +127 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +57 -35
- natural_pdf/ocr/engine.py +150 -46
- natural_pdf/ocr/engine_easyocr.py +146 -150
- natural_pdf/ocr/engine_paddle.py +118 -175
- natural_pdf/ocr/engine_surya.py +78 -141
- natural_pdf/ocr/ocr_factory.py +114 -0
- natural_pdf/ocr/ocr_manager.py +122 -124
- natural_pdf/ocr/ocr_options.py +16 -20
- natural_pdf/ocr/utils.py +98 -0
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/templates/spa/css/style.css +334 -0
- natural_pdf/templates/spa/index.html +31 -0
- natural_pdf/templates/spa/js/app.js +472 -0
- natural_pdf/templates/spa/words.txt +235976 -0
- natural_pdf/utils/debug.py +32 -0
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/identifiers.py +29 -0
- natural_pdf/utils/packaging.py +418 -0
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
- natural_pdf-0.1.6.dist-info/RECORD +141 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
- natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- natural_pdf/templates/ocr_debug.html +0 -517
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,30 +1,38 @@
|
|
1
1
|
# natural_pdf/search/haystack_utils.py
|
2
2
|
import logging
|
3
3
|
import os
|
4
|
-
from typing import Optional, Dict, Any, List, Union, Tuple, Type
|
5
|
-
from pathlib import Path
|
6
|
-
from PIL import Image # Ensure Image is imported unconditionally
|
7
4
|
import warnings
|
5
|
+
from pathlib import Path
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
7
|
+
|
8
|
+
from PIL import Image # Ensure Image is imported unconditionally
|
8
9
|
|
9
|
-
from natural_pdf.search.search_options import
|
10
|
+
from natural_pdf.search.search_options import (
|
11
|
+
BaseSearchOptions,
|
12
|
+
MultiModalSearchOptions,
|
13
|
+
SearchOptions,
|
14
|
+
TextSearchOptions,
|
15
|
+
)
|
10
16
|
|
11
17
|
# Set up logger for this module
|
12
18
|
logger = logging.getLogger(__name__)
|
13
19
|
|
14
20
|
# --- Define flag BEFORE trying Haystack imports ---
|
15
|
-
HAS_HAYSTACK_EXTRAS = False
|
21
|
+
HAS_HAYSTACK_EXTRAS = False # Default to False
|
16
22
|
|
17
23
|
# --- Conditional Haystack Imports (Restoring Error Catching with Traceback Logging) ---
|
18
24
|
try:
|
19
25
|
import haystack
|
20
|
-
from haystack import Document as HaystackDocument
|
21
|
-
from
|
22
|
-
from haystack.document_stores.types import DuplicatePolicy, DocumentStore
|
26
|
+
from haystack import Document as HaystackDocument
|
27
|
+
from haystack import Pipeline
|
23
28
|
from haystack.components.embedders import (
|
29
|
+
SentenceTransformersDocumentEmbedder,
|
24
30
|
SentenceTransformersTextEmbedder,
|
25
|
-
SentenceTransformersDocumentEmbedder
|
26
31
|
)
|
32
|
+
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
27
33
|
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
|
34
|
+
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
35
|
+
|
28
36
|
# Keep try/except for optional Cohere
|
29
37
|
try:
|
30
38
|
from haystack.components.rankers import CohereRanker
|
@@ -33,33 +41,37 @@ try:
|
|
33
41
|
|
34
42
|
# --- Add ChromaDB embedding function import ---
|
35
43
|
try:
|
36
|
-
|
44
|
+
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
37
45
|
except ImportError:
|
38
|
-
|
39
|
-
|
46
|
+
logger.warning(
|
47
|
+
"chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
|
48
|
+
)
|
49
|
+
SentenceTransformerEmbeddingFunction = None
|
40
50
|
# --- End ChromaDB import ---
|
41
51
|
|
42
|
-
HAS_HAYSTACK_EXTRAS = True
|
52
|
+
HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
|
43
53
|
logger.debug("Successfully imported Haystack components.")
|
44
54
|
|
45
55
|
except ImportError as e:
|
46
56
|
# HAS_HAYSTACK_EXTRAS remains False
|
47
57
|
# Log the full error and traceback for debugging
|
48
|
-
logger.
|
58
|
+
logger.warning(
|
59
|
+
f"Failed to import Haystack components. Semantic search functionality disabled.",
|
60
|
+
)
|
49
61
|
|
50
62
|
# Define dummy types/classes for type hinting and basic checks when extras aren't installed
|
51
63
|
BaseDocumentStore = object
|
52
|
-
DocumentStore = object
|
53
|
-
BaseEmbedder = object
|
64
|
+
DocumentStore = object # Dummy for protocol
|
65
|
+
BaseEmbedder = object # Define dummy BaseEmbedder
|
54
66
|
BaseTextEmbedder = object
|
55
|
-
HaystackDocument = Dict
|
67
|
+
HaystackDocument = Dict # Represent as Dict if not available
|
56
68
|
Pipeline = None
|
57
69
|
SentenceTransformersTextEmbedder = None
|
58
|
-
ChromaEmbeddingRetriever = None
|
70
|
+
ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
|
59
71
|
CohereRanker = None
|
60
72
|
ChromaDocumentStore = None
|
61
|
-
DuplicatePolicy = None
|
62
|
-
SentenceTransformerEmbeddingFunction = None
|
73
|
+
DuplicatePolicy = None # Dummy for DuplicatePolicy
|
74
|
+
SentenceTransformerEmbeddingFunction = None # Dummy if kept
|
63
75
|
|
64
76
|
|
65
77
|
# Helper function to check availability and raise error
|
@@ -76,16 +88,19 @@ def check_haystack_availability(feature_name: str = "Search"):
|
|
76
88
|
# Default Component Creators
|
77
89
|
# ===========================
|
78
90
|
|
91
|
+
|
79
92
|
def create_default_document_store(
|
80
93
|
persist_path: str = "./natural_pdf_index",
|
81
94
|
collection_name: str = "natural_pdf_default",
|
82
|
-
embedding_model: Optional[str] = None
|
95
|
+
embedding_model: Optional[str] = None, # Allow specifying the model
|
83
96
|
) -> DocumentStore:
|
84
97
|
"""Creates a default ChromaDB DocumentStore."""
|
85
98
|
check_haystack_availability("create_default_document_store")
|
86
|
-
logger.debug(
|
87
|
-
|
88
|
-
|
99
|
+
logger.debug(
|
100
|
+
f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'"
|
101
|
+
)
|
102
|
+
if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
|
103
|
+
raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
|
89
104
|
|
90
105
|
try:
|
91
106
|
# Note: For Haystack's Chroma integration, the embedding model is typically handled
|
@@ -101,11 +116,14 @@ def create_default_document_store(
|
|
101
116
|
return store
|
102
117
|
except Exception as e:
|
103
118
|
logger.error(f"Failed to initialize ChromaDocumentStore: {e}", exc_info=True)
|
104
|
-
raise RuntimeError(
|
119
|
+
raise RuntimeError(
|
120
|
+
f"Could not create ChromaDocumentStore for collection '{collection_name}'"
|
121
|
+
) from e
|
122
|
+
|
105
123
|
|
106
124
|
def create_default_text_embedder(
|
107
125
|
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
108
|
-
device: Optional[str] = None
|
126
|
+
device: Optional[str] = None, # Add device parameter
|
109
127
|
) -> SentenceTransformersTextEmbedder:
|
110
128
|
"""Creates a default SentenceTransformer text embedder."""
|
111
129
|
check_haystack_availability("create_default_text_embedder")
|
@@ -115,11 +133,16 @@ def create_default_text_embedder(
|
|
115
133
|
try:
|
116
134
|
# Use Haystack component which handles device logic
|
117
135
|
embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
|
118
|
-
logger.info(
|
136
|
+
logger.info(
|
137
|
+
f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})"
|
138
|
+
)
|
119
139
|
return embedder
|
120
140
|
except Exception as e:
|
121
141
|
logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
|
122
|
-
raise RuntimeError(
|
142
|
+
raise RuntimeError(
|
143
|
+
f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
|
144
|
+
) from e
|
145
|
+
|
123
146
|
|
124
147
|
def create_default_multimodal_embedder(*args, **kwargs) -> Any:
|
125
148
|
"""Stub for creating a default multimodal embedder (Not Implemented)."""
|
@@ -129,47 +152,52 @@ def create_default_multimodal_embedder(*args, **kwargs) -> Any:
|
|
129
152
|
" See: https://docs.haystack.deepset.ai/docs/custom-components"
|
130
153
|
)
|
131
154
|
|
155
|
+
|
132
156
|
def create_default_text_reranker(
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
api_key: Optional[str] = None, model_name: str = "rerank-english-v2.0" # Default Cohere model
|
158
|
+
) -> Optional[Any]: # Returns CohereRanker instance or None
|
159
|
+
"""
|
160
|
+
Creates a default Cohere Reranker if available and API key provided.
|
161
|
+
|
162
|
+
Requires COHERE_API_KEY environment variable or api_key argument.
|
163
|
+
Requires haystack-cohere integration: pip install haystack-cohere
|
164
|
+
"""
|
165
|
+
check_haystack_availability("create_default_text_reranker (optional)")
|
166
|
+
|
167
|
+
if not CohereRanker:
|
168
|
+
logger.debug(
|
169
|
+
"CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation."
|
170
|
+
)
|
171
|
+
return None
|
172
|
+
|
173
|
+
# Check for API key (prefer argument over environment variable)
|
174
|
+
cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
|
175
|
+
if not cohere_api_key:
|
176
|
+
logger.warning(
|
177
|
+
"COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker."
|
178
|
+
)
|
179
|
+
return None
|
180
|
+
|
181
|
+
logger.debug(f"Creating CohereRanker with model '{model_name}'")
|
182
|
+
try:
|
183
|
+
# Pass API key via authenticator for better practice if supported, or directly
|
184
|
+
# As of haystack 2.0b5, CohereRanker takes api_key directly
|
185
|
+
reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
|
186
|
+
logger.info(f"Initialized CohereRanker (Model: {model_name})")
|
187
|
+
return reranker
|
188
|
+
except Exception as e:
|
189
|
+
logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
|
190
|
+
# Don't raise, just return None as reranker is optional
|
191
|
+
return None
|
192
|
+
|
165
193
|
|
166
194
|
# --- Default Document Embedder Creator ---
|
167
195
|
def create_default_document_embedder(
|
168
196
|
model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
|
169
197
|
device: Optional[str] = None,
|
170
198
|
progress_bar: bool = True,
|
171
|
-
normalize_embeddings: bool = False
|
172
|
-
) -> Any:
|
199
|
+
normalize_embeddings: bool = False, # Changed default based on ST documentation
|
200
|
+
) -> Any: # Return Any as actual type depends on availability
|
173
201
|
"""Creates a default SentenceTransformersDocumentEmbedder instance.
|
174
202
|
|
175
203
|
Args:
|
@@ -192,7 +220,9 @@ def create_default_document_embedder(
|
|
192
220
|
# Use the provided device parameter directly.
|
193
221
|
# If None, Haystack component will likely pick a default (e.g., 'cpu' or 'cuda' if available)
|
194
222
|
resolved_device = device
|
195
|
-
logger.debug(
|
223
|
+
logger.debug(
|
224
|
+
f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}"
|
225
|
+
)
|
196
226
|
|
197
227
|
try:
|
198
228
|
embedder = SentenceTransformersDocumentEmbedder(
|
@@ -204,10 +234,16 @@ def create_default_document_embedder(
|
|
204
234
|
# If embedding meta fields is needed, it should be passed as a parameter
|
205
235
|
)
|
206
236
|
embedder.warm_up()
|
207
|
-
logger.info(
|
237
|
+
logger.info(
|
238
|
+
f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}"
|
239
|
+
) # Use embedder.device after init
|
208
240
|
except Exception as e:
|
209
|
-
logger.error(
|
210
|
-
|
241
|
+
logger.error(
|
242
|
+
f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
|
243
|
+
)
|
244
|
+
raise RuntimeError(
|
245
|
+
f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
|
246
|
+
) from e
|
211
247
|
|
212
248
|
return embedder
|
213
249
|
|
@@ -221,19 +257,22 @@ def create_default_document_embedder(
|
|
221
257
|
# Central Search Logic
|
222
258
|
# ===========================
|
223
259
|
|
260
|
+
|
224
261
|
def _perform_haystack_search(
|
225
262
|
query: Union[str, Path, Image.Image],
|
226
|
-
document_store: Any,
|
227
|
-
collection_name: str,
|
228
|
-
embedder: SentenceTransformersTextEmbedder,
|
229
|
-
options: BaseSearchOptions
|
263
|
+
document_store: Any, # Use Any for simplicity now
|
264
|
+
collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
|
265
|
+
embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
|
266
|
+
options: BaseSearchOptions,
|
230
267
|
) -> List[Dict[str, Any]]:
|
231
268
|
"""Internal function to perform search using Haystack components (ChromaEmbeddingRetriever)."""
|
232
269
|
if not HAS_HAYSTACK_EXTRAS:
|
233
270
|
check_haystack_availability("_perform_haystack_search")
|
234
|
-
return []
|
271
|
+
return [] # Should not be reached due to check
|
235
272
|
|
236
|
-
logger.info(
|
273
|
+
logger.info(
|
274
|
+
f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})..."
|
275
|
+
)
|
237
276
|
logger.debug(f" Query type: {type(query).__name__}")
|
238
277
|
logger.debug(f" Options: {options}")
|
239
278
|
|
@@ -242,9 +281,11 @@ def _perform_haystack_search(
|
|
242
281
|
query_embedding: Optional[List[float]] = None
|
243
282
|
|
244
283
|
if isinstance(query, str):
|
245
|
-
text_query = query
|
284
|
+
text_query = query # Keep text for potential reranker use
|
246
285
|
if not embedder:
|
247
|
-
logger.error(
|
286
|
+
logger.error(
|
287
|
+
"Text query provided, but no embedder instance was passed to _perform_haystack_search."
|
288
|
+
)
|
248
289
|
return []
|
249
290
|
# No need to check type if the type hint is enforced upstream
|
250
291
|
# if not isinstance(embedder, SentenceTransformersTextEmbedder):
|
@@ -254,15 +295,21 @@ def _perform_haystack_search(
|
|
254
295
|
embedding_result = embedder.run(text=text_query)
|
255
296
|
query_embedding = embedding_result.get("embedding")
|
256
297
|
if not query_embedding:
|
257
|
-
|
258
|
-
|
259
|
-
|
298
|
+
logger.error(
|
299
|
+
f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'"
|
300
|
+
)
|
301
|
+
return []
|
302
|
+
logger.debug(
|
303
|
+
f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking."
|
304
|
+
)
|
260
305
|
except Exception as e:
|
261
306
|
logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
|
262
307
|
return []
|
263
308
|
elif isinstance(query, Path) or isinstance(query, Image.Image):
|
264
309
|
# Currently, this function doesn't support multi-modal query embedding directly
|
265
|
-
logger.error(
|
310
|
+
logger.error(
|
311
|
+
f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
|
312
|
+
)
|
266
313
|
return []
|
267
314
|
else:
|
268
315
|
# Handle other unexpected types
|
@@ -276,42 +323,52 @@ def _perform_haystack_search(
|
|
276
323
|
# --- 2. Set up Retriever --- #
|
277
324
|
# Assumes the document_store is ChromaDocumentStore for this utility function context
|
278
325
|
if not ChromaEmbeddingRetriever:
|
279
|
-
|
280
|
-
|
326
|
+
logger.error("ChromaEmbeddingRetriever not available.")
|
327
|
+
return []
|
281
328
|
|
282
329
|
# Ensure retriever_top_k is set (should be by __post_init__)
|
283
330
|
retriever_top_k = options.retriever_top_k
|
284
331
|
if retriever_top_k is None:
|
285
|
-
|
286
|
-
|
332
|
+
logger.warning(
|
333
|
+
"options.retriever_top_k was None, defaulting to options.top_k for retriever."
|
334
|
+
)
|
335
|
+
retriever_top_k = options.top_k
|
287
336
|
|
288
337
|
# Instantiate the EMBEDDING retriever
|
289
|
-
retriever = ChromaEmbeddingRetriever(
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
338
|
+
retriever = ChromaEmbeddingRetriever(
|
339
|
+
document_store=document_store,
|
340
|
+
filters=options.filters or {}, # Pass filters here
|
341
|
+
top_k=retriever_top_k,
|
342
|
+
)
|
294
343
|
|
344
|
+
logger.debug(
|
345
|
+
f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
|
346
|
+
)
|
295
347
|
|
296
348
|
# --- 3. Set up Optional Reranker --- #
|
297
349
|
reranker_instance = None
|
298
|
-
if options.use_reranker in [True, None]:
|
350
|
+
if options.use_reranker in [True, None]: # Check specifically for True or None
|
299
351
|
logger.debug("Attempting to initialize reranker...")
|
300
352
|
# Currently only supports default text reranker (Cohere)
|
301
|
-
reranker_instance = create_default_text_reranker(
|
302
|
-
|
353
|
+
reranker_instance = create_default_text_reranker(
|
354
|
+
api_key=options.reranker_api_key,
|
355
|
+
model_name=options.reranker_model or "rerank-english-v2.0",
|
356
|
+
)
|
303
357
|
if reranker_instance:
|
304
|
-
|
305
|
-
|
306
|
-
|
358
|
+
# Ensure reranker top_k matches final desired top_k
|
359
|
+
reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
|
360
|
+
logger.info(
|
361
|
+
f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
|
362
|
+
)
|
307
363
|
else:
|
308
|
-
logger.warning(
|
309
|
-
|
364
|
+
logger.warning(
|
365
|
+
"Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
|
366
|
+
)
|
310
367
|
|
311
368
|
# --- 4. Build and Run Pipeline --- #
|
312
369
|
if not Pipeline:
|
313
|
-
|
314
|
-
|
370
|
+
logger.error("Haystack Pipeline class not available.")
|
371
|
+
return []
|
315
372
|
|
316
373
|
search_pipeline = Pipeline()
|
317
374
|
search_pipeline.add_component("retriever", retriever)
|
@@ -325,15 +382,20 @@ def _perform_haystack_search(
|
|
325
382
|
search_pipeline.connect("retriever.documents", "reranker.documents")
|
326
383
|
# Reranker also needs the query text and final top_k
|
327
384
|
if text_query is None:
|
328
|
-
logger.error(
|
385
|
+
logger.error(
|
386
|
+
"Reranker requires text query, but it was not available (query might not have been text)."
|
387
|
+
)
|
329
388
|
# Handle this case - maybe skip reranker or raise error?
|
330
389
|
# For now, let's skip reranker if text is missing
|
331
390
|
logger.warning("Skipping reranker because text query is missing.")
|
332
|
-
reranker_instance = None
|
333
|
-
last_component_name = "retriever"
|
391
|
+
reranker_instance = None # Effectively remove it from the logic below
|
392
|
+
last_component_name = "retriever" # Reset last component
|
334
393
|
# Remove reranker component if added? Less clean. Let's just not add its input.
|
335
394
|
else:
|
336
|
-
pipeline_input["reranker"] = {
|
395
|
+
pipeline_input["reranker"] = {
|
396
|
+
"query": text_query,
|
397
|
+
"top_k": options.top_k,
|
398
|
+
} # Pass query and final top_k
|
337
399
|
last_component_name = "reranker"
|
338
400
|
logger.debug("Added reranker to pipeline and configured input.")
|
339
401
|
else:
|
@@ -341,7 +403,6 @@ def _perform_haystack_search(
|
|
341
403
|
last_component_name = "reranker"
|
342
404
|
logger.debug("Added reranker to pipeline.")
|
343
405
|
|
344
|
-
|
345
406
|
logger.info("Running Haystack search pipeline...")
|
346
407
|
try:
|
347
408
|
result = search_pipeline.run(pipeline_input)
|
@@ -356,31 +417,34 @@ def _perform_haystack_search(
|
|
356
417
|
# Check output based on last component in the pipeline
|
357
418
|
if last_component_name in result and result[last_component_name].get("documents"):
|
358
419
|
final_documents = result[last_component_name]["documents"]
|
359
|
-
logger.debug(
|
420
|
+
logger.debug(
|
421
|
+
f"Processed results from '{last_component_name}' ({len(final_documents)} documents)."
|
422
|
+
)
|
360
423
|
else:
|
361
|
-
logger.warning(
|
424
|
+
logger.warning(
|
425
|
+
f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}"
|
426
|
+
)
|
362
427
|
return []
|
363
428
|
|
364
429
|
# Convert Haystack Documents to the desired output format
|
365
430
|
output_results = []
|
366
|
-
for doc in final_documents:
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
431
|
+
for doc in final_documents: # Correctly loop over final_documents
|
432
|
+
# Check if doc is actually a Haystack Document object or potentially a dict
|
433
|
+
doc_id = getattr(doc, "id", None)
|
434
|
+
doc_score = getattr(doc, "score", 0.0)
|
435
|
+
doc_content = getattr(doc, "content", None)
|
436
|
+
doc_meta = getattr(doc, "meta", {})
|
437
|
+
|
438
|
+
meta = doc_meta or {}
|
439
|
+
output = {
|
440
|
+
"pdf_path": meta.get("pdf_path", "Unknown"),
|
441
|
+
"page_number": meta.get("page_number", -1),
|
442
|
+
"score": doc_score if doc_score is not None else 0.0, # Handle potential None score
|
443
|
+
"content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
|
444
|
+
"metadata": meta,
|
445
|
+
# "haystack_document": doc # Optionally include the full Haystack doc
|
446
|
+
}
|
447
|
+
output_results.append(output)
|
383
448
|
|
384
449
|
logger.info(f"Returning {len(output_results)} relevant results.")
|
385
450
|
return output_results
|
386
|
-
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
from dataclasses import dataclass, field
|
3
|
-
from typing import
|
3
|
+
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
4
4
|
|
5
5
|
# Use object placeholders for external types to avoid direct dependency
|
6
6
|
BaseRanker = object
|
@@ -8,10 +8,12 @@ BaseEmbedder = object
|
|
8
8
|
|
9
9
|
logger = logging.getLogger(__name__)
|
10
10
|
|
11
|
+
|
11
12
|
# --- Base Search Options ---
|
12
13
|
@dataclass
|
13
14
|
class BaseSearchOptions:
|
14
15
|
"""Base options for search operations."""
|
16
|
+
|
15
17
|
# How many results to return finally (after retrieval and optional reranking)
|
16
18
|
top_k: int = 10
|
17
19
|
# How many candidates the retriever should fetch initially (relevant if reranking)
|
@@ -22,12 +24,12 @@ class BaseSearchOptions:
|
|
22
24
|
|
23
25
|
# --- Reranking Configuration ---
|
24
26
|
# Option 1: Simple boolean/None
|
25
|
-
use_reranker: Optional[bool] = True
|
27
|
+
use_reranker: Optional[bool] = True # True=use default Cohere, False/None=disable
|
26
28
|
# Option 2: Provide a specific instance (takes precedence over use_reranker boolean)
|
27
29
|
reranker_instance: Optional[BaseRanker] = None
|
28
30
|
# Parameters for default Cohere reranker (if use_reranker=True)
|
29
|
-
reranker_model: Optional[str] = None
|
30
|
-
reranker_api_key: Optional[str] = None
|
31
|
+
reranker_model: Optional[str] = None # Defaults to "rerank-english-v2.0" in util
|
32
|
+
reranker_api_key: Optional[str] = None # Defaults to COHERE_API_KEY env var
|
31
33
|
|
32
34
|
# --- Embedder Configuration (Less common to override per-query, usually set at indexing) ---
|
33
35
|
# embedder_instance: Optional[BaseEmbedder] = None # Might be useful for advanced cases
|
@@ -35,38 +37,47 @@ class BaseSearchOptions:
|
|
35
37
|
def __post_init__(self):
|
36
38
|
# Validate that top_k values make sense
|
37
39
|
if self.retriever_top_k is None:
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
40
|
+
# If retriever_top_k isn't set, default it based on reranking needs
|
41
|
+
if self.use_reranker:
|
42
|
+
self.retriever_top_k = max(self.top_k * 2, 20) # Fetch more if reranking
|
43
|
+
else:
|
44
|
+
self.retriever_top_k = self.top_k
|
43
45
|
elif self.retriever_top_k < self.top_k:
|
44
|
-
logger.warning(
|
46
|
+
logger.warning(
|
47
|
+
f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results."
|
48
|
+
)
|
49
|
+
|
45
50
|
|
46
51
|
# --- Text Search Specific Options ---
|
47
52
|
@dataclass
|
48
53
|
class TextSearchOptions(BaseSearchOptions):
|
49
54
|
"""Options specific to text-based semantic search."""
|
55
|
+
|
50
56
|
# Add any text-specific overrides or parameters here if needed in the future
|
51
57
|
# e.g., specifying default text reranker model name if different defaults emerge
|
52
58
|
# default_text_reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
|
53
|
-
pass
|
59
|
+
pass # Currently inherits all base options
|
60
|
+
|
54
61
|
|
55
62
|
# --- MultiModal Search Specific Options ---
|
56
63
|
@dataclass
|
57
64
|
class MultiModalSearchOptions(BaseSearchOptions):
|
58
65
|
"""Options specific to multimodal semantic search."""
|
66
|
+
|
59
67
|
# Flag to potentially use a default multimodal reranker if available
|
60
68
|
# (overrides base use_reranker=True if reranker_instance is None)
|
61
|
-
use_multimodal_reranker: bool =
|
69
|
+
use_multimodal_reranker: bool = (
|
70
|
+
True # Attempt multimodal rerank if use_reranker=True/None and no instance given
|
71
|
+
)
|
62
72
|
# e.g., specifying default multimodal embedder/reranker models
|
63
73
|
# default_multimodal_embedder_model: str = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
|
64
74
|
# default_multimodal_reranker_model: str = "jinaai/jina-reranker-m0" # Example
|
65
75
|
|
76
|
+
|
66
77
|
# --- Union Type ---
|
67
78
|
# Defines the types allowed for search configuration.
|
68
79
|
SearchOptions = Union[
|
69
80
|
TextSearchOptions,
|
70
81
|
MultiModalSearchOptions,
|
71
|
-
BaseSearchOptions
|
72
|
-
]
|
82
|
+
BaseSearchOptions, # Include base for typing flexibility
|
83
|
+
]
|
@@ -1,15 +1,18 @@
|
|
1
1
|
"""Defines the protocol for a search service."""
|
2
|
-
|
2
|
+
|
3
3
|
from pathlib import Path
|
4
|
+
from typing import Any, Dict, Iterable, List, Optional, Protocol, Union
|
5
|
+
|
4
6
|
from PIL import Image
|
5
7
|
|
8
|
+
# Forward declare SearchOptions to avoid circular import if needed,
|
9
|
+
# or import if structure allows (assuming it's safe here)
|
10
|
+
from natural_pdf.search.search_options import BaseSearchOptions, SearchOptions
|
11
|
+
|
6
12
|
# Use typing_extensions for Python < 3.8 compatibility if needed,
|
7
13
|
# otherwise, typing.Protocol is fine for >= 3.8
|
8
14
|
# from typing_extensions import Protocol
|
9
15
|
|
10
|
-
# Forward declare SearchOptions to avoid circular import if needed,
|
11
|
-
# or import if structure allows (assuming it's safe here)
|
12
|
-
from natural_pdf.search.search_options import SearchOptions, BaseSearchOptions
|
13
16
|
|
14
17
|
# Use Dict as placeholder for external Haystack Document type
|
15
18
|
HaystackDocument = Dict[str, Any]
|
@@ -17,12 +20,14 @@ HaystackDocument = Dict[str, Any]
|
|
17
20
|
|
18
21
|
class IndexConfigurationError(RuntimeError):
|
19
22
|
"""Custom exception for configuration mismatches during indexing."""
|
23
|
+
|
20
24
|
pass
|
21
25
|
|
22
26
|
|
23
27
|
# Add new exception for sync/init safety
|
24
28
|
class IndexExistsError(RuntimeError):
|
25
29
|
"""Raised when attempting to index implicitly to an existing persistent index without force_reindex=True."""
|
30
|
+
|
26
31
|
pass
|
27
32
|
|
28
33
|
|
@@ -66,6 +71,7 @@ class SearchServiceProtocol(Protocol):
|
|
66
71
|
with a chosen search backend (e.g., Haystack with ChromaDB, Haystack In-Memory).
|
67
72
|
An instance of a service implementing this protocol is tied to a specific collection name.
|
68
73
|
"""
|
74
|
+
|
69
75
|
collection_name: str
|
70
76
|
# Removed internal state hints (_persist, _embedding_model) - implementation detail
|
71
77
|
|
@@ -98,7 +104,7 @@ class SearchServiceProtocol(Protocol):
|
|
98
104
|
|
99
105
|
def search(
|
100
106
|
self,
|
101
|
-
query: Any,
|
107
|
+
query: Any, # Allow any query type, service implementation handles it
|
102
108
|
options: BaseSearchOptions,
|
103
109
|
) -> List[Dict[str, Any]]:
|
104
110
|
"""
|
@@ -186,4 +192,4 @@ class SearchServiceProtocol(Protocol):
|
|
186
192
|
...
|
187
193
|
|
188
194
|
# Optional: Add methods for getting index stats, etc.
|
189
|
-
# def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
|
195
|
+
# def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
|