natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -17,10 +17,16 @@ from natural_pdf.search.search_options import (
|
|
17
17
|
# Set up logger for this module
|
18
18
|
logger = logging.getLogger(__name__)
|
19
19
|
|
20
|
+
# Import sentence-transformers for dimension calculation
|
21
|
+
try:
|
22
|
+
from sentence_transformers import SentenceTransformer
|
23
|
+
except ImportError:
|
24
|
+
SentenceTransformer = None
|
25
|
+
|
20
26
|
# --- Define flag BEFORE trying Haystack imports ---
|
21
27
|
HAS_HAYSTACK_EXTRAS = False # Default to False
|
22
28
|
|
23
|
-
#
|
29
|
+
# Conditional Haystack Imports
|
24
30
|
try:
|
25
31
|
import haystack
|
26
32
|
from haystack import Document as HaystackDocument
|
@@ -30,8 +36,18 @@ try:
|
|
30
36
|
SentenceTransformersTextEmbedder,
|
31
37
|
)
|
32
38
|
from haystack.document_stores.types import DocumentStore, DuplicatePolicy
|
33
|
-
|
34
|
-
|
39
|
+
|
40
|
+
# --- REMOVED Chroma Imports ---
|
41
|
+
# from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
|
42
|
+
# from haystack_integrations.document_stores.chroma import ChromaDocumentStore
|
43
|
+
# --- ADDED LanceDB Imports ---
|
44
|
+
try:
|
45
|
+
from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
|
46
|
+
except ImportError:
|
47
|
+
LanceDBDocumentStore = None
|
48
|
+
LanceDBEmbeddingRetriever = None
|
49
|
+
|
50
|
+
# Removed Chroma Imports
|
35
51
|
|
36
52
|
# Keep try/except for optional Cohere
|
37
53
|
try:
|
@@ -39,15 +55,7 @@ try:
|
|
39
55
|
except ImportError:
|
40
56
|
CohereRanker = None
|
41
57
|
|
42
|
-
#
|
43
|
-
try:
|
44
|
-
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
45
|
-
except ImportError:
|
46
|
-
logger.warning(
|
47
|
-
"chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
|
48
|
-
)
|
49
|
-
SentenceTransformerEmbeddingFunction = None
|
50
|
-
# --- End ChromaDB import ---
|
58
|
+
# Removed ChromaDB embedding function import
|
51
59
|
|
52
60
|
HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
|
53
61
|
logger.debug("Successfully imported Haystack components.")
|
@@ -67,11 +75,15 @@ except ImportError as e:
|
|
67
75
|
HaystackDocument = Dict # Represent as Dict if not available
|
68
76
|
Pipeline = None
|
69
77
|
SentenceTransformersTextEmbedder = None
|
70
|
-
|
78
|
+
# --- UPDATED Dummies ---
|
79
|
+
LanceDBEmbeddingRetriever = (
|
80
|
+
None # ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
|
81
|
+
)
|
71
82
|
CohereRanker = None
|
72
|
-
ChromaDocumentStore = None
|
83
|
+
LanceDBDocumentStore = None # ChromaDocumentStore = None
|
73
84
|
DuplicatePolicy = None # Dummy for DuplicatePolicy
|
74
|
-
|
85
|
+
# --- REMOVED Dummies ---
|
86
|
+
# SentenceTransformerEmbeddingFunction = None # Dummy if kept
|
75
87
|
|
76
88
|
|
77
89
|
# Helper function to check availability and raise error
|
@@ -90,34 +102,60 @@ def check_haystack_availability(feature_name: str = "Search"):
|
|
90
102
|
|
91
103
|
|
92
104
|
def create_default_document_store(
|
93
|
-
persist_path
|
94
|
-
|
95
|
-
|
105
|
+
# --- CHANGED persist_path to uri ---
|
106
|
+
uri: str = "./natural_pdf_index",
|
107
|
+
collection_name: str = "natural_pdf_default", # LanceDB calls this table_name
|
108
|
+
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", # Make mandatory for dim calculation
|
96
109
|
) -> DocumentStore:
|
97
|
-
"""Creates a default
|
98
|
-
check_haystack_availability("create_default_document_store")
|
110
|
+
"""Creates a default LanceDB DocumentStore."""
|
111
|
+
check_haystack_availability("create_default_document_store (LanceDB)")
|
99
112
|
logger.debug(
|
100
|
-
f"Creating default
|
113
|
+
f"Creating default LanceDBDocumentStore at uri='{uri}' with table '{collection_name}'"
|
101
114
|
)
|
102
|
-
if not
|
103
|
-
raise RuntimeError("
|
115
|
+
if not LanceDBDocumentStore:
|
116
|
+
raise RuntimeError("LanceDBDocumentStore is not available despite Haystack extras check.")
|
117
|
+
if not SentenceTransformer:
|
118
|
+
raise ImportError(
|
119
|
+
"sentence-transformers library is required to determine embedding dimensions."
|
120
|
+
)
|
104
121
|
|
105
122
|
try:
|
106
|
-
#
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
123
|
+
# Calculate embedding dimension
|
124
|
+
try:
|
125
|
+
model = SentenceTransformer(embedding_model)
|
126
|
+
embedding_dims = model.get_sentence_embedding_dimension()
|
127
|
+
if not embedding_dims:
|
128
|
+
raise ValueError(
|
129
|
+
f"Could not determine embedding dimension for model: {embedding_model}"
|
130
|
+
)
|
131
|
+
logger.debug(
|
132
|
+
f"Determined embedding dimension: {embedding_dims} for model '{embedding_model}'"
|
133
|
+
)
|
134
|
+
except Exception as e:
|
135
|
+
logger.error(
|
136
|
+
f"Failed to load SentenceTransformer model '{embedding_model}' to get dimensions: {e}",
|
137
|
+
exc_info=True,
|
138
|
+
)
|
139
|
+
raise RuntimeError(
|
140
|
+
f"Failed to determine embedding dimension for model '{embedding_model}'."
|
141
|
+
) from e
|
142
|
+
|
143
|
+
# Create LanceDBDocumentStore
|
144
|
+
store = LanceDBDocumentStore(
|
145
|
+
database=uri, # Use uri for the database path
|
146
|
+
table_name=collection_name,
|
147
|
+
embedding_dims=embedding_dims,
|
148
|
+
# LanceDB might require a metadata schema, but let's try without it first for simplicity.
|
149
|
+
# Add `metadata_schema=...` if needed based on lancedb-haystack requirements.
|
150
|
+
)
|
151
|
+
logger.info(
|
152
|
+
f"Initialized LanceDBDocumentStore (Table: {collection_name}, Dims: {embedding_dims}) at uri '{uri}'"
|
114
153
|
)
|
115
|
-
logger.info(f"Initialized ChromaDocumentStore (Collection: {collection_name})")
|
116
154
|
return store
|
117
155
|
except Exception as e:
|
118
|
-
logger.error(f"Failed to initialize
|
156
|
+
logger.error(f"Failed to initialize LanceDBDocumentStore: {e}", exc_info=True)
|
119
157
|
raise RuntimeError(
|
120
|
-
f"Could not create
|
158
|
+
f"Could not create LanceDBDocumentStore for table '{collection_name}' at uri '{uri}'"
|
121
159
|
) from e
|
122
160
|
|
123
161
|
|
@@ -261,35 +299,32 @@ def create_default_document_embedder(
|
|
261
299
|
def _perform_haystack_search(
|
262
300
|
query: Union[str, Path, Image.Image],
|
263
301
|
document_store: Any, # Use Any for simplicity now
|
264
|
-
collection_name: str, # Passed for clarity,
|
302
|
+
collection_name: str, # Passed for clarity, corresponds to table_name in LanceDB
|
265
303
|
embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
|
266
304
|
options: BaseSearchOptions,
|
267
305
|
) -> List[Dict[str, Any]]:
|
268
|
-
"""Internal function to perform search using Haystack components (
|
306
|
+
"""Internal function to perform search using Haystack components (LanceDBEmbeddingRetriever)."""
|
269
307
|
if not HAS_HAYSTACK_EXTRAS:
|
270
|
-
check_haystack_availability("_perform_haystack_search")
|
308
|
+
check_haystack_availability("_perform_haystack_search (LanceDB)")
|
271
309
|
return [] # Should not be reached due to check
|
272
310
|
|
273
311
|
logger.info(
|
274
|
-
f"Performing Haystack search in
|
312
|
+
f"Performing Haystack search in table '{collection_name}' (using store: {type(document_store).__name__})..."
|
275
313
|
)
|
276
314
|
logger.debug(f" Query type: {type(query).__name__}")
|
277
315
|
logger.debug(f" Options: {options}")
|
278
316
|
|
279
|
-
#
|
317
|
+
# Embed Query
|
280
318
|
text_query: Optional[str] = None
|
281
319
|
query_embedding: Optional[List[float]] = None
|
282
320
|
|
283
321
|
if isinstance(query, str):
|
284
|
-
text_query = query
|
322
|
+
text_query = query
|
285
323
|
if not embedder:
|
286
324
|
logger.error(
|
287
325
|
"Text query provided, but no embedder instance was passed to _perform_haystack_search."
|
288
326
|
)
|
289
327
|
return []
|
290
|
-
# No need to check type if the type hint is enforced upstream
|
291
|
-
# if not isinstance(embedder, SentenceTransformersTextEmbedder):
|
292
|
-
# logger.warning(f"Provided embedder is {type(embedder).__name__}, not SentenceTransformersTextEmbedder. Assuming it works like one for query embedding.")
|
293
328
|
try:
|
294
329
|
logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
|
295
330
|
embedding_result = embedder.run(text=text_query)
|
@@ -306,24 +341,21 @@ def _perform_haystack_search(
|
|
306
341
|
logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
|
307
342
|
return []
|
308
343
|
elif isinstance(query, Path) or isinstance(query, Image.Image):
|
309
|
-
# Currently, this function doesn't support multi-modal query embedding directly
|
310
344
|
logger.error(
|
311
345
|
f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
|
312
346
|
)
|
313
347
|
return []
|
314
348
|
else:
|
315
|
-
# Handle other unexpected types
|
316
349
|
logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")
|
350
|
+
return []
|
317
351
|
|
318
|
-
# If we didn't get an embedding (e.g., non-text query), we can't proceed
|
319
352
|
if query_embedding is None:
|
320
353
|
logger.error("Could not obtain query embedding. Cannot perform search.")
|
321
354
|
return []
|
322
355
|
|
323
|
-
#
|
324
|
-
|
325
|
-
|
326
|
-
logger.error("ChromaEmbeddingRetriever not available.")
|
356
|
+
# Set up Retriever
|
357
|
+
if not LanceDBEmbeddingRetriever:
|
358
|
+
logger.error("LanceDBEmbeddingRetriever not available.")
|
327
359
|
return []
|
328
360
|
|
329
361
|
# Ensure retriever_top_k is set (should be by __post_init__)
|
@@ -335,28 +367,26 @@ def _perform_haystack_search(
|
|
335
367
|
retriever_top_k = options.top_k
|
336
368
|
|
337
369
|
# Instantiate the EMBEDDING retriever
|
338
|
-
retriever =
|
370
|
+
retriever = LanceDBEmbeddingRetriever(
|
339
371
|
document_store=document_store,
|
340
372
|
filters=options.filters or {}, # Pass filters here
|
341
373
|
top_k=retriever_top_k,
|
342
374
|
)
|
343
375
|
|
344
376
|
logger.debug(
|
345
|
-
f"Initialized
|
377
|
+
f"Initialized LanceDBEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
|
346
378
|
)
|
347
379
|
|
348
|
-
#
|
380
|
+
# Set up Optional Reranker
|
349
381
|
reranker_instance = None
|
350
|
-
if options.use_reranker in [True, None]:
|
382
|
+
if options.use_reranker in [True, None]:
|
351
383
|
logger.debug("Attempting to initialize reranker...")
|
352
|
-
# Currently only supports default text reranker (Cohere)
|
353
384
|
reranker_instance = create_default_text_reranker(
|
354
385
|
api_key=options.reranker_api_key,
|
355
386
|
model_name=options.reranker_model or "rerank-english-v2.0",
|
356
387
|
)
|
357
388
|
if reranker_instance:
|
358
|
-
|
359
|
-
reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
|
389
|
+
reranker_instance.top_k = options.top_k
|
360
390
|
logger.info(
|
361
391
|
f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
|
362
392
|
)
|
@@ -365,7 +395,7 @@ def _perform_haystack_search(
|
|
365
395
|
"Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
|
366
396
|
)
|
367
397
|
|
368
|
-
#
|
398
|
+
# Build and Run Pipeline
|
369
399
|
if not Pipeline:
|
370
400
|
logger.error("Haystack Pipeline class not available.")
|
371
401
|
return []
|
@@ -380,28 +410,25 @@ def _perform_haystack_search(
|
|
380
410
|
if reranker_instance:
|
381
411
|
search_pipeline.add_component("reranker", reranker_instance)
|
382
412
|
search_pipeline.connect("retriever.documents", "reranker.documents")
|
383
|
-
# Reranker also needs the query text and final top_k
|
384
413
|
if text_query is None:
|
385
414
|
logger.error(
|
386
415
|
"Reranker requires text query, but it was not available (query might not have been text)."
|
387
416
|
)
|
388
|
-
# Handle this case - maybe skip reranker or raise error?
|
389
|
-
# For now, let's skip reranker if text is missing
|
390
417
|
logger.warning("Skipping reranker because text query is missing.")
|
391
|
-
reranker_instance = None
|
392
|
-
last_component_name = "retriever"
|
393
|
-
# Remove reranker component if added? Less clean. Let's just not add its input.
|
418
|
+
reranker_instance = None
|
419
|
+
last_component_name = "retriever"
|
394
420
|
else:
|
395
421
|
pipeline_input["reranker"] = {
|
396
422
|
"query": text_query,
|
397
423
|
"top_k": options.top_k,
|
398
|
-
}
|
424
|
+
}
|
399
425
|
last_component_name = "reranker"
|
400
426
|
logger.debug("Added reranker to pipeline and configured input.")
|
401
427
|
else:
|
402
|
-
#
|
403
|
-
last_component_name
|
404
|
-
|
428
|
+
# --- Fix: last_component_name should only be 'reranker' if it was added ---
|
429
|
+
# if reranker_instance was initialized and added, last_component_name is 'reranker'
|
430
|
+
# if not, it remains 'retriever'
|
431
|
+
pass # No change needed here if reranker wasn't added
|
405
432
|
|
406
433
|
logger.info("Running Haystack search pipeline...")
|
407
434
|
try:
|
@@ -412,9 +439,8 @@ def _perform_haystack_search(
|
|
412
439
|
logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
|
413
440
|
return []
|
414
441
|
|
415
|
-
#
|
442
|
+
# Process Results
|
416
443
|
final_documents: List[HaystackDocument] = []
|
417
|
-
# Check output based on last component in the pipeline
|
418
444
|
if last_component_name in result and result[last_component_name].get("documents"):
|
419
445
|
final_documents = result[last_component_name]["documents"]
|
420
446
|
logger.debug(
|
@@ -428,8 +454,7 @@ def _perform_haystack_search(
|
|
428
454
|
|
429
455
|
# Convert Haystack Documents to the desired output format
|
430
456
|
output_results = []
|
431
|
-
for doc in final_documents:
|
432
|
-
# Check if doc is actually a Haystack Document object or potentially a dict
|
457
|
+
for doc in final_documents:
|
433
458
|
doc_id = getattr(doc, "id", None)
|
434
459
|
doc_score = getattr(doc, "score", 0.0)
|
435
460
|
doc_content = getattr(doc, "content", None)
|
@@ -439,10 +464,9 @@ def _perform_haystack_search(
|
|
439
464
|
output = {
|
440
465
|
"pdf_path": meta.get("pdf_path", "Unknown"),
|
441
466
|
"page_number": meta.get("page_number", -1),
|
442
|
-
"score": doc_score if doc_score is not None else 0.0,
|
443
|
-
"content_snippet": doc_content[:200] + "..." if doc_content else "",
|
467
|
+
"score": doc_score if doc_score is not None else 0.0,
|
468
|
+
"content_snippet": doc_content[:200] + "..." if doc_content else "",
|
444
469
|
"metadata": meta,
|
445
|
-
# "haystack_document": doc # Optionally include the full Haystack doc
|
446
470
|
}
|
447
471
|
output_results.append(output)
|
448
472
|
|
@@ -68,12 +68,11 @@ class SearchServiceProtocol(Protocol):
|
|
68
68
|
Protocol defining the interface for indexing and searching documents.
|
69
69
|
|
70
70
|
Implementations of this protocol handle the specifics of interacting
|
71
|
-
with a chosen search backend (e.g., Haystack with
|
72
|
-
An instance of a service implementing this protocol is tied to a specific
|
71
|
+
with a chosen search backend (e.g., Haystack with LanceDB, Haystack In-Memory).
|
72
|
+
An instance of a service implementing this protocol is tied to a specific index name (e.g., table name).
|
73
73
|
"""
|
74
74
|
|
75
75
|
collection_name: str
|
76
|
-
# Removed internal state hints (_persist, _embedding_model) - implementation detail
|
77
76
|
|
78
77
|
def index(
|
79
78
|
self,
|
@@ -82,7 +81,7 @@ class SearchServiceProtocol(Protocol):
|
|
82
81
|
force_reindex: bool = False,
|
83
82
|
) -> None:
|
84
83
|
"""
|
85
|
-
Indexes the provided documents into the
|
84
|
+
Indexes the provided documents into the index/table managed by this service instance.
|
86
85
|
|
87
86
|
Handles store and embedder creation/retrieval, configuration checks,
|
88
87
|
re-indexing logic (including potential deletion), embedding, and writing.
|
@@ -91,12 +90,12 @@ class SearchServiceProtocol(Protocol):
|
|
91
90
|
documents: An iterable of objects conforming to the Indexable protocol.
|
92
91
|
embedder_device: The device ('cpu', 'cuda', etc.) for the embedder.
|
93
92
|
Defaults defined by the implementation.
|
94
|
-
force_reindex: If True, delete the entire existing
|
93
|
+
force_reindex: If True, delete the entire existing index/table
|
95
94
|
(if configuration permits) before indexing.
|
96
95
|
|
97
96
|
Raises:
|
98
97
|
IndexConfigurationError: If `force_reindex` is False and the existing
|
99
|
-
|
98
|
+
index/table has incompatible settings.
|
100
99
|
ImportError: If required backend libraries are missing.
|
101
100
|
RuntimeError: For other failures during indexing.
|
102
101
|
"""
|
@@ -104,11 +103,11 @@ class SearchServiceProtocol(Protocol):
|
|
104
103
|
|
105
104
|
def search(
|
106
105
|
self,
|
107
|
-
query: Any,
|
106
|
+
query: Any,
|
108
107
|
options: BaseSearchOptions,
|
109
108
|
) -> List[Dict[str, Any]]:
|
110
109
|
"""
|
111
|
-
Performs a search within the
|
110
|
+
Performs a search within the index/table managed by this service instance.
|
112
111
|
|
113
112
|
Args:
|
114
113
|
query: The search query (type depends on service capabilities).
|
@@ -119,7 +118,7 @@ class SearchServiceProtocol(Protocol):
|
|
119
118
|
metadata, and relevance scores.
|
120
119
|
|
121
120
|
Raises:
|
122
|
-
FileNotFoundError: If the
|
121
|
+
FileNotFoundError: If the index/table managed by this service does not exist or path is invalid.
|
123
122
|
RuntimeError: For other failures during search.
|
124
123
|
TypeError: If the query type is incompatible with the backend/options.
|
125
124
|
"""
|
@@ -129,10 +128,10 @@ class SearchServiceProtocol(Protocol):
|
|
129
128
|
self,
|
130
129
|
) -> bool:
|
131
130
|
"""
|
132
|
-
Deletes the entire index/
|
131
|
+
Deletes the entire index/table managed by this service instance.
|
133
132
|
|
134
133
|
Returns:
|
135
|
-
True if deletion was successful or
|
134
|
+
True if deletion was successful or index/table didn't exist,
|
136
135
|
False if deletion failed.
|
137
136
|
|
138
137
|
Raises:
|
@@ -145,7 +144,7 @@ class SearchServiceProtocol(Protocol):
|
|
145
144
|
self,
|
146
145
|
) -> bool:
|
147
146
|
"""
|
148
|
-
Checks if the index/
|
147
|
+
Checks if the index/table managed by this service instance exists.
|
149
148
|
|
150
149
|
Returns:
|
151
150
|
True if the index exists, False otherwise.
|