natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff compares the published contents of two public releases of the natural-pdf package, shown for informational purposes only and exactly as each version appears in its public registry. Only a subset of the changed files is reproduced as source hunks below; a sketch for reproducing the full comparison locally follows the file list.
Files changed (132)
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
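
The hunks reproduced after this list cover three modules under natural_pdf/search/. For the remaining changed files, a rough local equivalent of this comparison is sketched below. This is illustrative only, not how the registry view is generated, and the wheel-diff directory name is an arbitrary choice:

```python
# Illustrative sketch only: fetch both wheels with pip, unpack them, and compare.
# Assumes pip is on PATH; "wheel-diff" is just an example working directory.
import pathlib
import subprocess
import zipfile
from filecmp import dircmp

work = pathlib.Path("wheel-diff")
for version in ("0.1.3", "0.1.5"):
    dest = work / version
    dest.mkdir(parents=True, exist_ok=True)
    # Download just the wheel for this version, without resolving dependencies.
    subprocess.run(
        ["pip", "download", f"natural-pdf=={version}", "--no-deps", "-d", str(dest)],
        check=True,
    )
    with zipfile.ZipFile(next(dest.glob("*.whl"))) as wheel:
        wheel.extractall(dest / "unpacked")

# Report added, removed, and differing files between the two unpacked trees.
dircmp(str(work / "0.1.3" / "unpacked"), str(work / "0.1.5" / "unpacked")).report_full_closure()
```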

natural_pdf/search/haystack_utils.py

@@ -1,30 +1,38 @@
  # natural_pdf/search/haystack_utils.py
  import logging
  import os
- from typing import Optional, Dict, Any, List, Union, Tuple, Type
- from pathlib import Path
- from PIL import Image # Ensure Image is imported unconditionally
  import warnings
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+ from PIL import Image # Ensure Image is imported unconditionally

- from natural_pdf.search.search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
+ from natural_pdf.search.search_options import (
+ BaseSearchOptions,
+ MultiModalSearchOptions,
+ SearchOptions,
+ TextSearchOptions,
+ )

  # Set up logger for this module
  logger = logging.getLogger(__name__)

  # --- Define flag BEFORE trying Haystack imports ---
- HAS_HAYSTACK_EXTRAS = False # Default to False
+ HAS_HAYSTACK_EXTRAS = False # Default to False

  # --- Conditional Haystack Imports (Restoring Error Catching with Traceback Logging) ---
  try:
  import haystack
- from haystack import Document as HaystackDocument, Pipeline
- from haystack_integrations.document_stores.chroma import ChromaDocumentStore
- from haystack.document_stores.types import DuplicatePolicy, DocumentStore
+ from haystack import Document as HaystackDocument
+ from haystack import Pipeline
  from haystack.components.embedders import (
+ SentenceTransformersDocumentEmbedder,
  SentenceTransformersTextEmbedder,
- SentenceTransformersDocumentEmbedder
  )
+ from haystack.document_stores.types import DocumentStore, DuplicatePolicy
  from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+
  # Keep try/except for optional Cohere
  try:
  from haystack.components.rankers import CohereRanker
@@ -33,33 +41,37 @@ try:

  # --- Add ChromaDB embedding function import ---
  try:
- from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
  except ImportError:
- logger.warning("chromadb library not found. Custom embedding models for ChromaDocumentStore may not work.")
- SentenceTransformerEmbeddingFunction = None
+ logger.warning(
+ "chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
+ )
+ SentenceTransformerEmbeddingFunction = None
  # --- End ChromaDB import ---

- HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
+ HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
  logger.debug("Successfully imported Haystack components.")

  except ImportError as e:
  # HAS_HAYSTACK_EXTRAS remains False
  # Log the full error and traceback for debugging
- logger.error(f"Failed to import Haystack components. Search functionality disabled. Error: {e}", exc_info=True)
+ logger.warning(
+ f"Failed to import Haystack components. Semantic search functionality disabled.",
+ )

  # Define dummy types/classes for type hinting and basic checks when extras aren't installed
  BaseDocumentStore = object
- DocumentStore = object # Dummy for protocol
- BaseEmbedder = object # Define dummy BaseEmbedder
+ DocumentStore = object # Dummy for protocol
+ BaseEmbedder = object # Define dummy BaseEmbedder
  BaseTextEmbedder = object
- HaystackDocument = Dict # Represent as Dict if not available
+ HaystackDocument = Dict # Represent as Dict if not available
  Pipeline = None
  SentenceTransformersTextEmbedder = None
- ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+ ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
  CohereRanker = None
  ChromaDocumentStore = None
- DuplicatePolicy = None # Dummy for DuplicatePolicy
- SentenceTransformerEmbeddingFunction = None # Dummy if kept
+ DuplicatePolicy = None # Dummy for DuplicatePolicy
+ SentenceTransformerEmbeddingFunction = None # Dummy if kept


  # Helper function to check availability and raise error
@@ -76,16 +88,19 @@ def check_haystack_availability(feature_name: str = "Search"):
  # Default Component Creators
  # ===========================

+
  def create_default_document_store(
  persist_path: str = "./natural_pdf_index",
  collection_name: str = "natural_pdf_default",
- embedding_model: Optional[str] = None # Allow specifying the model
+ embedding_model: Optional[str] = None, # Allow specifying the model
  ) -> DocumentStore:
  """Creates a default ChromaDB DocumentStore."""
  check_haystack_availability("create_default_document_store")
- logger.debug(f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'")
- if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
- raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
+ logger.debug(
+ f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'"
+ )
+ if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
+ raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")

  try:
  # Note: For Haystack's Chroma integration, the embedding model is typically handled
@@ -101,11 +116,14 @@ def create_default_document_store(
  return store
  except Exception as e:
  logger.error(f"Failed to initialize ChromaDocumentStore: {e}", exc_info=True)
- raise RuntimeError(f"Could not create ChromaDocumentStore for collection '{collection_name}'") from e
+ raise RuntimeError(
+ f"Could not create ChromaDocumentStore for collection '{collection_name}'"
+ ) from e
+

  def create_default_text_embedder(
  model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
- device: Optional[str] = None # Add device parameter
+ device: Optional[str] = None, # Add device parameter
  ) -> SentenceTransformersTextEmbedder:
  """Creates a default SentenceTransformer text embedder."""
  check_haystack_availability("create_default_text_embedder")
@@ -115,11 +133,16 @@ def create_default_text_embedder(
  try:
  # Use Haystack component which handles device logic
  embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
- logger.info(f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})")
+ logger.info(
+ f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})"
+ )
  return embedder
  except Exception as e:
  logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
- raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
+ raise RuntimeError(
+ f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
+ ) from e
+

  def create_default_multimodal_embedder(*args, **kwargs) -> Any:
  """Stub for creating a default multimodal embedder (Not Implemented)."""
@@ -129,47 +152,52 @@ def create_default_multimodal_embedder(*args, **kwargs) -> Any:
  " See: https://docs.haystack.deepset.ai/docs/custom-components"
  )

+
  def create_default_text_reranker(
- api_key: Optional[str] = None,
- model_name: str = "rerank-english-v2.0" # Default Cohere model
- ) -> Optional[Any]: # Returns CohereRanker instance or None
- """
- Creates a default Cohere Reranker if available and API key provided.
-
- Requires COHERE_API_KEY environment variable or api_key argument.
- Requires haystack-cohere integration: pip install haystack-cohere
- """
- check_haystack_availability("create_default_text_reranker (optional)")
-
- if not CohereRanker:
- logger.debug("CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation.")
- return None
-
- # Check for API key (prefer argument over environment variable)
- cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
- if not cohere_api_key:
- logger.warning("COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker.")
- return None
-
- logger.debug(f"Creating CohereRanker with model '{model_name}'")
- try:
- # Pass API key via authenticator for better practice if supported, or directly
- # As of haystack 2.0b5, CohereRanker takes api_key directly
- reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
- logger.info(f"Initialized CohereRanker (Model: {model_name})")
- return reranker
- except Exception as e:
- logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
- # Don't raise, just return None as reranker is optional
- return None
+ api_key: Optional[str] = None, model_name: str = "rerank-english-v2.0" # Default Cohere model
+ ) -> Optional[Any]: # Returns CohereRanker instance or None
+ """
+ Creates a default Cohere Reranker if available and API key provided.
+
+ Requires COHERE_API_KEY environment variable or api_key argument.
+ Requires haystack-cohere integration: pip install haystack-cohere
+ """
+ check_haystack_availability("create_default_text_reranker (optional)")
+
+ if not CohereRanker:
+ logger.debug(
+ "CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation."
+ )
+ return None
+
+ # Check for API key (prefer argument over environment variable)
+ cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
+ if not cohere_api_key:
+ logger.warning(
+ "COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker."
+ )
+ return None
+
+ logger.debug(f"Creating CohereRanker with model '{model_name}'")
+ try:
+ # Pass API key via authenticator for better practice if supported, or directly
+ # As of haystack 2.0b5, CohereRanker takes api_key directly
+ reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
+ logger.info(f"Initialized CohereRanker (Model: {model_name})")
+ return reranker
+ except Exception as e:
+ logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
+ # Don't raise, just return None as reranker is optional
+ return None
+

  # --- Default Document Embedder Creator ---
  def create_default_document_embedder(
  model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
  device: Optional[str] = None,
  progress_bar: bool = True,
- normalize_embeddings: bool = False # Changed default based on ST documentation
- ) -> Any: # Return Any as actual type depends on availability
+ normalize_embeddings: bool = False, # Changed default based on ST documentation
+ ) -> Any: # Return Any as actual type depends on availability
  """Creates a default SentenceTransformersDocumentEmbedder instance.

  Args:
@@ -192,7 +220,9 @@ def create_default_document_embedder(
  # Use the provided device parameter directly.
  # If None, Haystack component will likely pick a default (e.g., 'cpu' or 'cuda' if available)
  resolved_device = device
- logger.debug(f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}")
+ logger.debug(
+ f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}"
+ )

  try:
  embedder = SentenceTransformersDocumentEmbedder(
@@ -204,10 +234,16 @@ def create_default_document_embedder(
  # If embedding meta fields is needed, it should be passed as a parameter
  )
  embedder.warm_up()
- logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}") # Use embedder.device after init
+ logger.info(
+ f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}"
+ ) # Use embedder.device after init
  except Exception as e:
- logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
- raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
+ logger.error(
+ f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
+ )
+ raise RuntimeError(
+ f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
+ ) from e

  return embedder

@@ -221,19 +257,22 @@ def create_default_document_embedder(
  # Central Search Logic
  # ===========================

+
  def _perform_haystack_search(
  query: Union[str, Path, Image.Image],
- document_store: Any, # Use Any for simplicity now
- collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
- embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
- options: BaseSearchOptions
+ document_store: Any, # Use Any for simplicity now
+ collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
+ embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
+ options: BaseSearchOptions,
  ) -> List[Dict[str, Any]]:
  """Internal function to perform search using Haystack components (ChromaEmbeddingRetriever)."""
  if not HAS_HAYSTACK_EXTRAS:
  check_haystack_availability("_perform_haystack_search")
- return [] # Should not be reached due to check
+ return [] # Should not be reached due to check

- logger.info(f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})...")
+ logger.info(
+ f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})..."
+ )
  logger.debug(f" Query type: {type(query).__name__}")
  logger.debug(f" Options: {options}")

@@ -242,9 +281,11 @@ def _perform_haystack_search(
  query_embedding: Optional[List[float]] = None

  if isinstance(query, str):
- text_query = query # Keep text for potential reranker use
+ text_query = query # Keep text for potential reranker use
  if not embedder:
- logger.error("Text query provided, but no embedder instance was passed to _perform_haystack_search.")
+ logger.error(
+ "Text query provided, but no embedder instance was passed to _perform_haystack_search."
+ )
  return []
  # No need to check type if the type hint is enforced upstream
  # if not isinstance(embedder, SentenceTransformersTextEmbedder):
@@ -254,15 +295,21 @@ def _perform_haystack_search(
  embedding_result = embedder.run(text=text_query)
  query_embedding = embedding_result.get("embedding")
  if not query_embedding:
- logger.error(f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'")
- return []
- logger.debug(f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking.")
+ logger.error(
+ f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'"
+ )
+ return []
+ logger.debug(
+ f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking."
+ )
  except Exception as e:
  logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
  return []
  elif isinstance(query, Path) or isinstance(query, Image.Image):
  # Currently, this function doesn't support multi-modal query embedding directly
- logger.error(f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text.")
+ logger.error(
+ f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
+ )
  return []
  else:
  # Handle other unexpected types
@@ -276,42 +323,52 @@ def _perform_haystack_search(
  # --- 2. Set up Retriever --- #
  # Assumes the document_store is ChromaDocumentStore for this utility function context
  if not ChromaEmbeddingRetriever:
- logger.error("ChromaEmbeddingRetriever not available.")
- return []
+ logger.error("ChromaEmbeddingRetriever not available.")
+ return []

  # Ensure retriever_top_k is set (should be by __post_init__)
  retriever_top_k = options.retriever_top_k
  if retriever_top_k is None:
- logger.warning("options.retriever_top_k was None, defaulting to options.top_k for retriever.")
- retriever_top_k = options.top_k
+ logger.warning(
+ "options.retriever_top_k was None, defaulting to options.top_k for retriever."
+ )
+ retriever_top_k = options.top_k

  # Instantiate the EMBEDDING retriever
- retriever = ChromaEmbeddingRetriever(document_store=document_store,
- filters=options.filters or {}, # Pass filters here
- top_k=retriever_top_k)
-
- logger.debug(f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})")
+ retriever = ChromaEmbeddingRetriever(
+ document_store=document_store,
+ filters=options.filters or {}, # Pass filters here
+ top_k=retriever_top_k,
+ )

+ logger.debug(
+ f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
+ )

  # --- 3. Set up Optional Reranker --- #
  reranker_instance = None
- if options.use_reranker in [True, None]: # Check specifically for True or None
+ if options.use_reranker in [True, None]: # Check specifically for True or None
  logger.debug("Attempting to initialize reranker...")
  # Currently only supports default text reranker (Cohere)
- reranker_instance = create_default_text_reranker(api_key=options.reranker_api_key,
- model_name=options.reranker_model or "rerank-english-v2.0")
+ reranker_instance = create_default_text_reranker(
+ api_key=options.reranker_api_key,
+ model_name=options.reranker_model or "rerank-english-v2.0",
+ )
  if reranker_instance:
- # Ensure reranker top_k matches final desired top_k
- reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
- logger.info(f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})")
+ # Ensure reranker top_k matches final desired top_k
+ reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
+ logger.info(
+ f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
+ )
  else:
- logger.warning("Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking.")
-
+ logger.warning(
+ "Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
+ )

  # --- 4. Build and Run Pipeline --- #
  if not Pipeline:
- logger.error("Haystack Pipeline class not available.")
- return []
+ logger.error("Haystack Pipeline class not available.")
+ return []

  search_pipeline = Pipeline()
  search_pipeline.add_component("retriever", retriever)
@@ -325,15 +382,20 @@ def _perform_haystack_search(
  search_pipeline.connect("retriever.documents", "reranker.documents")
  # Reranker also needs the query text and final top_k
  if text_query is None:
- logger.error("Reranker requires text query, but it was not available (query might not have been text).")
+ logger.error(
+ "Reranker requires text query, but it was not available (query might not have been text)."
+ )
  # Handle this case - maybe skip reranker or raise error?
  # For now, let's skip reranker if text is missing
  logger.warning("Skipping reranker because text query is missing.")
- reranker_instance = None # Effectively remove it from the logic below
- last_component_name = "retriever" # Reset last component
+ reranker_instance = None # Effectively remove it from the logic below
+ last_component_name = "retriever" # Reset last component
  # Remove reranker component if added? Less clean. Let's just not add its input.
  else:
- pipeline_input["reranker"] = {"query": text_query, "top_k": options.top_k} # Pass query and final top_k
+ pipeline_input["reranker"] = {
+ "query": text_query,
+ "top_k": options.top_k,
+ } # Pass query and final top_k
  last_component_name = "reranker"
  logger.debug("Added reranker to pipeline and configured input.")
  else:
@@ -341,7 +403,6 @@ def _perform_haystack_search(
  last_component_name = "reranker"
  logger.debug("Added reranker to pipeline.")

-
  logger.info("Running Haystack search pipeline...")
  try:
  result = search_pipeline.run(pipeline_input)
@@ -356,31 +417,34 @@ def _perform_haystack_search(
  # Check output based on last component in the pipeline
  if last_component_name in result and result[last_component_name].get("documents"):
  final_documents = result[last_component_name]["documents"]
- logger.debug(f"Processed results from '{last_component_name}' ({len(final_documents)} documents).")
+ logger.debug(
+ f"Processed results from '{last_component_name}' ({len(final_documents)} documents)."
+ )
  else:
- logger.warning(f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}")
+ logger.warning(
+ f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}"
+ )
  return []

  # Convert Haystack Documents to the desired output format
  output_results = []
- for doc in final_documents: # Correctly loop over final_documents
- # Check if doc is actually a Haystack Document object or potentially a dict
- doc_id = getattr(doc, 'id', None)
- doc_score = getattr(doc, 'score', 0.0)
- doc_content = getattr(doc, 'content', None)
- doc_meta = getattr(doc, 'meta', {})
-
- meta = doc_meta or {}
- output = {
- "pdf_path": meta.get("pdf_path", "Unknown"),
- "page_number": meta.get("page_number", -1),
- "score": doc_score if doc_score is not None else 0.0, # Handle potential None score
- "content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
- "metadata": meta,
- # "haystack_document": doc # Optionally include the full Haystack doc
- }
- output_results.append(output)
+ for doc in final_documents: # Correctly loop over final_documents
+ # Check if doc is actually a Haystack Document object or potentially a dict
+ doc_id = getattr(doc, "id", None)
+ doc_score = getattr(doc, "score", 0.0)
+ doc_content = getattr(doc, "content", None)
+ doc_meta = getattr(doc, "meta", {})
+
+ meta = doc_meta or {}
+ output = {
+ "pdf_path": meta.get("pdf_path", "Unknown"),
+ "page_number": meta.get("page_number", -1),
+ "score": doc_score if doc_score is not None else 0.0, # Handle potential None score
+ "content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
+ "metadata": meta,
+ # "haystack_document": doc # Optionally include the full Haystack doc
+ }
+ output_results.append(output)

  logger.info(f"Returning {len(output_results)} relevant results.")
  return output_results
-
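
Taken together, the query path above embeds the text query, retrieves with ChromaEmbeddingRetriever, optionally reranks with Cohere (when options.use_reranker is True/None and an API key is available via options.reranker_api_key or COHERE_API_KEY), and converts the resulting Haystack documents into plain dicts. A hedged end-to-end sketch follows; in normal use the package's search service layer drives this internal function, the collection is assumed to have been indexed beforehand, and the query string and collection name are invented:

```python
# Hedged sketch of the internal query path shown above (not the public API surface).
from natural_pdf.search.haystack_utils import (
    _perform_haystack_search,
    create_default_document_store,
    create_default_text_embedder,
)
from natural_pdf.search.search_options import TextSearchOptions

store = create_default_document_store(collection_name="my_reports")  # existing, indexed collection
embedder = create_default_text_embedder()
embedder.warm_up()  # load the model before run(); the factory does not warm the text embedder up

results = _perform_haystack_search(
    query="total inspections in 2019",   # example query text
    document_store=store,
    collection_name="my_reports",
    embedder=embedder,
    options=TextSearchOptions(top_k=5),
)

# Each result is a plain dict built by the conversion loop above:
# {"pdf_path", "page_number", "score", "content_snippet", "metadata"}
for hit in results:
    print(hit["page_number"], hit["score"], hit["content_snippet"][:60])
```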

natural_pdf/search/search_options.py

@@ -1,6 +1,6 @@
  import logging
  from dataclasses import dataclass, field
- from typing import List, Optional, Dict, Any, Tuple, Union, Literal
+ from typing import Any, Dict, List, Literal, Optional, Tuple, Union

  # Use object placeholders for external types to avoid direct dependency
  BaseRanker = object
@@ -8,10 +8,12 @@ BaseEmbedder = object

  logger = logging.getLogger(__name__)

+
  # --- Base Search Options ---
  @dataclass
  class BaseSearchOptions:
  """Base options for search operations."""
+
  # How many results to return finally (after retrieval and optional reranking)
  top_k: int = 10
  # How many candidates the retriever should fetch initially (relevant if reranking)
@@ -22,12 +24,12 @@ class BaseSearchOptions:

  # --- Reranking Configuration ---
  # Option 1: Simple boolean/None
- use_reranker: Optional[bool] = True # True=use default Cohere, False/None=disable
+ use_reranker: Optional[bool] = True # True=use default Cohere, False/None=disable
  # Option 2: Provide a specific instance (takes precedence over use_reranker boolean)
  reranker_instance: Optional[BaseRanker] = None
  # Parameters for default Cohere reranker (if use_reranker=True)
- reranker_model: Optional[str] = None # Defaults to "rerank-english-v2.0" in util
- reranker_api_key: Optional[str] = None # Defaults to COHERE_API_KEY env var
+ reranker_model: Optional[str] = None # Defaults to "rerank-english-v2.0" in util
+ reranker_api_key: Optional[str] = None # Defaults to COHERE_API_KEY env var

  # --- Embedder Configuration (Less common to override per-query, usually set at indexing) ---
  # embedder_instance: Optional[BaseEmbedder] = None # Might be useful for advanced cases
@@ -35,38 +37,47 @@ class BaseSearchOptions:
  def __post_init__(self):
  # Validate that top_k values make sense
  if self.retriever_top_k is None:
- # If retriever_top_k isn't set, default it based on reranking needs
- if self.use_reranker:
- self.retriever_top_k = max(self.top_k * 2, 20) # Fetch more if reranking
- else:
- self.retriever_top_k = self.top_k
+ # If retriever_top_k isn't set, default it based on reranking needs
+ if self.use_reranker:
+ self.retriever_top_k = max(self.top_k * 2, 20) # Fetch more if reranking
+ else:
+ self.retriever_top_k = self.top_k
  elif self.retriever_top_k < self.top_k:
- logger.warning(f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results.")
+ logger.warning(
+ f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results."
+ )
+

  # --- Text Search Specific Options ---
  @dataclass
  class TextSearchOptions(BaseSearchOptions):
  """Options specific to text-based semantic search."""
+
  # Add any text-specific overrides or parameters here if needed in the future
  # e.g., specifying default text reranker model name if different defaults emerge
  # default_text_reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
- pass # Currently inherits all base options
+ pass # Currently inherits all base options
+

  # --- MultiModal Search Specific Options ---
  @dataclass
  class MultiModalSearchOptions(BaseSearchOptions):
  """Options specific to multimodal semantic search."""
+
  # Flag to potentially use a default multimodal reranker if available
  # (overrides base use_reranker=True if reranker_instance is None)
- use_multimodal_reranker: bool = True # Attempt multimodal rerank if use_reranker=True/None and no instance given
+ use_multimodal_reranker: bool = (
+ True # Attempt multimodal rerank if use_reranker=True/None and no instance given
+ )
  # e.g., specifying default multimodal embedder/reranker models
  # default_multimodal_embedder_model: str = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
  # default_multimodal_reranker_model: str = "jinaai/jina-reranker-m0" # Example

+
  # --- Union Type ---
  # Defines the types allowed for search configuration.
  SearchOptions = Union[
  TextSearchOptions,
  MultiModalSearchOptions,
- BaseSearchOptions # Include base for typing flexibility
- ]
+ BaseSearchOptions, # Include base for typing flexibility
+ ]
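
The __post_init__ logic shown above derives retriever_top_k when it is not supplied. A small example of the resulting values, which follows directly from the code in this hunk (it assumes natural-pdf 0.1.5 is installed so the module can be imported):

```python
# Values follow directly from BaseSearchOptions.__post_init__ shown above.
from natural_pdf.search.search_options import TextSearchOptions

opts = TextSearchOptions(top_k=5)            # use_reranker defaults to True, retriever_top_k unset
print(opts.retriever_top_k)                  # 20 == max(5 * 2, 20): fetch extra candidates for reranking

no_rerank = TextSearchOptions(top_k=5, use_reranker=False)
print(no_rerank.retriever_top_k)             # 5: retriever fetches exactly top_k
```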

natural_pdf/search/search_service_protocol.py

@@ -1,15 +1,18 @@
  """Defines the protocol for a search service."""
- from typing import Protocol, List, Dict, Any, Optional, Union, Iterable
+
  from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Protocol, Union
+
  from PIL import Image

+ # Forward declare SearchOptions to avoid circular import if needed,
+ # or import if structure allows (assuming it's safe here)
+ from natural_pdf.search.search_options import BaseSearchOptions, SearchOptions
+
  # Use typing_extensions for Python < 3.8 compatibility if needed,
  # otherwise, typing.Protocol is fine for >= 3.8
  # from typing_extensions import Protocol

- # Forward declare SearchOptions to avoid circular import if needed,
- # or import if structure allows (assuming it's safe here)
- from natural_pdf.search.search_options import SearchOptions, BaseSearchOptions

  # Use Dict as placeholder for external Haystack Document type
  HaystackDocument = Dict[str, Any]
@@ -17,12 +20,14 @@ HaystackDocument = Dict[str, Any]

  class IndexConfigurationError(RuntimeError):
  """Custom exception for configuration mismatches during indexing."""
+
  pass


  # Add new exception for sync/init safety
  class IndexExistsError(RuntimeError):
  """Raised when attempting to index implicitly to an existing persistent index without force_reindex=True."""
+
  pass


@@ -66,6 +71,7 @@ class SearchServiceProtocol(Protocol):
  with a chosen search backend (e.g., Haystack with ChromaDB, Haystack In-Memory).
  An instance of a service implementing this protocol is tied to a specific collection name.
  """
+
  collection_name: str
  # Removed internal state hints (_persist, _embedding_model) - implementation detail

@@ -98,7 +104,7 @@ class SearchServiceProtocol(Protocol):

  def search(
  self,
- query: Any, # Allow any query type, service implementation handles it
+ query: Any, # Allow any query type, service implementation handles it
  options: BaseSearchOptions,
  ) -> List[Dict[str, Any]]:
  """
@@ -186,4 +192,4 @@ class SearchServiceProtocol(Protocol):
  ...

  # Optional: Add methods for getting index stats, etc.
- # def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
+ # def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
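
Only the collection_name attribute and the search() signature of SearchServiceProtocol appear in these hunks; the protocol defines further indexing methods that are not shown here. A toy structural implementation of just the visible surface, with an invented class name and naive matching logic, purely to illustrate the shape:

```python
# Toy sketch of the visible protocol surface (names and matching logic are made up).
from typing import Any, Dict, List

from natural_pdf.search.search_options import BaseSearchOptions


class NaiveSubstringSearchService:
    """Satisfies the fragment shown above: a collection_name attribute and search()."""

    def __init__(self, collection_name: str) -> None:
        self.collection_name = collection_name
        self._docs: List[Dict[str, Any]] = []  # stand-in for a real document store

    def search(self, query: Any, options: BaseSearchOptions) -> List[Dict[str, Any]]:
        needle = str(query).lower()
        hits = [d for d in self._docs if needle in str(d.get("content", "")).lower()]
        return hits[: options.top_k]
```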