natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +209 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +288 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +413 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +512 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +604 -0
  56. docs/tutorials/12-ocr-integration.md +175 -0
  57. docs/tutorials/13-semantic-search.ipynb +1328 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +50 -33
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/gemini.py +264 -0
  67. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  68. natural_pdf/analyzers/layout/layout_manager.py +125 -58
  69. natural_pdf/analyzers/layout/layout_options.py +43 -17
  70. natural_pdf/analyzers/layout/paddle.py +152 -95
  71. natural_pdf/analyzers/layout/surya.py +164 -92
  72. natural_pdf/analyzers/layout/tatr.py +149 -84
  73. natural_pdf/analyzers/layout/yolo.py +89 -45
  74. natural_pdf/analyzers/text_options.py +22 -15
  75. natural_pdf/analyzers/text_structure.py +131 -85
  76. natural_pdf/analyzers/utils.py +30 -23
  77. natural_pdf/collections/pdf_collection.py +146 -97
  78. natural_pdf/core/__init__.py +1 -1
  79. natural_pdf/core/element_manager.py +419 -337
  80. natural_pdf/core/highlighting_service.py +268 -196
  81. natural_pdf/core/page.py +1044 -521
  82. natural_pdf/core/pdf.py +516 -313
  83. natural_pdf/elements/__init__.py +1 -1
  84. natural_pdf/elements/base.py +307 -225
  85. natural_pdf/elements/collections.py +805 -543
  86. natural_pdf/elements/line.py +39 -36
  87. natural_pdf/elements/rect.py +32 -30
  88. natural_pdf/elements/region.py +889 -879
  89. natural_pdf/elements/text.py +127 -99
  90. natural_pdf/exporters/__init__.py +0 -1
  91. natural_pdf/exporters/searchable_pdf.py +261 -102
  92. natural_pdf/ocr/__init__.py +57 -35
  93. natural_pdf/ocr/engine.py +150 -46
  94. natural_pdf/ocr/engine_easyocr.py +146 -150
  95. natural_pdf/ocr/engine_paddle.py +118 -175
  96. natural_pdf/ocr/engine_surya.py +78 -141
  97. natural_pdf/ocr/ocr_factory.py +114 -0
  98. natural_pdf/ocr/ocr_manager.py +122 -124
  99. natural_pdf/ocr/ocr_options.py +16 -20
  100. natural_pdf/ocr/utils.py +98 -0
  101. natural_pdf/qa/__init__.py +1 -1
  102. natural_pdf/qa/document_qa.py +119 -111
  103. natural_pdf/search/__init__.py +37 -31
  104. natural_pdf/search/haystack_search_service.py +312 -189
  105. natural_pdf/search/haystack_utils.py +186 -122
  106. natural_pdf/search/search_options.py +25 -14
  107. natural_pdf/search/search_service_protocol.py +12 -6
  108. natural_pdf/search/searchable_mixin.py +261 -176
  109. natural_pdf/selectors/__init__.py +2 -1
  110. natural_pdf/selectors/parser.py +159 -316
  111. natural_pdf/templates/__init__.py +1 -1
  112. natural_pdf/templates/spa/css/style.css +334 -0
  113. natural_pdf/templates/spa/index.html +31 -0
  114. natural_pdf/templates/spa/js/app.js +472 -0
  115. natural_pdf/templates/spa/words.txt +235976 -0
  116. natural_pdf/utils/debug.py +32 -0
  117. natural_pdf/utils/highlighting.py +8 -2
  118. natural_pdf/utils/identifiers.py +29 -0
  119. natural_pdf/utils/packaging.py +418 -0
  120. natural_pdf/utils/reading_order.py +65 -63
  121. natural_pdf/utils/text_extraction.py +195 -0
  122. natural_pdf/utils/visualization.py +70 -61
  123. natural_pdf/widgets/__init__.py +2 -3
  124. natural_pdf/widgets/viewer.py +749 -718
  125. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
  126. natural_pdf-0.1.6.dist-info/RECORD +141 -0
  127. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
  128. natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
  129. notebooks/Examples.ipynb +1293 -0
  130. pdfs/.gitkeep +0 -0
  131. pdfs/01-practice.pdf +543 -0
  132. pdfs/0500000US42001.pdf +0 -0
  133. pdfs/0500000US42007.pdf +0 -0
  134. pdfs/2014 Statistics.pdf +0 -0
  135. pdfs/2019 Statistics.pdf +0 -0
  136. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  137. pdfs/needs-ocr.pdf +0 -0
  138. natural_pdf/templates/ocr_debug.html +0 -517
  139. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  140. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  141. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,32 @@
1
1
  """Implementation of the SearchServiceProtocol using Haystack components."""
2
2
 
3
+ import copy
3
4
  import logging
4
5
  import os
5
- from typing import List, Dict, Any, Optional, Union, Iterable
6
6
  from pathlib import Path
7
+ from typing import Any, Dict, Iterable, List, Optional, Union
8
+
7
9
  from PIL import Image
8
- import copy
9
10
 
10
11
  # --- Haystack Imports ---
11
12
  try:
12
13
  import haystack
13
14
  from haystack import Pipeline
14
- from haystack.dataclasses import Document as HaystackDocument
15
- from haystack.document_stores.types import DocumentStore, DuplicatePolicy
16
- from haystack_integrations.document_stores.chroma import ChromaDocumentStore
17
- from haystack.document_stores.in_memory import InMemoryDocumentStore
18
15
  from haystack.components.embedders import (
16
+ SentenceTransformersDocumentEmbedder,
19
17
  SentenceTransformersTextEmbedder,
20
- SentenceTransformersDocumentEmbedder
21
18
  )
19
+
22
20
  # Import necessary retrievers, rankers etc. as needed for search()
23
- from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
24
- from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever # Use embedding retriever
21
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
22
+ from haystack.dataclasses import Document as HaystackDocument
23
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
24
+ from haystack.document_stores.types import DocumentStore, DuplicatePolicy
25
+ from haystack_integrations.components.retrievers.chroma import ( # Use embedding retriever
26
+ ChromaEmbeddingRetriever,
27
+ )
28
+ from haystack_integrations.document_stores.chroma import ChromaDocumentStore
29
+
25
30
  # Need Ranker if used
26
31
  try:
27
32
  from haystack.components.rankers import CohereRanker
@@ -39,7 +44,7 @@ except ImportError:
39
44
  SentenceTransformersDocumentEmbedder = None
40
45
  SentenceTransformersTextEmbedder = None
41
46
  InMemoryEmbeddingRetriever = None
42
- ChromaEmbeddingRetriever = None # Fallback definition
47
+ ChromaEmbeddingRetriever = None # Fallback definition
43
48
  CohereRanker = None
44
49
  Pipeline = None
45
50
  DuplicatePolicy = None
@@ -47,16 +52,27 @@ except ImportError:
47
52
  # --- ChromaDB Client Import (for management) ---
48
53
  try:
49
54
  import chromadb
55
+
50
56
  CHROMADB_AVAILABLE = True
51
57
  except ImportError:
52
58
  chromadb = None
53
59
  CHROMADB_AVAILABLE = False
54
60
 
61
+ from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
62
+ from .search_options import (
63
+ BaseSearchOptions,
64
+ MultiModalSearchOptions,
65
+ SearchOptions,
66
+ TextSearchOptions,
67
+ )
68
+
55
69
  # --- Local Imports ---
56
- from .search_service_protocol import SearchServiceProtocol, IndexConfigurationError, IndexExistsError
57
- from .search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
58
- from .search_service_protocol import Indexable
59
- from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
70
+ from .search_service_protocol import (
71
+ Indexable,
72
+ IndexConfigurationError,
73
+ IndexExistsError,
74
+ SearchServiceProtocol,
75
+ )
60
76
 
61
77
  # --- Logging ---
62
78
  logger = logging.getLogger(__name__)
@@ -65,6 +81,7 @@ logger = logging.getLogger(__name__)
65
81
  DEFAULT_PERSIST_PATH = "./natural_pdf_index"
66
82
  DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
67
83
 
84
+
68
85
  class HaystackSearchService(SearchServiceProtocol):
69
86
  """
70
87
  Haystack-based implementation of the search service protocol.
@@ -77,9 +94,9 @@ class HaystackSearchService(SearchServiceProtocol):
77
94
  def __init__(
78
95
  self,
79
96
  collection_name: str,
80
- persist: bool = False, # Store type configuration
97
+ persist: bool = False, # Store type configuration
81
98
  default_persist_path: str = DEFAULT_PERSIST_PATH,
82
- embedding_model: str = DEFAULT_EMBEDDING_MODEL # Renamed for clarity
99
+ embedding_model: str = DEFAULT_EMBEDDING_MODEL, # Renamed for clarity
83
100
  ):
84
101
  """
85
102
  Initialize the service for a specific collection.
@@ -92,18 +109,24 @@ class HaystackSearchService(SearchServiceProtocol):
92
109
  embedding_model: The embedding model this service instance will use.
93
110
  """
94
111
  if not HAS_HAYSTACK_EXTRAS:
95
- raise ImportError("HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]")
112
+ raise ImportError(
113
+ "HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
114
+ )
96
115
 
97
- self.collection_name = collection_name # Store the collection name
98
- self._persist = persist # Store the persistence type for this instance
116
+ self.collection_name = collection_name # Store the collection name
117
+ self._persist = persist # Store the persistence type for this instance
99
118
  self._default_persist_path = default_persist_path
100
- self._embedding_model = embedding_model # Store the configured model
101
-
119
+ self._embedding_model = embedding_model # Store the configured model
120
+
102
121
  # Dictionary to hold InMemoryDocumentStore instances if not persisting
103
- self._in_memory_store: Optional[InMemoryDocumentStore] = None if persist else InMemoryDocumentStore()
104
- self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
122
+ self._in_memory_store: Optional[InMemoryDocumentStore] = (
123
+ None if persist else InMemoryDocumentStore()
124
+ )
125
+ self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
105
126
 
106
- logger.info(f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'")
127
+ logger.info(
128
+ f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'"
129
+ )
107
130
 
108
131
  # --- Internal Helper Methods --- #
109
132
 
@@ -114,27 +137,34 @@ class HaystackSearchService(SearchServiceProtocol):
114
137
  # Use the instance's configured persistence type and collection name
115
138
  if self._persist:
116
139
  if self._chroma_store is None:
117
- # Lazy load Chroma store
118
- logger.debug(f"Initializing ChromaDocumentStore for collection '{self.collection_name}'.")
119
- self._chroma_store = ChromaDocumentStore(
120
- persist_path=self._default_persist_path,
121
- collection_name=self.collection_name # Use instance name
122
- )
140
+ # Lazy load Chroma store
141
+ logger.debug(
142
+ f"Initializing ChromaDocumentStore for collection '{self.collection_name}'."
143
+ )
144
+ self._chroma_store = ChromaDocumentStore(
145
+ persist_path=self._default_persist_path,
146
+ collection_name=self.collection_name, # Use instance name
147
+ )
123
148
  return self._chroma_store
124
149
  else:
125
150
  # Return the instance's InMemory store
126
- if self._in_memory_store is None: # Should have been created in __init__ if persist=False
127
- logger.warning(f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now.")
128
- self._in_memory_store = InMemoryDocumentStore()
151
+ if (
152
+ self._in_memory_store is None
153
+ ): # Should have been created in __init__ if persist=False
154
+ logger.warning(
155
+ f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
156
+ )
157
+ self._in_memory_store = InMemoryDocumentStore()
129
158
  return self._in_memory_store
130
159
 
131
160
  def _get_document_embedder(
132
- self,
133
- device: Optional[str] = None
161
+ self, device: Optional[str] = None
134
162
  ) -> SentenceTransformersDocumentEmbedder:
135
163
  """Creates the Haystack document embedder component."""
136
- model_name = self._embedding_model # Use instance model
137
- logger.debug(f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")
164
+ model_name = self._embedding_model # Use instance model
165
+ logger.debug(
166
+ f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
167
+ )
138
168
  if not SentenceTransformersDocumentEmbedder:
139
169
  raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
140
170
  try:
@@ -143,56 +173,80 @@ class HaystackSearchService(SearchServiceProtocol):
143
173
  device=device,
144
174
  )
145
175
  embedder.warm_up()
146
- logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
176
+ logger.info(
177
+ f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
178
+ )
147
179
  return embedder
148
180
  except Exception as e:
149
- logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
150
- raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
181
+ logger.error(
182
+ f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
183
+ )
184
+ raise RuntimeError(
185
+ f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
186
+ ) from e
151
187
 
152
- def _get_text_embedder(
153
- self,
154
- device: Optional[str] = None
155
- ) -> SentenceTransformersTextEmbedder:
188
+ def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
156
189
  """Creates the Haystack text embedder component (for queries)."""
157
- model_name = self._embedding_model # Use instance model
158
- logger.debug(f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}")
190
+ model_name = self._embedding_model # Use instance model
191
+ logger.debug(
192
+ f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
193
+ )
159
194
  if not SentenceTransformersTextEmbedder:
160
195
  raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
161
196
  try:
162
197
  embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
163
- embedder.warm_up()
164
- logger.info(f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
198
+ embedder.warm_up()
199
+ logger.info(
200
+ f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
201
+ )
165
202
  return embedder
166
203
  except Exception as e:
167
- logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
168
- raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
204
+ logger.error(
205
+ f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True
206
+ )
207
+ raise RuntimeError(
208
+ f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
209
+ ) from e
169
210
 
170
211
  def _delete_chroma_collection(self) -> bool:
171
212
  """Internal helper to delete the ChromaDB collection managed by this service."""
172
213
  if not CHROMADB_AVAILABLE:
173
- logger.error("Cannot delete ChromaDB collection because 'chromadb' library is not installed.")
214
+ logger.error(
215
+ "Cannot delete ChromaDB collection because 'chromadb' library is not installed."
216
+ )
174
217
  raise ImportError("'chromadb' library required for collection deletion.")
175
218
  if not self._persist:
176
- logger.warning("Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring.")
177
- return False # Cannot delete if not persistent
219
+ logger.warning(
220
+ "Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring."
221
+ )
222
+ return False # Cannot delete if not persistent
178
223
  try:
179
- collection_name_to_delete = self.collection_name # Use instance collection name
180
- logger.warning(f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'.")
224
+ collection_name_to_delete = self.collection_name # Use instance collection name
225
+ logger.warning(
226
+ f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'."
227
+ )
181
228
  chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
182
229
  try:
183
230
  chroma_client.delete_collection(name=collection_name_to_delete)
184
- logger.info(f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'.")
185
- self._chroma_store = None # Reset lazy-loaded store
231
+ logger.info(
232
+ f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'."
233
+ )
234
+ self._chroma_store = None # Reset lazy-loaded store
186
235
  return True
187
236
  except chromadb.errors.InvalidCollectionException:
188
- logger.info(f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed.")
189
- return True # Deletion is effectively successful
237
+ logger.info(
238
+ f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed."
239
+ )
240
+ return True # Deletion is effectively successful
190
241
  finally:
191
- pass # Cleanup if needed
242
+ pass # Cleanup if needed
192
243
  except ImportError as ie:
193
244
  raise ie
194
245
  except Exception as e:
195
- logger.error(f"Error during ChromaDB collection deletion '{self.collection_name}': {e}", exc_info=True)
246
+ logger.error(
247
+ f"Error during ChromaDB collection deletion '{self.collection_name}': {e}",
248
+ exc_info=True,
249
+ )
196
250
  # Don't raise here, let index() decide based on force_reindex
197
251
  return False
198
252
 
@@ -200,37 +254,43 @@ class HaystackSearchService(SearchServiceProtocol):
200
254
 
201
255
  def index(
202
256
  self,
203
- documents: Iterable[Indexable], # Accept Indexable objects
257
+ documents: Iterable[Indexable], # Accept Indexable objects
204
258
  embedder_device: Optional[str] = None,
205
259
  force_reindex: bool = False,
206
260
  ) -> None:
207
261
  # Need to consume the iterable to log count, or log differently
208
262
  # Let's convert to list for now, assuming size isn't prohibitive
209
263
  indexable_list = list(documents)
210
- logger.info(f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}")
211
-
264
+ logger.info(
265
+ f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
266
+ )
267
+
212
268
  if not indexable_list:
213
- logger.warning("No documents provided for indexing. Skipping.")
214
- return
269
+ logger.warning("No documents provided for indexing. Skipping.")
270
+ return
215
271
 
216
272
  # --- 1. Handle Reindexing (Deletion before store/embedder init) ---
217
273
  if force_reindex:
218
- logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
219
- if self._persist:
220
- # Attempt deletion, raises ImportError if chromadb missing
221
- deleted = self._delete_chroma_collection() # Uses self.collection_name
222
- if not deleted:
223
- # If deletion failed for other reasons, log and continue cautiously
224
- logger.warning("Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere.")
225
- else:
226
- # For InMemory, force_reindex means we want a fresh store instance.
227
- # Re-initialize the instance's in-memory store
228
- logger.info(f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'.")
229
- self._in_memory_store = InMemoryDocumentStore() # Create a new one
274
+ logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
275
+ if self._persist:
276
+ # Attempt deletion, raises ImportError if chromadb missing
277
+ deleted = self._delete_chroma_collection() # Uses self.collection_name
278
+ if not deleted:
279
+ # If deletion failed for other reasons, log and continue cautiously
280
+ logger.warning(
281
+ "Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
282
+ )
283
+ else:
284
+ # For InMemory, force_reindex means we want a fresh store instance.
285
+ # Re-initialize the instance's in-memory store
286
+ logger.info(
287
+ f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'."
288
+ )
289
+ self._in_memory_store = InMemoryDocumentStore() # Create a new one
230
290
 
231
291
  # REMOVED try...except around store retrieval
232
292
  # Let store initialization errors propagate directly
233
- store = self._get_store() # No argument needed
293
+ store = self._get_store() # No argument needed
234
294
 
235
295
  # --- 3. Create Embedder ---
236
296
  # Errors during embedder creation will propagate from the helper
@@ -243,42 +303,55 @@ class HaystackSearchService(SearchServiceProtocol):
243
303
  for item in indexable_list:
244
304
  doc_id = item.get_id()
245
305
  metadata = item.get_metadata()
246
- content_obj = item.get_content() # This might be Page, Region, etc.
306
+ content_obj = item.get_content() # This might be Page, Region, etc.
247
307
 
248
308
  # Determine content based on embedder type and content object
249
309
  # For now, assume text content is needed and try to extract it
250
310
  content_text = ""
251
311
  if isinstance(content_obj, str):
252
- # If get_content() already returned text
253
- content_text = content_obj
254
- elif hasattr(content_obj, 'extract_text') and callable(getattr(content_obj, 'extract_text')):
255
- # If content object has extract_text (like Page or Region)
256
- try:
257
- content_text = content_obj.extract_text()
258
- if not isinstance(content_text, str):
259
- logger.warning(f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str().")
260
- content_text = str(content_obj)
261
- except Exception as extraction_error:
262
- logger.error(f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().", exc_info=False)
263
- content_text = str(content_obj)
312
+ # If get_content() already returned text
313
+ content_text = content_obj
314
+ elif hasattr(content_obj, "extract_text") and callable(
315
+ getattr(content_obj, "extract_text")
316
+ ):
317
+ # If content object has extract_text (like Page or Region)
318
+ try:
319
+ content_text = content_obj.extract_text()
320
+ if not isinstance(content_text, str):
321
+ logger.warning(
322
+ f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str()."
323
+ )
324
+ content_text = str(content_obj)
325
+ except Exception as extraction_error:
326
+ logger.error(
327
+ f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().",
328
+ exc_info=False,
329
+ )
330
+ content_text = str(content_obj)
264
331
  else:
265
- # Attempt to convert to string as fallback if no obvious text method
266
- logger.warning(f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str().")
267
- content_text = str(content_obj)
332
+ # Attempt to convert to string as fallback if no obvious text method
333
+ logger.warning(
334
+ f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
335
+ )
336
+ content_text = str(content_obj)
268
337
 
269
338
  # Construct HaystackDocument using data from Indexable protocol methods
270
339
  haystack_doc = HaystackDocument(
271
- id=doc_id, # Use ID from get_id()
340
+ id=doc_id, # Use ID from get_id()
272
341
  content=content_text,
273
- meta=metadata # Use metadata from get_metadata()
342
+ meta=metadata, # Use metadata from get_metadata()
274
343
  )
275
344
  haystack_docs_to_embed.append(haystack_doc)
276
345
 
277
346
  if not haystack_docs_to_embed:
278
- logger.warning("No Haystack documents were prepared. Check conversion logic and input data.")
279
- return
347
+ logger.warning(
348
+ "No Haystack documents were prepared. Check conversion logic and input data."
349
+ )
350
+ return
280
351
 
281
- logger.info(f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'...")
352
+ logger.info(
353
+ f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
354
+ )
282
355
  try:
283
356
  # Embed the documents
284
357
  embedding_results = embedder.run(documents=haystack_docs_to_embed)
@@ -291,33 +364,42 @@ class HaystackSearchService(SearchServiceProtocol):
291
364
  error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
292
365
  if self._persist:
293
366
  error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
294
- error_msg += "Try deleting the persistent storage directory or using force_reindex=True."
367
+ error_msg += (
368
+ "Try deleting the persistent storage directory or using force_reindex=True."
369
+ )
295
370
  else:
296
- error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
371
+ error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
297
372
  logger.error(error_msg, exc_info=True)
298
373
  raise IndexConfigurationError(error_msg) from dim_error
299
374
  # REMOVED broad except Exception for embedding errors. Let them propagate.
300
375
 
301
376
  # --- 5. Write Embedded Documents to Store ---
302
- logger.info(f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'...")
377
+ logger.info(
378
+ f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'..."
379
+ )
303
380
  # REMOVED try...except around store writing. Let errors propagate.
304
381
  write_result = store.write_documents(
305
- documents=embedded_docs,
306
- policy=DuplicatePolicy.OVERWRITE # Or configure as needed
382
+ documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE # Or configure as needed
383
+ )
384
+ logger.info(
385
+ f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
307
386
  )
308
- logger.info(f"Successfully wrote {write_result} documents to store '{self.collection_name}'.")
309
387
  # --- Add explicit count check after writing ---
310
- logger.info(f"Store '{self.collection_name}' document count after write: {store.count_documents()}")
388
+ logger.info(
389
+ f"Store '{self.collection_name}' document count after write: {store.count_documents()}"
390
+ )
311
391
  # --- End count check ---
312
392
 
313
393
  def search(
314
394
  self,
315
- query: Any, # Changed from Union[str, Path, Image.Image] to Any
395
+ query: Any, # Changed from Union[str, Path, Image.Image] to Any
316
396
  options: BaseSearchOptions,
317
397
  ) -> List[Dict[str, Any]]:
318
- logger.info(f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}")
398
+ logger.info(
399
+ f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}"
400
+ )
319
401
 
320
- store = self._get_store() # Let errors propagate
402
+ store = self._get_store() # Let errors propagate
321
403
 
322
404
  # --- 1. Handle Query Type and Embedding ---
323
405
  # This implementation currently only supports text query embedding.
@@ -325,44 +407,63 @@ class HaystackSearchService(SearchServiceProtocol):
325
407
  query_embedding = None
326
408
  query_text = ""
327
409
  if isinstance(query, (str, os.PathLike)):
328
- if isinstance(query, os.PathLike):
329
- logger.warning("Image path query received, but multimodal search not fully implemented. Treating as text path string.")
330
- query_text = str(query)
331
- else:
332
- query_text = query
333
-
334
- text_embedder = self._get_text_embedder()
335
- embedding_result = text_embedder.run(text=query_text)
336
- query_embedding = embedding_result["embedding"]
337
- if not query_embedding:
338
- raise ValueError("Text embedder did not return an embedding for the query.")
339
- logger.debug(f"Successfully generated query text embedding (dim: {len(query_embedding)}).")
410
+ if isinstance(query, os.PathLike):
411
+ logger.warning(
412
+ "Image path query received, but multimodal search not fully implemented. Treating as text path string."
413
+ )
414
+ query_text = str(query)
415
+ else:
416
+ query_text = query
417
+
418
+ text_embedder = self._get_text_embedder()
419
+ embedding_result = text_embedder.run(text=query_text)
420
+ query_embedding = embedding_result["embedding"]
421
+ if not query_embedding:
422
+ raise ValueError("Text embedder did not return an embedding for the query.")
423
+ logger.debug(
424
+ f"Successfully generated query text embedding (dim: {len(query_embedding)})."
425
+ )
340
426
 
341
427
  elif isinstance(query, Image.Image):
342
- logger.error("Multimodal query (PIL Image) is not yet supported by this service implementation.")
343
- raise NotImplementedError("Search with PIL Image queries is not implemented in HaystackSearchService.")
428
+ logger.error(
429
+ "Multimodal query (PIL Image) is not yet supported by this service implementation."
430
+ )
431
+ raise NotImplementedError(
432
+ "Search with PIL Image queries is not implemented in HaystackSearchService."
433
+ )
344
434
  # Check if query is Indexable and try extracting text?
345
- elif hasattr(query, 'extract_text') and callable(getattr(query, 'extract_text')):
346
- logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text for search.")
347
- try:
348
- query_text = query.extract_text()
349
- if not query_text or not query_text.strip():
350
- logger.warning(f"Query object {type(query).__name__} provided empty text. Returning no results.")
351
- return []
352
- # Embed the extracted text
353
- text_embedder = self._get_text_embedder()
354
- embedding_result = text_embedder.run(text=query_text)
355
- query_embedding = embedding_result["embedding"]
356
- if not query_embedding:
357
- raise ValueError(f"Text embedder did not return an embedding for text extracted from {type(query).__name__}.")
358
- logger.debug(f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)}).")
359
- except Exception as e:
360
- logger.error(f"Failed to extract or embed text from query object {type(query).__name__}: {e}", exc_info=True)
361
- raise RuntimeError("Query text extraction or embedding failed.") from e
435
+ elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
436
+ logger.debug(
437
+ f"Query type {type(query).__name__} has extract_text. Extracting text for search."
438
+ )
439
+ try:
440
+ query_text = query.extract_text()
441
+ if not query_text or not query_text.strip():
442
+ logger.warning(
443
+ f"Query object {type(query).__name__} provided empty text. Returning no results."
444
+ )
445
+ return []
446
+ # Embed the extracted text
447
+ text_embedder = self._get_text_embedder()
448
+ embedding_result = text_embedder.run(text=query_text)
449
+ query_embedding = embedding_result["embedding"]
450
+ if not query_embedding:
451
+ raise ValueError(
452
+ f"Text embedder did not return an embedding for text extracted from {type(query).__name__}."
453
+ )
454
+ logger.debug(
455
+ f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)})."
456
+ )
457
+ except Exception as e:
458
+ logger.error(
459
+ f"Failed to extract or embed text from query object {type(query).__name__}: {e}",
460
+ exc_info=True,
461
+ )
462
+ raise RuntimeError("Query text extraction or embedding failed.") from e
362
463
 
363
464
  else:
364
- # Raise specific error for unsupported types by this implementation
365
- raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
465
+ # Raise specific error for unsupported types by this implementation
466
+ raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
366
467
 
367
468
  # --- 2. Select Retriever based on Store Type ---
368
469
  retriever = None
@@ -371,10 +472,10 @@ class HaystackSearchService(SearchServiceProtocol):
371
472
  raise ImportError("ChromaEmbeddingRetriever is required but not available.")
372
473
  retriever = ChromaEmbeddingRetriever(document_store=store)
373
474
  elif isinstance(store, InMemoryDocumentStore):
374
- retriever = InMemoryEmbeddingRetriever(document_store=store)
475
+ retriever = InMemoryEmbeddingRetriever(document_store=store)
375
476
  else:
376
- # Raise specific error for unsupported store
377
- raise TypeError(f"Cannot perform search with store type {type(store)}.")
477
+ # Raise specific error for unsupported store
478
+ raise TypeError(f"Cannot perform search with store type {type(store)}.")
378
479
 
379
480
  # --- 3. Build Retrieval Pipeline ---
380
481
  pipeline = Pipeline()
@@ -385,13 +486,10 @@ class HaystackSearchService(SearchServiceProtocol):
385
486
  # --- 4. Prepare Filters (remains the same) ---
386
487
  haystack_filters = options.filters
387
488
  if haystack_filters:
388
- logger.debug(f"Applying filters: {haystack_filters}")
489
+ logger.debug(f"Applying filters: {haystack_filters}")
389
490
 
390
491
  # --- 5. Prepare Retriever Input Data (Dynamically) ---
391
- retriever_input_data = {
392
- "filters": haystack_filters,
393
- "top_k": options.top_k
394
- }
492
+ retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
395
493
  # Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
396
494
  retriever_input_data["query_embedding"] = query_embedding
397
495
  logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
@@ -415,23 +513,25 @@ class HaystackSearchService(SearchServiceProtocol):
415
513
  meta_with_hash = doc.meta
416
514
  # No need to explicitly add hash here if Haystack store preserves it
417
515
  result_dict = {
418
- "content_snippet": doc.content[:200] if doc.content else "",
419
- "score": doc.score if doc.score is not None else 0.0,
420
- "page_number": meta_with_hash.get("page_number", None),
421
- "pdf_path": meta_with_hash.get("pdf_path", None),
422
- "metadata": meta_with_hash, # Pass full metadata
423
- # "_haystack_document": doc # Optionally include full object
516
+ "content_snippet": doc.content[:200] if doc.content else "",
517
+ "score": doc.score if doc.score is not None else 0.0,
518
+ "page_number": meta_with_hash.get("page_number", None),
519
+ "pdf_path": meta_with_hash.get("pdf_path", None),
520
+ "metadata": meta_with_hash, # Pass full metadata
521
+ # "_haystack_document": doc # Optionally include full object
424
522
  }
425
523
  final_results.append(result_dict)
426
524
  return final_results
427
525
  else:
428
- logger.warning("Pipeline result did not contain expected retriever output.")
429
- return []
526
+ logger.warning("Pipeline result did not contain expected retriever output.")
527
+ return []
430
528
 
431
529
  except FileNotFoundError:
432
- # Keep specific catch for collection not found during retrieval
433
- logger.error(f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'.")
434
- raise # Re-raise the specific FileNotFoundError
530
+ # Keep specific catch for collection not found during retrieval
531
+ logger.error(
532
+ f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'."
533
+ )
534
+ raise # Re-raise the specific FileNotFoundError
435
535
  # REMOVED broad except Exception for pipeline execution. Let errors propagate.
436
536
 
437
537
  def delete_index(
@@ -449,9 +549,11 @@ class HaystackSearchService(SearchServiceProtocol):
449
549
  return self._delete_chroma_collection()
450
550
  else:
451
551
  # For InMemory, "deleting" means re-initializing the store
452
- logger.info(f"Re-initializing InMemory store for '{self.collection_name}' as deletion request.")
552
+ logger.info(
553
+ f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
554
+ )
453
555
  self._in_memory_store = InMemoryDocumentStore()
454
- return True # Considered successful
556
+ return True # Considered successful
455
557
 
456
558
  def index_exists(
457
559
  self,
@@ -462,59 +564,80 @@ class HaystackSearchService(SearchServiceProtocol):
462
564
  For InMemory, it checks if the internal store object exists and has documents.
463
565
  """
464
566
  logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
465
- store = self._get_store() # Get the store instance
567
+ store = self._get_store() # Get the store instance
466
568
  try:
467
569
  count = store.count_documents()
468
570
  exists = count > 0
469
- logger.debug(f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}")
571
+ logger.debug(
572
+ f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}"
573
+ )
470
574
  return exists
471
575
  except Exception as e:
472
- # Catch errors during count_documents (e.g., connection error for persistent stores)
473
- logger.warning(f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}", exc_info=False)
474
- # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
475
- # Assume not exists if count fails
476
- return False
576
+ # Catch errors during count_documents (e.g., connection error for persistent stores)
577
+ logger.warning(
578
+ f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}",
579
+ exc_info=False,
580
+ )
581
+ # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
582
+ # Assume not exists if count fails
583
+ return False
477
584
 
478
585
  # --- Sync Methods Implementation ---
479
586
 
480
587
  def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
481
588
  """Retrieves documents, required for sync.
482
- NOTE: Haystack's filter_documents is the closest match.
483
- Fetches all docs if filters=None.
589
+ NOTE: Haystack's filter_documents is the closest match.
590
+ Fetches all docs if filters=None.
484
591
  """
485
- logger.debug(f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})...")
592
+ logger.debug(
593
+ f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})..."
594
+ )
486
595
  store = self._get_store()
487
596
  try:
488
597
  # Use filter_documents with no filters to get all
489
598
  # This might be inefficient for very large stores.
490
- haystack_docs = store.filter_documents(filters=kwargs.get('filters')) # Pass filters if provided via kwargs
599
+ haystack_docs = store.filter_documents(
600
+ filters=kwargs.get("filters")
601
+ ) # Pass filters if provided via kwargs
491
602
  logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
492
603
  # Convert to simple dicts
493
604
  results = []
494
605
  for doc in haystack_docs:
495
- doc_dict = {"id": doc.id} # ID is essential
496
- if include_metadata:
497
- # Ensure content_hash is included if it exists in meta
498
- doc_dict["meta"] = doc.meta
499
- # Optionally include content? Protocol doesn't require it.
500
- # doc_dict["content"] = doc.content
501
- results.append(doc_dict)
606
+ doc_dict = {"id": doc.id} # ID is essential
607
+ if include_metadata:
608
+ # Ensure content_hash is included if it exists in meta
609
+ doc_dict["meta"] = doc.meta
610
+ # Optionally include content? Protocol doesn't require it.
611
+ # doc_dict["content"] = doc.content
612
+ results.append(doc_dict)
502
613
  return results
503
614
  except Exception as e:
504
- logger.error(f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True)
505
- raise RuntimeError(f"Failed to list documents from store '{self.collection_name}'.") from e
506
-
615
+ logger.error(
616
+ f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True
617
+ )
618
+ raise RuntimeError(
619
+ f"Failed to list documents from store '{self.collection_name}'."
620
+ ) from e
507
621
 
508
622
  def delete_documents(self, ids: List[str]) -> None:
509
623
  """Deletes documents by ID, required for sync."""
510
624
  if not ids:
511
625
  logger.debug("No document IDs provided for deletion. Skipping.")
512
626
  return
513
- logger.warning(f"Request to delete {len(ids)} documents from collection '{self.collection_name}'.")
627
+ logger.warning(
628
+ f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
629
+ )
514
630
  store = self._get_store()
515
631
  try:
516
632
  store.delete_documents(ids=ids)
517
- logger.info(f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}")
633
+ logger.info(
634
+ f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}"
635
+ )
518
636
  except Exception as e:
519
- logger.error(f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}", exc_info=True)
520
- raise RuntimeError(f"Failed to delete documents from store '{self.collection_name}'.") from e
637
+ logger.error(
638
+ f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}",
639
+ exc_info=True,
640
+ )
641
+ raise RuntimeError(
642
+ f"Failed to delete documents from store '{self.collection_name}'."
643
+ ) from e