natural-pdf 0.1.11-py3-none-any.whl → 0.1.13-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/search/lancedb_search_service.py (new file)
@@ -0,0 +1,325 @@
+import logging
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+import lancedb
+import pyarrow as pa
+from sentence_transformers import SentenceTransformer
+
+from .search_options import BaseSearchOptions
+from .search_service_protocol import (
+    Indexable,
+    IndexConfigurationError,
+    SearchServiceProtocol,
+)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+DEFAULT_LANCEDB_PERSIST_PATH = "./natural_pdf_lancedb_index"
+
+
+class LanceDBSearchService(SearchServiceProtocol):
+    """LanceDB-based implementation of the search service protocol."""
+
+    collection_name: str
+
+    def __init__(
+        self,
+        collection_name: str,
+        persist: bool = False,
+        uri: Optional[str] = None,
+        embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+    ):
+        self.collection_name = collection_name
+        self._persist = persist
+        self._uri = uri
+        self._embedding_model_name = embedding_model_name
+        self._embedding_dims: Optional[int] = None
+        self._db = None
+        self._table = None
+
+        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        test_embedding = self.embedding_model.encode("test")
+        self._embedding_dims = len(test_embedding)
+
+        if self._persist:
+            self._uri = self._uri if self._uri else DEFAULT_LANCEDB_PERSIST_PATH
+            logger.info(f"Initializing Persistent LanceDB client at path: {self._uri}")
+            Path(self._uri).mkdir(parents=True, exist_ok=True)
+        else:
+            self._temp_dir_obj = tempfile.TemporaryDirectory()
+            self._uri = self._temp_dir_obj.name
+            logger.info(f"Initializing In-Memory LanceDB client using temp path: {self._uri}")
+
+        self._db = lancedb.connect(self._uri)
+        self._open_or_create_table()
+        logger.info(
+            f"LanceDBSearchService initialized. Table '{self.collection_name}' (persist={self._persist} at '{self._uri}'). Model: '{self._embedding_model_name}', Dims: {self._embedding_dims}"
+        )
+
+    def _get_schema(self) -> pa.Schema:
+        if self._embedding_dims is None:
+            raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
+
+        return pa.schema([
+            pa.field("id", pa.string(), nullable=False),
+            pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+            pa.field("text", pa.string()),
+            pa.field("metadata_json", pa.string())
+        ])
+
+    def _open_or_create_table(self):
+        if self._db is None:
+            raise RuntimeError("LanceDB connection not established.")
+
+        table_names = self._db.table_names()
+
+        if self.collection_name in table_names:
+            logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
+            self._table = self._db.open_table(self.collection_name)
+        else:
+            logger.debug(f"Creating new LanceDB table: {self.collection_name} with schema.")
+            schema = self._get_schema()
+            self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
+
+    def __del__(self):
+        if not self._persist and hasattr(self, '_temp_dir_obj') and logger:
+            logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
+            self._temp_dir_obj.cleanup()
+
+    def index(
+        self,
+        documents: Iterable[Indexable],
+        embedder_device: Optional[str] = None,
+        force_reindex: bool = False,
+    ) -> None:
+        indexable_list = list(documents)
+        logger.info(
+            f"Index request for table='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model_name}', force={force_reindex}"
+        )
+
+        if self._table is None or self._db is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        if not indexable_list:
+            logger.warning("No documents provided for indexing. Skipping.")
+            return
+
+        if force_reindex:
+            logger.warning(
+                f"Force reindex requested for table '{self.collection_name}'. Deleting existing table and recreating."
+            )
+            self._db.drop_table(self.collection_name)
+            self._open_or_create_table()
+            logger.info(f"Table '{self.collection_name}' deleted and recreated.")
+
+        data_to_add = []
+        texts_to_embed: List[str] = []
+        original_items_info: List[Dict[str, Any]] = []
+
+        import json
+
+        for item in indexable_list:
+            doc_id = item.get_id()
+            metadata = item.get_metadata().copy()
+            content_obj = item.get_content()
+            content_text = ""
+
+            if isinstance(content_obj, str):
+                content_text = content_obj
+            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                content_text = content_obj.extract_text()
+                if not isinstance(content_text, str): content_text = str(content_obj)
+            else:
+                content_text = str(content_obj)
+
+            try:
+                content_hash = item.get_content_hash()
+                if content_hash: metadata["content_hash"] = content_hash
+            except (AttributeError, NotImplementedError): pass
+            except Exception as e: logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+            # Ensure doc_id is not None - use a fallback if needed
+            if doc_id is None:
+                # Generate a unique ID based on content hash or position in the list
+                try:
+                    doc_id = f"auto_{item.get_content_hash() if hasattr(item, 'get_content_hash') else hash(content_text)}"
+                except:
+                    doc_id = f"auto_{len(texts_to_embed)}"
+
+            texts_to_embed.append(content_text)
+            original_items_info.append({
+                "id": doc_id,
+                "metadata_json": json.dumps(metadata),
+                "text": content_text
+            })
+
+        if not texts_to_embed:
+            logger.warning("No text content to embed. Skipping.")
+            return
+
+        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+        generated_embeddings = self.embedding_model.encode(
+            texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+        )
+
+        for i, item_info in enumerate(original_items_info):
+            data_to_add.append({
+                "id": item_info["id"],
+                "vector": generated_embeddings[i].tolist(),
+                "text": item_info["text"],
+                "metadata_json": item_info["metadata_json"]
+            })
+
+        if not data_to_add:
+            logger.warning("No data prepared for LanceDB. Skipping add.")
+            return
+
+        # Create a PyArrow table with the same schema as the LanceDB table
+        schema = self._get_schema()
+        arrays = [
+            pa.array([item["id"] for item in data_to_add], type=pa.string()),
+            pa.array([item["vector"] for item in data_to_add]),
+            pa.array([item["text"] for item in data_to_add], type=pa.string()),
+            pa.array([item["metadata_json"] for item in data_to_add], type=pa.string()),
+        ]
+        table = pa.Table.from_arrays(arrays, schema=schema)
+
+        logger.info(f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'.")
+        self._table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
+            table,
+        )
+        logger.info(f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}")
+
+    def search(
+        self,
+        query: Any,
+        options: BaseSearchOptions,
+    ) -> List[Dict[str, Any]]:
+        if self._table is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        logger.info(f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}")
+        query_text = ""
+        if isinstance(query, (str, Path)): query_text = str(query)
+        elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+            query_text = query.extract_text()
+            if not query_text or not query_text.strip(): return []
+        else:
+            raise TypeError(f"Unsupported query type: {type(query)}")
+
+        query_vector = self.embedding_model.encode(query_text).tolist()
+
+        lancedb_filter = None
+        if options.filters:
+            if isinstance(options.filters, str):
+                lancedb_filter = options.filters
+            elif isinstance(options.filters, dict):
+                filter_parts = []
+                for k, v in options.filters.items():
+                    if isinstance(v, str):
+                        filter_parts.append(f"{k} = '{v}'")
+                    else:
+                        filter_parts.append(f"{k} = {v}")
+                if filter_parts:
+                    lancedb_filter = " AND ".join(filter_parts)
+                logger.warning(f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions.")
+
+        search_query = self._table.search(query_vector).limit(options.top_k)
+        if lancedb_filter:
+            search_query = search_query.where(lancedb_filter)
+
+        results_df = search_query.to_df()
+        final_results: List[Dict[str, Any]] = []
+        import json
+
+        for _, row in results_df.iterrows():
+            metadata = {}
+            if "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                except json.JSONDecodeError:
+                    logger.warning(f"Failed to parse metadata_json for id {row.get('id')}")
+
+            score = 1 - row["_distance"] if "_distance" in row else 0.0
+
+            final_results.append({
+                "id": row.get("id"),
+                "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+                "score": score,
+                "page_number": metadata.get("page_number"),
+                "pdf_path": metadata.get("pdf_path"),
+                "metadata": metadata,
+            })
+        logger.info(f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'.")
+        return final_results
+
+    def delete_index(self) -> bool:
+        if self._db is None:
+            logger.warning("LanceDB connection not initialized. Cannot delete index.")
+            return False
+        logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
+
+        self._db.drop_table(self.collection_name)
+        self._table = None
+        logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
+        return True
+
+    def index_exists(self) -> bool:
+        if self._db is None:
+            return False
+        exists = self.collection_name in self._db.table_names()
+        if exists:
+            tbl = self._db.open_table(self.collection_name)
+            count = tbl.count_rows()
+            logger.debug(f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}")
+            return count > 0
+
+        logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
+        return False
+
+    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        logger.debug(f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})...")
+
+        select_columns = ["id"]
+        if include_metadata:
+            select_columns.append("metadata_json")
+
+        lancedb_filter = kwargs.get("filters")
+
+        query = self._table.to_lance().scanner(columns=select_columns, filter=lancedb_filter)
+        results_table = query.to_table()
+        results_list = results_table.to_pylist()
+
+        formatted_docs: List[Dict[str, Any]] = []
+        import json
+        for row in results_list:
+            doc_data: Dict[str, Any] = {"id": row.get("id")}
+            if include_metadata and "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                    doc_data["meta"] = metadata
+                except json.JSONDecodeError:
+                    doc_data["meta"] = {}
+            formatted_docs.append(doc_data)
+        logger.info(f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'.")
+        return formatted_docs

    def delete_documents(self, ids: List[str]) -> None:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        if not ids:
+            logger.debug("No document IDs provided for deletion. Skipping.")
+            return
+
+        id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
+        delete_condition = f"id IN ({id_filter_string})"
+        logger.warning(f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}")
+
+        self._table.delete(delete_condition)
+        logger.info(f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}")
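A minimal usage sketch of the new LanceDB backend (illustrative only, not part of the diff). It assumes BaseSearchOptions can be constructed with a top_k keyword and leaves filters unset, and it uses a hypothetical Snippet class as a stand-in for any object providing the get_id / get_metadata / get_content methods the Indexable protocol is used with above:

from natural_pdf.search.lancedb_search_service import LanceDBSearchService
from natural_pdf.search.search_options import BaseSearchOptions

class Snippet:
    # Hypothetical Indexable stand-in: just enough for LanceDBSearchService.index().
    def __init__(self, doc_id, text, **meta):
        self._id, self._text, self._meta = doc_id, text, meta

    def get_id(self):
        return self._id

    def get_metadata(self):
        return self._meta

    def get_content(self):
        return self._text

# persist=True keeps the index on disk; omitting it uses a temporary directory.
service = LanceDBSearchService("reports", persist=True, uri="./natural_pdf_lancedb_index")
service.index([Snippet("doc1-p1", "Total expenditures rose 4% in 2023.", page_number=1, pdf_path="report.pdf")])
hits = service.search("spending increase", BaseSearchOptions(top_k=5))
# Each hit is a dict with id, content_snippet, score, page_number, pdf_path, and metadata.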
natural_pdf/search/numpy_search_service.py (new file)
@@ -0,0 +1,255 @@
+import logging
+import numpy as np
+import json
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Union
+
+from sentence_transformers import SentenceTransformer
+
+from .search_options import BaseSearchOptions
+from .search_service_protocol import (
+    Indexable,
+    IndexConfigurationError,
+    SearchServiceProtocol,
+)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+class NumpySearchService(SearchServiceProtocol):
+    """Basic in-memory vector search implementation using NumPy."""
+
+    collection_name: str
+
+    def __init__(
+        self,
+        collection_name: str,
+        persist: bool = False,
+        uri: Optional[str] = None,
+        embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+    ):
+        if persist:
+            raise RuntimeError(
+                "Persistence requested but LanceDB is not installed. "
+                "For persistent vector search, install LanceDB: pip install lancedb"
+            )
+
+        self.collection_name = collection_name
+        self._embedding_model_name = embedding_model_name
+        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        self._embedding_dims = len(self.embedding_model.encode("test"))
+
+        # Simple in-memory storage
+        self._vectors = []
+        self._documents = []
+        self._metadata = []
+        self._ids = []
+
+        logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+    def index(
+        self,
+        documents: Iterable[Indexable],
+        embedder_device: Optional[str] = None,
+        force_reindex: bool = False,
+    ) -> None:
+        if force_reindex:
+            logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+            self._vectors = []
+            self._documents = []
+            self._metadata = []
+            self._ids = []
+
+        items = list(documents)
+        logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
+
+        if not items:
+            logger.warning("No documents provided for indexing. Skipping.")
+            return
+
+        texts_to_embed = []
+        items_info = []
+
+        for item in items:
+            doc_id = item.get_id()
+            metadata = item.get_metadata().copy()
+            content_obj = item.get_content()
+            content_text = ""
+
+            if isinstance(content_obj, str):
+                content_text = content_obj
+            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                content_text = content_obj.extract_text()
+                if not isinstance(content_text, str):
+                    content_text = str(content_obj)
+            else:
+                content_text = str(content_obj)
+
+            # Try to add content hash to metadata
+            try:
+                content_hash = item.get_content_hash()
+                if content_hash:
+                    metadata["content_hash"] = content_hash
+            except (AttributeError, NotImplementedError):
+                pass
+            except Exception as e:
+                logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+            texts_to_embed.append(content_text)
+            items_info.append({
+                "id": doc_id,
+                "metadata": metadata,
+                "text": content_text
+            })
+
+        if not texts_to_embed:
+            logger.warning("No text content to embed. Skipping.")
+            return
+
+        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+        generated_embeddings = self.embedding_model.encode(
+            texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+        )
+
+        for i, item_info in enumerate(items_info):
+            self._vectors.append(generated_embeddings[i])
+            self._documents.append(item_info["text"])
+            self._metadata.append(item_info["metadata"])
+            self._ids.append(item_info["id"])
+
+        logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+    def search(
+        self,
+        query: Any,
+        options: BaseSearchOptions,
+    ) -> List[Dict[str, Any]]:
+        if not self._vectors:
+            logger.debug("No vectors in index. Returning empty results.")
+            return []
+
+        # Process query to text
+        query_text = ""
+        if isinstance(query, (str, Path)):
+            query_text = str(query)
+        elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+            query_text = query.extract_text()
+            if not query_text or not query_text.strip():
+                return []
+        else:
+            raise TypeError(f"Unsupported query type: {type(query)}")
+
+        logger.info(f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
+
+        # Encode query and perform similarity search
+        query_vector = self.embedding_model.encode(query_text)
+
+        # Convert list to numpy array for batch operations
+        vectors_array = np.array(self._vectors)
+
+        # Normalize vectors for cosine similarity
+        query_norm = np.linalg.norm(query_vector)
+        if query_norm > 0:
+            query_vector = query_vector / query_norm
+
+        # Normalize all vectors (avoid division by zero)
+        vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
+        valid_indices = vector_norms.flatten() > 0
+        vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
+
+        # Calculate cosine similarities
+        similarities = np.dot(vectors_array, query_vector)
+
+        # Apply filters if present
+        filtered_indices = np.arange(len(similarities))
+        if options.filters:
+            # Simple filtering for metadata fields
+            # This is a basic implementation and doesn't support complex filters like LanceDB
+            if isinstance(options.filters, dict):
+                for field, value in options.filters.items():
+                    new_filtered = []
+                    for i in filtered_indices:
+                        metadata = self._metadata[i]
+                        if field in metadata and metadata[field] == value:
+                            new_filtered.append(i)
+                    filtered_indices = np.array(new_filtered)
+            else:
+                logger.warning(f"Complex filter expressions not supported in NumPy backend: {options.filters}")
+
+        # Apply filtering and sort by similarity
+        if len(filtered_indices) > 0:
+            filtered_similarities = similarities[filtered_indices]
+            top_k = min(options.top_k, len(filtered_similarities))
+            if top_k == 0:
+                return []
+
+            top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
+            top_indices = filtered_indices[top_indices_within_filtered]
+        else:
+            top_k = min(options.top_k, len(similarities))
+            if top_k == 0:
+                return []
+
+            top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+        # Format results
+        results = []
+        for idx in top_indices:
+            metadata = self._metadata[idx]
+            results.append({
+                "id": self._ids[idx],
+                "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                "score": float(similarities[idx]),
+                "page_number": metadata.get("page_number"),
+                "pdf_path": metadata.get("pdf_path"),
+                "metadata": metadata,
+            })
+
+        logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+        return results
+
+    def index_exists(self) -> bool:
+        return len(self._vectors) > 0
+
+    def delete_index(self) -> bool:
+        logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
+        self._vectors = []
+        self._documents = []
+        self._metadata = []
+        self._ids = []
+        return True
+
+    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+        logger.debug(f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
+
+        results = []
+        for i, doc_id in enumerate(self._ids):
+            doc_info = {"id": doc_id}
+            if include_metadata:
+                doc_info["meta"] = self._metadata[i]
+            results.append(doc_info)
+
+        logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+        return results
+
+    def delete_documents(self, ids: List[str]) -> None:
+        if not ids:
+            logger.debug("No document IDs provided for deletion. Skipping.")
+            return
+
+        logger.warning(f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
+
+        # Find indices to remove
+        keep_indices = []
+        for i, doc_id in enumerate(self._ids):
+            if doc_id not in ids:
+                keep_indices.append(i)
+
+        # Create new filtered lists
+        self._ids = [self._ids[i] for i in keep_indices]
+        self._vectors = [self._vectors[i] for i in keep_indices]
+        self._documents = [self._documents[i] for i in keep_indices]
+        self._metadata = [self._metadata[i] for i in keep_indices]
+
+        logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
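The NumPy backend ranks documents by cosine similarity: the stored embeddings and the query vector are L2-normalized, so their dot product is the cosine of the angle between them, and the returned score falls in [-1, 1] with values near 1 meaning most similar. A self-contained sketch of that scoring step (illustrative only, using made-up two-dimensional vectors rather than real sentence embeddings), mirroring the normalization, dot product, and argsort in NumpySearchService.search():

import numpy as np

stored = np.array([[0.2, 0.9], [0.9, 0.1]])   # two stored embeddings
query = np.array([0.25, 0.8])                 # query embedding

# Normalize, then take dot products to get cosine similarities.
stored = stored / np.linalg.norm(stored, axis=1, keepdims=True)
query = query / np.linalg.norm(query)
scores = stored @ query                       # approx. [0.996, 0.402]
ranking = np.argsort(scores)[::-1]            # best match first -> [0, 1]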