rakam_systems_vectorstore-0.1.1rc7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. rakam_systems_vectorstore/MANIFEST.in +26 -0
  2. rakam_systems_vectorstore/README.md +1071 -0
  3. rakam_systems_vectorstore/__init__.py +93 -0
  4. rakam_systems_vectorstore/components/__init__.py +0 -0
  5. rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  6. rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  7. rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  8. rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  9. rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  10. rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  11. rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  12. rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  13. rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  14. rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  15. rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  16. rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  17. rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  18. rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  19. rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  20. rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  21. rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  22. rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  23. rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  24. rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  25. rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  26. rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  27. rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  28. rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  29. rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  30. rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  31. rakam_systems_vectorstore/config.py +266 -0
  32. rakam_systems_vectorstore/core.py +8 -0
  33. rakam_systems_vectorstore/pyproject.toml +113 -0
  34. rakam_systems_vectorstore/server/README.md +290 -0
  35. rakam_systems_vectorstore/server/__init__.py +20 -0
  36. rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  37. rakam_systems_vectorstore/setup.py +103 -0
  38. rakam_systems_vectorstore-0.1.1rc7.dist-info/METADATA +370 -0
  39. rakam_systems_vectorstore-0.1.1rc7.dist-info/RECORD +40 -0
  40. rakam_systems_vectorstore-0.1.1rc7.dist-info/WHEEL +4 -0
rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py
@@ -0,0 +1,878 @@
+ from __future__ import annotations
+
+ import os
+ import pickle
+ import time
+ from typing import Any
+ from typing import Dict
+ from typing import List
+ from typing import Optional
+
+ import dotenv
+ import faiss
+ import numpy as np
+ from openai import OpenAI
+ from sentence_transformers import SentenceTransformer
+
+ from rakam_systems_core.ai_utils import logging
+ from rakam_systems_core.ai_core.interfaces.vectorstore import VectorStore
+ from rakam_systems_vectorstore.core import Node
+ from rakam_systems_vectorstore.core import VSFile
+
+ dotenv.load_dotenv()
+ api_key = os.getenv("OPENAI_API_KEY")
+
+
+ class FaissStore(VectorStore):
+     """
+     A class for managing collection-based vector stores using FAISS and SentenceTransformers.
+     """
+
+     def __init__(
+         self,
+         name: str = "faiss_store",
+         config=None,
+         base_index_path: str = None,
+         embedding_model: str = "Snowflake/snowflake-arctic-embed-m",
+         initialising: bool = False,
+         use_embedding_api: bool = False,
+         api_model: str = "text-embedding-3-small",
+     ) -> None:
+         """
+         Initializes the FaissStore with the specified base index path and embedding model.
+
+         :param name: Name of the vector store component.
+         :param config: Configuration object.
+         :param base_index_path: Base path to store the FAISS indexes.
+         :param embedding_model: Pre-trained SentenceTransformer model name.
+         :param initialising: Whether to skip loading existing stores.
+         :param use_embedding_api: Whether to use OpenAI API for embeddings.
+         :param api_model: OpenAI model to use for embeddings.
+         """
+         super().__init__(name, config)
+
+         self.base_index_path = base_index_path or "faiss_indexes"
+         if not os.path.exists(self.base_index_path):
+             os.makedirs(self.base_index_path)
+
+         self.use_embedding_api = use_embedding_api
+
+         if self.use_embedding_api:
+             self.client = OpenAI(api_key=api_key)
+             self.api_model = api_model
+         else:
+             self.embedding_model = SentenceTransformer(
+                 embedding_model, trust_remote_code=True
+             )
+
+         self.collections = {}
+
+         if not initialising:
+             self.load_vector_store()
+
+     def add(self, vectors: List[List[float]], metadatas: List[Dict[str, Any]]) -> Any:
+         """
+         Add vectors with metadata to the default collection.
+
+         :param vectors: List of embedding vectors.
+         :param metadatas: List of metadata dictionaries.
+         :return: List of assigned IDs.
+         """
+         collection_name = "default"
+         if collection_name not in self.collections:
+             # Create empty collection
+             self.collections[collection_name] = {
+                 "index": None,
+                 "nodes": [],
+                 "category_index_mapping": {},
+                 "metadata_index_mapping": {},
+                 "embeddings": {},
+             }
+
+         # Convert vectors to numpy array
+         data_embeddings = np.array(vectors, dtype='float32')
+
+         # Get existing IDs or start from 0
+         existing_ids = set(
+             self.collections[collection_name]["category_index_mapping"].keys())
+         max_existing_id = max(existing_ids) if existing_ids else -1
+
+         new_ids = []
+         next_id = max_existing_id + 1
+         for _ in range(len(vectors)):
+             while next_id in existing_ids:
+                 next_id += 1
+             new_ids.append(next_id)
+             next_id += 1
+
+         # Create or update index
+         if self.collections[collection_name]["index"] is None:
+             index = faiss.IndexIDMap(
+                 faiss.IndexFlatIP(data_embeddings.shape[1]))
+             self.collections[collection_name]["index"] = index
+
+         # Normalize and add vectors
+         faiss.normalize_L2(data_embeddings)
+         self.collections[collection_name]["index"].add_with_ids(
+             data_embeddings, np.array(new_ids)
+         )
+
+         # Update mappings
+         for idx, (vec, meta) in enumerate(zip(vectors, metadatas)):
+             node_id = new_ids[idx]
+             self.collections[collection_name]["embeddings"][node_id] = vec
+             self.collections[collection_name]["metadata_index_mapping"][node_id] = meta
+             self.collections[collection_name]["category_index_mapping"][node_id] = meta.get(
+                 "content", "")
+
+         return new_ids
+
+     def query(self, vector: List[float], top_k: int = 5) -> List[Dict[str, Any]]:
+         """
+         Query the default collection for the closest vectors.
+
+         :param vector: Query vector.
+         :param top_k: Number of results to return.
+         :return: List of results with metadata and scores.
+         """
+         collection_name = "default"
+         if collection_name not in self.collections:
+             return []
+
+         return self._query_collection(collection_name, vector, top_k)
+
+     def count(self) -> Optional[int]:
+         """
+         Count total vectors across all collections.
+
+         :return: Total number of vectors.
+         """
+         total = 0
+         for collection in self.collections.values():
+             if collection["index"] is not None:
+                 total += collection["index"].ntotal
+         return total
+
+     def load_vector_store(self) -> None:
+         """
+         Loads all collections from the base directory.
+         """
+         for collection_name in os.listdir(self.base_index_path):
+             store_path = os.path.join(self.base_index_path, collection_name)
+             if os.path.isdir(store_path):
+                 self.collections[collection_name] = self.load_collection(
+                     store_path)
+
+     def load_collection(self, store_path: str) -> Dict[str, Any]:
+         """
+         Loads a single vector store from the specified directory.
+
+         :param store_path: Path to the store directory.
+         :return: Dictionary containing the store's index, nodes, metadata, and embeddings.
+         """
+         store = {}
+         store["index"] = faiss.read_index(os.path.join(store_path, "index"))
+         with open(os.path.join(store_path, "category_index_mapping.pkl"), "rb") as f:
+             store["category_index_mapping"] = pickle.load(f)
+         with open(os.path.join(store_path, "metadata_index_mapping.pkl"), "rb") as f:
+             store["metadata_index_mapping"] = pickle.load(f)
+         with open(os.path.join(store_path, "nodes.pkl"), "rb") as f:
+             store["nodes"] = pickle.load(f)
+         with open(os.path.join(store_path, "embeddings_index_mapping.pkl"), "rb") as f:
+             store["embeddings"] = pickle.load(f)
+
+         logging.info(f"Store loaded successfully from {store_path}.")
+         return store
+
+     def predict_embeddings(self, query: str) -> np.ndarray:
+         """
+         Predicts embeddings for a given query using the embedding model.
+
+         :param query: Query string to encode.
+         :return: Embedding vector for the query.
+         """
+         logging.info(f"Predicting embeddings for query: {query}")
+
+         if self.use_embedding_api:
+             query_embedding = (
+                 self.client.embeddings.create(
+                     input=[query], model=self.api_model)
+                 .data[0]
+                 .embedding
+             )
+             query_embedding = np.asarray([query_embedding], dtype="float32")
+         else:
+             query_embedding = self.embedding_model.encode(query)
+             query_embedding = np.asarray([query_embedding], dtype="float32")
+
+         return query_embedding
+
+     def get_index_copy(self, store: Dict[str, Any]) -> faiss.IndexIDMap:
+         """
+         Creates a copy of the index from the store and returns it.
+         """
+         assert len(store["embeddings"]) == len(
+             store["category_index_mapping"]
+         ), "Mismatch between embeddings and category index mapping."
+
+         category_index_mapping = store["category_index_mapping"]
+         # Cast to float32: faiss.normalize_L2 and IndexFlatIP only accept float32 arrays
+         data_embeddings = np.array(
+             list(store["embeddings"].values()), dtype="float32")
+         index_copy = faiss.IndexIDMap(
+             faiss.IndexFlatIP(data_embeddings.shape[1]))
+         faiss.normalize_L2(data_embeddings)
+         index_copy.add_with_ids(
+             data_embeddings, np.array(list(category_index_mapping.keys()))
+         )
+
+         return index_copy
+
+     def search(
+         self,
+         collection_name: str,
+         query: str,
+         distance_type="cosine",
+         number=5,
+         meta_data_filters: List = None,
+     ) -> tuple:
+         """
+         Searches the specified collection for the closest embeddings to the query.
+
+         :param collection_name: Name of the collection to search.
+         :param query: Query string to search for.
+         :param distance_type: Type of distance metric to use (default is cosine).
+         :param number: Number of results to return (default is 5).
+         :param meta_data_filters: List of Node IDs to filter the search results.
+         :return: Tuple of (suggestions keyed by node ID, matching Node objects).
+         """
+         logging.info(
+             f"Searching in collection: {collection_name} for query: '{query}'")
+
+         # Step 1: Retrieve the collection
+         store = self.collections.get(collection_name)
+         if not store:
+             raise ValueError(f"No store found with name: {collection_name}")
+
+         index_copy = self.get_index_copy(store)
+
+         # Step 2: Apply metadata filters if provided
+         if meta_data_filters:
+             logging.info(f"Applying metadata filters: {meta_data_filters}")
+
+             all_ids = store["category_index_mapping"].keys()
+             logging.info(f"Total IDs in the index: {all_ids}")
+
+             ids_to_remove = list(all_ids - set(meta_data_filters))
+             logging.info(f"IDs to remove: {ids_to_remove}")
+
+             # filtered_index = faiss.clone_index(store["index"])
+             filtered_index = index_copy
+             logging.info(f"Original index size: {filtered_index.ntotal}")
+
+             filtered_index.remove_ids(np.array(ids_to_remove))
+             logging.info(f"Filtered index size: {filtered_index.ntotal}")
+         else:
+             # No filters provided; use the original index
+             logging.info(
+                 "No metadata filters provided. Using the entire index for search."
+             )
+             filtered_index = index_copy
+
+         # Step 3: Generate the query embedding
+         query_embedding = self.predict_embeddings(query)
+         logging.info(f"Query embedding shape: {query_embedding.shape}")
+         if distance_type == "cosine":
+             faiss.normalize_L2(query_embedding)
+
+         # Step 4: Perform the search
+         logging.info("Performing search on the index...")
+         D, I = filtered_index.search(query_embedding, number)
+         logging.debug(f"Search distances: {D}")
+         logging.debug(f"Search indices: {I}")
+
+         if I.shape[1] == 0 or np.all(I == -1):
+             logging.error("Search returned no results.")
+             return {}, []
+
+         # Step 5: Prepare search results
+         suggested_nodes = []
+         seen_texts = set()
+         valid_suggestions = {}
+         count = 0
+
+         for i, id_ in enumerate(I[0]):
+             if count >= number:
+                 break
+             if id_ != -1 and id_ in store["category_index_mapping"]:
+                 suggestion_text = store["category_index_mapping"][id_]
+                 node_metadata = store["metadata_index_mapping"][id_]
+                 for node in store["nodes"]:
+                     if node.metadata.node_id == id_:
+                         suggested_nodes.append(node)
+                 if suggestion_text not in seen_texts:
+                     seen_texts.add(suggestion_text)
+                     valid_suggestions[str(id_)] = (
+                         node_metadata,
+                         suggestion_text,
+                         float(D[0][i]),
+                     )
+                     count += 1
+
+         logging.info(f"Final search results: {valid_suggestions}")
+
+         return valid_suggestions, suggested_nodes
+
+     def _query_collection(
+         self, collection_name: str, vector: List[float], top_k: int
+     ) -> List[Dict[str, Any]]:
+         """
+         Internal method to query a specific collection.
+
+         :param collection_name: Name of the collection.
+         :param vector: Query vector.
+         :param top_k: Number of results to return.
+         :return: List of results.
+         """
+         store = self.collections.get(collection_name)
+         if not store or store["index"] is None:
+             return []
+
+         query_vector = np.array([vector], dtype='float32')
+         faiss.normalize_L2(query_vector)
+
+         D, I = store["index"].search(query_vector, top_k)
+
+         results = []
+         for i, idx in enumerate(I[0]):
+             if idx != -1 and idx in store["metadata_index_mapping"]:
+                 result = store["metadata_index_mapping"][idx].copy()
+                 result["score"] = float(D[0][i])
+                 result["id"] = int(idx)
+                 results.append(result)
+
+         return results
+
+     def get_embeddings(
+         self, sentences: List[str], parallel: bool = True, batch_size: int = 8
+     ) -> np.ndarray:
+         """
+         Generates embeddings for a list of sentences.
+
+         :param sentences: List of sentences to encode.
+         :param parallel: Whether to use parallel processing (default is True).
+         :param batch_size: Batch size used when encoding the sentences.
+         :return: Embedding vectors for the sentences.
+         """
+         logging.info(f"Generating embeddings for {len(sentences)} sentences.")
+         print(f"Generating embeddings for {len(sentences)} sentences.")
+         print("Generating embeddings...")
+         start = time.time()
+
+         if self.use_embedding_api:
+             embeddings = []
+             for sentence in sentences:
+                 embedding = self.predict_embeddings(sentence)
+                 embedding = np.squeeze(embedding)
+                 embeddings.append(embedding)
+             embeddings = np.array(embeddings)
+             logging.info(
+                 f"Time taken to encode {len(sentences)} items: {round(time.time() - start, 2)} seconds"
+             )
+             return embeddings
+         else:
+             if parallel:
+                 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+                 pool = self.embedding_model.start_multi_process_pool(
+                     target_devices=["cpu"] * 5
+                 )
+                 embeddings = self.embedding_model.encode_multi_process(
+                     sentences, pool, batch_size=batch_size
+                 )
+                 self.embedding_model.stop_multi_process_pool(pool)
+             else:
+                 os.environ["TOKENIZERS_PARALLELISM"] = "true"
+                 embeddings = self.embedding_model.encode(
+                     sentences,
+                     batch_size=batch_size,
+                     show_progress_bar=True,
+                     convert_to_tensor=True,
+                 )
+             logging.info(
+                 f"Time taken to encode {len(sentences)} items: {round(time.time() - start, 2)} seconds"
+             )
+             return embeddings.cpu().detach().numpy()
+
+     def create_collection_from_files(
+         self, collection_name: str, files: List[VSFile]
+     ) -> None:
+         """
+         Creates a FAISS index for a single collection from a list of VSFile objects.
+
+         :param collection_name: Name of the collection (store) to create.
+         :param files: List of VSFile objects whose nodes are indexed.
+         """
+         logging.info(f"Creating FAISS index for store: {collection_name}")
+         text_chunks = []
+         metadata = []
+         nodes = []
+
+         for file in files:
+             for node in file.nodes:
+                 nodes.append(node)
+                 text_chunks.append(node.content)
+                 formatted_metadata = {
+                     "node_id": node.metadata.node_id,
+                     "source_file_uuid": node.metadata.source_file_uuid,
+                     "position": node.metadata.position,
+                     "custom": node.metadata.custom,
+                 }
+                 metadata.append(formatted_metadata)
+
+         self._create_and_save_index(
+             collection_name, nodes, text_chunks, metadata)
+
+     def create_collections_from_files(
+         self, collection_files: Dict[str, List[VSFile]]
+     ) -> None:
+         """
+         Creates FAISS indexes from dictionaries of store names and VSFile objects.
+
+         :param collection_files: Dictionary where keys are store names and values are lists of VSFile objects.
+         """
+         for collection_name, files in collection_files.items():
+             self.create_collection_from_files(collection_name, files)
+
+     def create_collection_from_nodes(
+         self, collection_name: str, nodes: List[Any]
+     ) -> None:
+         """
+         Creates a FAISS index from a list of nodes and stores it under the given collection name.
+
+         :param collection_name: Name of the store to create.
+         :param nodes: List of nodes containing the content and metadata.
+         """
+         logging.info(f"Creating FAISS index for store: {collection_name}")
+         text_chunks = []
+         metadata = []
+
+         for node in nodes:
+             text_chunks.append(node.content)
+             formatted_metadata = {
+                 "node_id": node.metadata.node_id,
+                 "source_file_uuid": node.metadata.source_file_uuid,
+                 "position": node.metadata.position,
+                 "custom": node.metadata.custom,
+             }
+             metadata.append(formatted_metadata)
+
+         self._create_and_save_index(
+             collection_name, nodes, text_chunks, metadata)
+
+     def _create_and_save_index(
+         self,
+         collection_name: str,
+         nodes: List[Any],
+         text_chunks: List[str],
+         metadata: List[Dict[str, Any]],
+     ) -> None:
+         """
+         Helper function to create and save a FAISS index and embeddings.
+
+         :param collection_name: Name of the store to create.
+         :param nodes: List of nodes.
+         :param text_chunks: List of text chunks to encode and index.
+         :param metadata: List of metadata associated with the text chunks.
+         """
+         # Check if the list of nodes or text_chunks is empty
+         if not nodes or not text_chunks:
+             logging.warning(
+                 f"Cannot create FAISS index for store '{collection_name}' because nodes or text_chunks are empty."
+             )
+             self.collections[collection_name] = {
+                 "index": None,
+                 "nodes": [],
+                 "category_index_mapping": None,
+                 "metadata_index_mapping": None,
+                 "embeddings": None,  # No embeddings
+             }
+             return
+
+         assert (
+             len(nodes) == len(text_chunks) == len(metadata)
+         ), "Length of nodes, text_chunks, and metadata should be equal."
+
+         store_path = os.path.join(self.base_index_path, collection_name)
+         if not os.path.exists(store_path):
+             os.makedirs(store_path)
+
+         # Get embeddings for the text chunks
+         data_embeddings = self.get_embeddings(
+             sentences=text_chunks, parallel=False)
+         category_index_mapping = dict(
+             zip(range(len(text_chunks)), text_chunks))
+
+         # Update the node_id in the metadata for metadata_index_mapping
+         for i, meta in enumerate(metadata):
+             meta["node_id"] = i
+         logging.info(
+             f"Assigned node IDs to metadata successfully. For example: {metadata[0]['node_id']}"
+         )
+
+         # Update the node_id in the metadata in the nodes
+         for i, node in enumerate(nodes):
+             node.metadata.node_id = i
+
+         # Save category index mapping to file
+         with open(os.path.join(store_path, "category_index_mapping.pkl"), "wb") as f:
+             pickle.dump(category_index_mapping, f)
+
+         # Save nodes to file
+         with open(os.path.join(store_path, "nodes.pkl"), "wb") as f:
+             pickle.dump(nodes, f)
+
+         # Save embeddings to file
+         embeddings_index_mapping = dict(
+             zip(range(len(data_embeddings)), data_embeddings)
+         )
+         with open(os.path.join(store_path, "embeddings_index_mapping.pkl"), "wb") as f:
+             pickle.dump(embeddings_index_mapping, f)
+
+         # Create FAISS index and add embeddings
+         index = faiss.IndexIDMap(faiss.IndexFlatIP(data_embeddings.shape[1]))
+         faiss.normalize_L2(data_embeddings)
+         index.add_with_ids(
+             data_embeddings, np.array(list(category_index_mapping.keys()))
+         )
+         faiss.write_index(index, os.path.join(store_path, "index"))
+
+         # Save metadata index mapping to file
+         metadata_index_mapping = dict(zip(range(len(text_chunks)), metadata))
+         with open(os.path.join(store_path, "metadata_index_mapping.pkl"), "wb") as f:
+             pickle.dump(metadata_index_mapping, f)
+
+         # Update the collections dictionary
+         self.collections[collection_name] = {
+             "index": index,
+             "nodes": nodes,
+             "category_index_mapping": category_index_mapping,
+             "metadata_index_mapping": metadata_index_mapping,
+             "embeddings": embeddings_index_mapping,  # Store the embeddings mapping
+         }
+         print(
+             (
+                 f"FAISS index and embeddings for store {collection_name} created and saved successfully."
+             )
+         )
+         logging.info(
+             f"FAISS index and embeddings for store {collection_name} created and saved successfully."
+         )
+
+     def add_nodes(self, collection_name: str, nodes: List[Node]) -> None:
+         """
+         Adds nodes to an existing store and updates the index.
+
+         :param collection_name: Name of the store to update.
+         :param nodes: List of nodes to be added.
+         """
+         logging.info(f"Adding nodes to store: {collection_name}")
+
+         if not nodes:
+             logging.warning("No nodes to add.")
+             return
+
+         store = self.collections.get(collection_name)
+         if not store:
+             raise ValueError(f"No store found with name: {collection_name}")
+
+         assert (
+             len(store["category_index_mapping"])
+             == len(store["metadata_index_mapping"])
+             == len(store["embeddings"])
+             == len(store["nodes"])
+         ), "Mismatch between mappings and embeddings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+         assert all(
+             node.metadata.node_id not in {
+                 n.metadata.node_id for n in store["nodes"]}
+             for node in nodes
+         ), "Duplicate node IDs detected in the new nodes."
+
+         # Get the existing text chunks from the given nodes
+         new_text_chunks = [node.content for node in nodes]
+
+         # Get embeddings for the new text chunks
+         new_embeddings = self.get_embeddings(
+             sentences=new_text_chunks, parallel=False)
+
+         existing_ids = set(store["category_index_mapping"].keys())
+         max_existing_id = max(existing_ids) if existing_ids else -1
+         new_ids = []
+         next_id = max_existing_id + 1
+         for _ in range(len(new_text_chunks)):
+             while next_id in existing_ids:
+                 next_id += 1
+             new_ids.append(next_id)
+             next_id += 1
+
+         logging.info(f"New IDs: {new_ids}")
+         logging.info(f"Existing IDs: {existing_ids}")
+         logging.info(f"New text chunks count: {len(new_text_chunks)}")
+
+         # # Get the new Mapping Indices for the new nodes
+         # new_ids = list(range(
+         #     len(store["category_index_mapping"]),
+         #     len(store["category_index_mapping"]) + len(new_text_chunks),
+         # ))
+
+         # Check if the length of new embeddings and new Indices are equal
+         assert len(new_embeddings) == len(
+             new_ids
+         ), "Mismatch between new embeddings and IDs."
+
+         # Add the new embeddings to the existing index
+         # (cast to float32 and L2-normalize so the new vectors match the
+         # normalized vectors already stored in the inner-product index)
+         new_embeddings = np.asarray(new_embeddings, dtype="float32")
+         faiss.normalize_L2(new_embeddings)
+         store["index"].add_with_ids(new_embeddings, np.array(list(new_ids)))
+
+         # Store new embeddings persistently
+         for idx, embedding in zip(new_ids, new_embeddings):
+             store["embeddings"][idx] = embedding
+
+         # Update the node_ids in metadata for the new nodes
+         for idx, node in enumerate(nodes):
+             node.metadata.node_id = new_ids[idx]
+         store["nodes"].extend(nodes)
+
+         # Update the node_id in metadata index mapping from the new nodes
+         new_metadata = [
+             {
+                 "node_id": node.metadata.node_id,
+                 "source_file_uuid": node.metadata.source_file_uuid,
+                 "position": node.metadata.position,
+                 "custom": node.metadata.custom,
+             }
+             for node in nodes
+         ]
+
+         # Update the mappings
+         store["category_index_mapping"].update(
+             dict(zip(new_ids, new_text_chunks)))
+         store["metadata_index_mapping"].update(
+             dict(zip(new_ids, new_metadata)))
+
+         assert (
+             len(store["category_index_mapping"])
+             == len(store["metadata_index_mapping"])
+             == len(store["embeddings"])
+             == len(store["nodes"])
+         ), "Mismatch between mappings and embeddings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+
+         # Save updated store
+         self.collections[collection_name] = store
+         self._save_collection(collection_name)
+
+         self.load_collection(os.path.join(
+             self.base_index_path, collection_name))
+         saved_store = self.collections[collection_name]
+         assert (
+             len(saved_store["category_index_mapping"])
+             == len(saved_store["metadata_index_mapping"])
+             == len(saved_store["embeddings"])
+             == len(saved_store["nodes"])
+         ), "Mismatch in saved store mappings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+
+     def delete_nodes(self, collection_name: str, node_ids: List[int]) -> None:
+         """
+         Deletes nodes from an existing store and updates the index using remove_ids method.
+
+         :param collection_name: Name of the store to update.
+         :param node_ids: List of node IDs to be deleted.
+         """
+         logging.info(
+             f"Deleting nodes {node_ids} from store: {collection_name}")
+
+         store = self.collections.get(collection_name)
+         if not store:
+             raise ValueError(f"No store found with name: {collection_name}")
+
+         assert (
+             len(store["category_index_mapping"])
+             == len(store["metadata_index_mapping"])
+             == len(store["embeddings"])
+             == len(store["nodes"])
+         ), "Mismatch between mappings and embeddings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+
+         existed_ids = set(store["category_index_mapping"].keys())
+         logging.info(f"Existed IDs before deletion: {existed_ids}")
+
+         missing_ids = []
+         ids_to_delete = []
+
+         # Find the valid ids to delete
+         for node_id in node_ids:
+             if node_id not in existed_ids:
+                 missing_ids.append(node_id)
+             else:
+                 ids_to_delete.append(node_id)
+
+         if not ids_to_delete:
+             logging.warning(
+                 f"No valid IDs to delete for store: {collection_name}")
+             return
+
+         # Remove the IDs from the FAISS index using remove_ids method
+         faiss_index = store["index"]
+
+         logging.info(f"FAISS Index Size Before Deletion: {faiss_index.ntotal}")
+
+         faiss_index.remove_ids(np.array(ids_to_delete))
+
+         logging.info(f"FAISS Index Size After Deletion: {faiss_index.ntotal}")
+
+         # Remove the nodes and mappings from the store
+         store["category_index_mapping"] = {
+             i: chunk
+             for i, chunk in store["category_index_mapping"].items()
+             if i not in ids_to_delete
+         }
+         store["metadata_index_mapping"] = {
+             i: metadata
+             for i, metadata in store["metadata_index_mapping"].items()
+             if i not in ids_to_delete
+         }
+
+         # Filter the nodes based on the ID, not based on list index
+         store["nodes"] = [
+             node
+             for node in store["nodes"]
+             if node.metadata.node_id not in ids_to_delete
+         ]
+
+         # Filter embeddings to remove those corresponding to deleted IDs
+         store["embeddings"] = {
+             i: emb for i, emb in store["embeddings"].items() if i not in ids_to_delete
+         }
+
+         assert (
+             len(store["category_index_mapping"])
+             == len(store["metadata_index_mapping"])
+             == len(store["embeddings"])
+             == len(store["nodes"])
+         ), "Mismatch between mappings and embeddings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+
+         # Save the updated store
+         self.collections[collection_name] = store
+         self._save_collection(collection_name)
+
+         self.load_collection(os.path.join(
+             self.base_index_path, collection_name))
+         saved_store = self.collections[collection_name]
+         assert (
+             len(saved_store["category_index_mapping"])
+             == len(saved_store["metadata_index_mapping"])
+             == len(saved_store["embeddings"])
+             == len(saved_store["nodes"])
+         ), "Mismatch in saved store mappings."
+         assert (
+             store["category_index_mapping"].keys()
+             == store["metadata_index_mapping"].keys()
+             == store["embeddings"].keys()
+             == {node.metadata.node_id for node in store["nodes"]}
+         ), "Mismatch between mappings and embeddings."
+
+         logging.info(
+             f"Nodes {ids_to_delete} deleted and index updated for store: {collection_name} successfully."
+         )
+         if missing_ids:
+             logging.warning(
+                 f"Node ID(s) {missing_ids} do not exist in the collection {collection_name}."
+             )
+         logging.info(
+             f"Remaining Node ID(s): {store['category_index_mapping'].keys()}")
+
+     def add_files(self, collection_name: str, files: List[VSFile]) -> None:
+         """
+         Adds file nodes to the specified store by extracting nodes from the files and adding them to the index.
+
+         :param collection_name: Name of the store to update.
+         :param files: List of VSFile objects whose nodes are to be added.
+         """
+         logging.info(f"Adding files to store: {collection_name}")
+         all_nodes = []
+
+         for file in files:
+             all_nodes.extend(file.nodes)
+
+         self.add_nodes(collection_name, all_nodes)
+
+     def delete_files(self, collection_name: str, files: List[VSFile]) -> None:
+         """
+         Deletes file nodes from the specified store by removing nodes corresponding to the given files.
+
+         :param collection_name: Name of the store to update.
+         :param files: List of VSFile objects whose nodes are to be deleted.
+         """
+         logging.info(f"Deleting files from store: {collection_name}")
+         node_ids_to_delete = []
+
+         for file in files:
+             for node in file.nodes:
+                 node_ids_to_delete.append(node.metadata.node_id)
+
+         self.delete_nodes(collection_name, node_ids_to_delete)
+
+     def _save_collection(self, collection_name: str) -> None:
+         """
+         Helper function to save the updated store back to the file system.
+
+         :param collection_name: Name of the store to save.
+         """
+
+         store_path = os.path.join(self.base_index_path, collection_name)
+         store = self.collections[collection_name]
+         if len(store["nodes"]) == 0:
+             logging.warning(
+                 f"Cannot save FAISS index for store {collection_name} because nodes are empty."
+             )
+             return
+         # Save category index mapping to file
+         with open(os.path.join(store_path, "category_index_mapping.pkl"), "wb") as f:
+             pickle.dump(store["category_index_mapping"], f)
+
+         # Save metadata index mapping to file
+         with open(os.path.join(store_path, "metadata_index_mapping.pkl"), "wb") as f:
+             pickle.dump(store["metadata_index_mapping"], f)
+
+         # Save nodes to file
+         with open(os.path.join(store_path, "nodes.pkl"), "wb") as f:
+             pickle.dump(store["nodes"], f)
+
+         # Save embeddings to file
+         with open(os.path.join(store_path, "embeddings_index_mapping.pkl"), "wb") as f:
+             pickle.dump(store["embeddings"], f)
+
+         faiss.write_index(store["index"], os.path.join(store_path, "index"))
+         logging.info(f"Store {collection_name} saved successfully.")
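
For orientation, the sketch below is not part of the published wheel; it is a minimal usage example of the `add`/`query` path that `FaissStore` exposes above. The vector values and metadata dictionaries are invented for illustration, and constructing the store in its default local mode will load the `Snowflake/snowflake-arctic-embed-m` SentenceTransformer model (set `use_embedding_api=True` with an `OPENAI_API_KEY` to use the OpenAI embeddings API instead).

    import numpy as np

    from rakam_systems_vectorstore.components.vectorstore.faiss_vector_store import FaissStore

    # initialising=True skips loading any existing on-disk collections.
    store = FaissStore(name="demo_store", base_index_path="faiss_indexes", initialising=True)

    # Toy 4-dimensional vectors purely for illustration; real vectors would come
    # from store.get_embeddings() or the configured embedding API.
    vectors = [
        [0.1, 0.2, 0.3, 0.4],
        [0.4, 0.3, 0.2, 0.1],
    ]
    metadatas = [
        {"content": "first chunk", "source": "a.txt"},
        {"content": "second chunk", "source": "b.txt"},
    ]

    assigned_ids = store.add(vectors, metadatas)        # IDs allocated in the "default" collection
    hits = store.query([0.1, 0.2, 0.3, 0.4], top_k=1)   # each hit carries "score" and "id"
    print(assigned_ids, hits)

Because the index is an `IndexFlatIP` over L2-normalized vectors, the returned "score" is the cosine similarity between the query and the stored vector.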