ddi-fw 0.0.239__tar.gz → 0.0.240__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/PKG-INFO +1 -1
  2. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/pyproject.toml +1 -1
  3. ddi_fw-0.0.240/src/ddi_fw/langchain/chroma_storage.py +243 -0
  4. ddi_fw-0.0.240/src/ddi_fw/langchain/faiss_storage.py +223 -0
  5. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/PKG-INFO +1 -1
  6. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/SOURCES.txt +2 -0
  7. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/README.md +0 -0
  8. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/setup.cfg +0 -0
  9. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/datasets/__init__.py +0 -0
  10. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/datasets/core.py +0 -0
  11. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/datasets/dataset_splitter.py +0 -0
  12. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/datasets/db_utils.py +0 -0
  13. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/datasets/setup_._py +0 -0
  14. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/langchain/__init__.py +0 -0
  15. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/langchain/embeddings.py +0 -0
  16. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/langchain/sentence_splitter.py +0 -0
  17. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/langchain/storage.py +0 -0
  18. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/__init__.py +0 -0
  19. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/evaluation_helper.py +0 -0
  20. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/ml_helper.py +0 -0
  21. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/model_wrapper.py +0 -0
  22. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/pytorch_wrapper.py +0 -0
  23. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/tensorflow_wrapper.py +0 -0
  24. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ml/tracking_service.py +0 -0
  25. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ner/__init__.py +0 -0
  26. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ner/mmlrestclient.py +0 -0
  27. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/ner/ner.py +0 -0
  28. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/__init__.py +0 -0
  29. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/multi_modal_combination_strategy.py +0 -0
  30. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/multi_pipeline.py +0 -0
  31. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/multi_pipeline_org.py +0 -0
  32. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/ner_pipeline.py +0 -0
  33. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/pipeline/pipeline.py +0 -0
  34. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/__init__.py +0 -0
  35. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/categorical_data_encoding_checker.py +0 -0
  36. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/enums.py +0 -0
  37. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/json_helper.py +0 -0
  38. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/kaggle.py +0 -0
  39. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/numpy_utils.py +0 -0
  40. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/package_helper.py +0 -0
  41. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/py7zr_helper.py +0 -0
  42. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/utils.py +0 -0
  43. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/utils/zip_helper.py +0 -0
  44. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/vectorization/__init__.py +0 -0
  45. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/vectorization/feature_vector_generation.py +0 -0
  46. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw/vectorization/idf_helper.py +0 -0
  47. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/dependency_links.txt +0 -0
  48. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/requires.txt +0 -0
  49. {ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/top_level.txt +0 -0
{ddi_fw-0.0.239 → ddi_fw-0.0.240}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.239
+Version: 0.0.240
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.239 → ddi_fw-0.0.240}/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ddi_fw"
-version = "0.0.239"
+version = "0.0.240"
 description = "Do not use :)"
 readme = "README.md"
 authors = [
ddi_fw-0.0.240/src/ddi_fw/langchain/chroma_storage.py (new file)
@@ -0,0 +1,243 @@
+import pandas as pd
+from langchain.vectorstores import Chroma
+from langchain_core.embeddings import Embeddings
+from langchain_core.documents import Document
+from langchain.text_splitter import TextSplitter
+from typing import Callable, Optional, Dict, Any, List
+import numpy as np
+
+from ddi_fw.langchain.faiss_storage import BaseVectorStoreManager
+from langchain.document_loaders import DataFrameLoader
+
+
+def split_dataframe(df, min_size=512):
+    total_size = len(df)
+    # If the dataframe is smaller than min_size, return the dataframe as a whole
+    if total_size <= min_size:
+        return [df]
+
+    # List to store partial DataFrames
+    partial_dfs = []
+    start_idx = 0
+
+    # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
+    num_chunks = total_size // min_size
+    remaining_rows = total_size
+    # Split into chunks
+    for i in range(num_chunks):
+        # If there are fewer rows left than the size of the chunk, adjust the chunk size
+        chunk_size = min_size
+        if (remaining_rows - chunk_size) < min_size:
+            chunk_size = remaining_rows  # Last chunk takes all remaining rows
+
+        partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
+
+        # Update the start index and remaining rows
+        start_idx += chunk_size
+        remaining_rows -= chunk_size
+
+    # If there are any remaining rows left after the loop, they should form the last chunk
+    if remaining_rows > 0:
+        partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
+
+    return partial_dfs
+
+
+def split_dataframe_indices(df, min_size=512):
+    total_size = len(df)
+
+    # If the dataframe is smaller than min_size, return the entire range
+    if total_size <= min_size:
+        return [(0, total_size - 1)]
+
+    # List to store the start and end indices of each chunk
+    chunk_indices = []
+    start_idx = 0
+
+    # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
+    num_chunks = total_size // min_size
+    remaining_rows = total_size
+
+    # Split into chunks
+    for i in range(num_chunks):
+        chunk_size = min_size
+        if (remaining_rows - chunk_size) < min_size:
+            chunk_size = remaining_rows  # Last chunk takes all remaining rows
+
+        # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
+        end_idx = start_idx + chunk_size - 1
+        chunk_indices.append((start_idx, end_idx))
+
+        # Update the start index and remaining rows
+        start_idx += chunk_size
+        remaining_rows -= chunk_size
+
+    # If there are any remaining rows after the loop, they should form the last chunk
+    if remaining_rows > 0:
+        end_idx = start_idx + remaining_rows - 1
+        chunk_indices.append((start_idx, end_idx))
+
+    return chunk_indices
+
+
+
+class ChromaVectorStoreManager(BaseVectorStoreManager):
+    def __init__(
+        self,
+        embeddings: Embeddings,
+        collection_name: str,
+        persist_directory: str,
+        text_splitter: TextSplitter,
+        batch_size: int = 1024
+    ):
+        super().__init__(embeddings)
+        self.collection_name = collection_name
+        self.persist_directory = persist_directory
+        self.text_splitter = text_splitter
+        self.batch_size = batch_size
+
+
+
+    # def __split_docs(self, documents):
+    #     docs = self.text_splitter.split_documents(documents)
+    #     return docs
+
+    # def __split_list(self, input_list, batch_size):
+    #     # for i in range(0, len(input_list), batch_size):
+    #     batch_size = len(input_list) if batch_size == None else batch_size
+    #     for s, e in split_dataframe_indices(input_list, batch_size):
+    #         yield input_list[s:e+1]
+
+    # def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
+    #     """
+    #     Core function that processes the documents and adds them to the vector database.
+    #     """
+    #     for page_content_column in page_content_columns:
+    #         copy_columns = columns.copy()
+    #         copy_columns.append(page_content_column)
+    #         col_df = df[copy_columns].copy()
+    #         col_df.dropna(subset=[page_content_column], inplace=True)
+    #         col_df['type'] = page_content_column  # Set the type column
+    #         if partial_df_size:
+    #             total = 0
+    #             partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+    #             for partial_df in partial_dfs:
+    #                 # import torch
+
+    #                 documents = []
+    #                 loader = DataFrameLoader(
+    #                     data_frame=partial_df, page_content_column=page_content_column)
+    #                 loaded_docs = loader.load()
+    #                 # print(loaded_docs)
+    #                 documents.extend(self.__split_docs(loaded_docs))
+    #                 split_docs_chunked = self.__split_list(
+    #                     documents, self.batch_size)
+    #                 for split_docs_chunk in split_docs_chunked:
+    #                     print("entered chunks")
+    #                     self.vector_store.add_documents(split_docs_chunk)
+    #                     self.vector_store.persist()
+    #                 total += len(partial_df)
+    #                 print(f"{page_content_column}: {total}/{len(col_df)}")
+    #         else:
+    #             documents = []
+    #             print(col_df.shape)
+    #             loader = DataFrameLoader(
+    #                 data_frame=col_df, page_content_column=page_content_column)
+    #             loaded_docs = loader.load()
+    #             documents.extend(self.__split_docs(loaded_docs))
+    #             print(f"Documents size: {len(loaded_docs)}")
+    #             split_docs_chunked = self.__split_list(
+    #                 documents, self.batch_size)
+    #             for split_docs_chunk in split_docs_chunked:
+    #                 # import torch
+    #                 # torch.cuda.empty_cache()
+    #                 self.vector_store.add_documents(split_docs_chunk)
+    #                 self.vector_store.persist()
+    #                 print(f"{page_content_column}, size:{len(split_docs_chunk)}")
+
+
+
+    def generate_vector_store(self, docs: List[Document]):
+        self.vector_store = Chroma(
+            collection_name=self.collection_name,
+            persist_directory=self.persist_directory,
+            embedding_function=self.embeddings
+        )
+        if self.text_splitter:
+            docs = self.text_splitter.split_documents(docs)
+        # Chunk docs for batch processing
+        for i in range(0, len(docs), self.batch_size):
+            chunk = docs[i:i+self.batch_size]
+            self.vector_store.add_documents(chunk)
+            self.vector_store.persist()
+        print(f"✅ Chroma vector store created with {len(docs)} documents.")
+
+    def save(self, path):
+        # Chroma persists automatically, but you can copy files if needed
+        print("ChromaDB persists automatically. No explicit save needed.")
+
+    def load(self, path):
+        self.vector_store = Chroma(
+            collection_name=self.collection_name,
+            persist_directory=path,
+            embedding_function=self.embeddings
+        )
+
+    def as_dataframe(
+        self,
+        formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None
+    ) -> pd.DataFrame:
+        # Chroma does not expose direct vector access, so we fetch all docs and embeddings
+        results = self.vector_store.get()
+        docs = results['documents']
+        metadatas = results['metadatas']
+        embeddings = results['embeddings']
+        items = []
+        for doc, meta, emb in zip(docs, metadatas, embeddings):
+            document = Document(page_content=doc, metadata=meta)
+            if formatter_fn:
+                item = formatter_fn(document, np.array(emb))
+            else:
+                item = {"embedding": emb, **meta}
+            items.append(item)
+        return pd.DataFrame(items)
+
+    def get_data(self, id):
+        # Chroma does not use integer IDs, but document IDs (UUIDs)
+        results = self.vector_store.get(ids=[id])
+        if not results['documents']:
+            raise ValueError("Document not found.")
+        return {
+            "doc_id": id,
+            "document": Document(page_content=results['documents'][0], metadata=results['metadatas'][0]),
+            "vector": np.array(results['embeddings'][0])
+        }
+
+    def get_all_vectors(self):
+        results = self.vector_store.get()
+        return np.array(results['embeddings'])
+
+    def get_vector_by_id(self, id):
+        results = self.vector_store.get(ids=[id])
+        if not results['embeddings']:
+            raise ValueError("Vector not found.")
+        return np.array(results['embeddings'][0])
+
+    def get_document_by_index(self, index):
+        results = self.vector_store.get()
+        docs = results['documents']
+        metadatas = results['metadatas']
+        if index >= len(docs):
+            raise IndexError("Index out of range.")
+        return Document(page_content=docs[index], metadata=metadatas[index])
+
+    def get_similar_embeddings(self, embedding_list, k):
+        # Chroma does not provide direct similarity search on arbitrary embeddings
+        # You can use vector_store.similarity_search_by_vector for a single embedding
+        raise NotImplementedError("Chroma does not support batch similarity search by embedding list.")
+
+    def get_similar_docs(self, embedding, filter=None, top_k=3):
+        results = self.vector_store.similarity_search_by_vector(
+            embedding, k=top_k, filter=filter
+        )
+        return results[:top_k]
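For orientation, here is a minimal usage sketch of the ChromaVectorStoreManager added above. It is not part of the release: the ToyEmbeddings class, the sample DrugBank rows, and the paths are placeholders, and running it additionally assumes chromadb is installed together with a langchain version that still exposes Chroma.persist(), which generate_vector_store calls.

# Hypothetical usage sketch; ToyEmbeddings, the sample rows, and the paths are placeholders.
import numpy as np
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.embeddings import Embeddings

from ddi_fw.langchain.chroma_storage import ChromaVectorStoreManager

class ToyEmbeddings(Embeddings):
    # Deterministic stand-in for a real embedding model (dimension 32).
    def embed_query(self, text):
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return rng.random(32).tolist()
    def embed_documents(self, texts):
        return [self.embed_query(t) for t in texts]

df = pd.DataFrame({
    "drugbank_id": ["DB00001", "DB00002"],
    "description": ["Lepirudin is a recombinant hirudin ...",
                    "Cetuximab is an EGFR-binding antibody ..."],
})

manager = ChromaVectorStoreManager(
    embeddings=ToyEmbeddings(),
    collection_name="drug_descriptions",
    persist_directory="./chroma_db",
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64),
    batch_size=1024,
)

# Each DataFrame row becomes a Document; the non-content columns land in metadata.
docs = DataFrameLoader(data_frame=df, page_content_column="description").load()
manager.generate_vector_store(docs)

# Nearest documents to an ad-hoc query embedding.
hits = manager.get_similar_docs(ToyEmbeddings().embed_query("anticoagulant"), top_k=2)
for doc in hits:
    print(doc.metadata.get("drugbank_id"), doc.page_content[:60])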
ddi_fw-0.0.240/src/ddi_fw/langchain/faiss_storage.py (new file)
@@ -0,0 +1,223 @@
+import faiss
+import pandas as pd
+from uuid import uuid4
+from langchain_community.vectorstores.faiss import FAISS
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from typing import Callable, Optional, Dict, Any
+from langchain_core.documents import Document
+import numpy as np  # optional, if you're using NumPy vectors
+from langchain_core.embeddings import Embeddings
+
+class BaseVectorStoreManager:
+    def __init__(self, embeddings: Embeddings):
+        self.embeddings = embeddings
+
+    def generate_vector_store(self, docs):
+        raise NotImplementedError("This method should be implemented by subclasses.")
+
+    def save(self, path):
+        raise NotImplementedError("This method should be implemented by subclasses.")
+
+    def load(self, path):
+        raise NotImplementedError("This method should be implemented by subclasses.")
+
+    def as_dataframe(self, formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None) -> pd.DataFrame:
+        raise NotImplementedError("This method should be implemented by subclasses.")
+
+class VectorStoreManager:
+    def __init__(self, embeddings: Embeddings):
+        self.embeddings = embeddings
+        self.index = None
+        self.vector_store = None
+
+    # def generate_vector_store(self, docs):
+    #     dimension = len(self.embeddings.embed_query("hello world"))
+    #     self.index = faiss.IndexFlatL2(dimension)
+    #     index_to_docstore_id = {}
+
+    #     self.vector_store = FAISS(
+    #         embedding_function=self.embeddings,
+    #         index=self.index,
+    #         docstore=InMemoryDocstore(),
+    #         index_to_docstore_id=index_to_docstore_id,
+    #     )
+
+    #     uuids = [str(uuid4()) for _ in range(len(docs))]
+    #     self.vector_store.add_documents(documents=docs, ids=uuids)
+
+
+    def generate_vector_store(self, docs, handle_empty='zero'):
+        """
+        Generate a FAISS vector store from documents.
+
+        Parameters:
+            docs (list[Document]): List of LangChain Document objects.
+            handle_empty (str): How to handle empty docs. Options:
+                - 'zero': assign zero-vector
+                - 'skip': skip the document
+                - 'error': raise ValueError
+        """
+
+        # Step 1: Get embedding dimension from a sample input
+        sample_embedding = self.embeddings.embed_query("hello world")
+        dimension = len(sample_embedding)
+        zero_vector = np.zeros(dimension, dtype=np.float32)
+
+        self.index = faiss.IndexFlatL2(dimension)
+        index_to_docstore_id = {}
+        docstore = InMemoryDocstore()
+        self.vector_store = FAISS(
+            embedding_function=self.embeddings,
+            index=self.index,
+            docstore=docstore,
+            index_to_docstore_id=index_to_docstore_id,
+        )
+
+        valid_docs = []
+        valid_ids = []
+
+        for doc in docs:
+            content = doc.page_content if hasattr(doc, 'page_content') else ""
+            if content and content.strip():
+                valid_docs.append(doc)
+                valid_ids.append(str(uuid4()))
+            else:
+                if handle_empty == 'skip':
+                    continue
+                elif handle_empty == 'zero':
+                    # Assign zero vector manually
+                    doc_id = str(uuid4())
+                    index_to_docstore_id[len(docstore._dict)] = doc_id
+                    docstore._dict[doc_id] = doc
+                    self.index.add(np.array([zero_vector]))
+                elif handle_empty == 'error':
+                    raise ValueError("Document has empty or blank content.")
+                else:
+                    raise ValueError(f"Unknown handle_empty mode: {handle_empty}")
+
+        # Step 2: Embed and add valid documents
+        if valid_docs:
+            self.vector_store.add_documents(documents=valid_docs, ids=valid_ids)
+        elif handle_empty != 'zero':
+            raise ValueError("No valid documents to embed.")
+
+        print(f"✅ Vector store created with {self.index.ntotal} vectors.")
+
+    def save(self, path):
+        if self.vector_store:
+            self.vector_store.save_local(path)
+        else:
+            raise ValueError("No vector store to save.")
+
+    def load(self, path):
+        self.vector_store = FAISS.load_local(
+            path, self.embeddings, allow_dangerous_deserialization=True
+        )
+        self.index = self.vector_store.index
+
+    def as_dataframe(
+        self,
+        formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None
+    ) -> pd.DataFrame:
+
+        if not self.index or not self.vector_store:
+            raise ValueError("Index or vector store not initialized.")
+
+        vector_dict = {}
+        for i in range(self.index.ntotal):
+            vector = self.index.reconstruct(i)
+            doc_id = self.vector_store.index_to_docstore_id[i]
+            document = self.vector_store.docstore.search(doc_id)
+
+            if formatter_fn:
+                item = formatter_fn(document, vector)
+            else:
+                item = {
+                    "embedding": vector,
+                    **document.metadata
+                }
+
+            vector_dict[i] = item
+
+        return pd.DataFrame.from_dict(vector_dict, orient='index')
+
+    def get_data(self, id):
+        if not self.index or not self.vector_store:
+            raise ValueError("Index or vector store not initialized.")
+
+        vector = self.index.reconstruct(id)
+        doc_id = self.vector_store.index_to_docstore_id[id]
+        document = self.vector_store.docstore.search(doc_id)
+        return {"doc_id": doc_id, "document": document, "vector": vector}
+
+    def get_all_vectors(self):
+        if not self.index:
+            raise ValueError("Index not initialized.")
+        return self.index.reconstruct_n(0, self.index.ntotal)
+
+    def get_vector_by_id(self, id):
+        if not self.index:
+            raise ValueError("Index not initialized.")
+        return self.index.reconstruct(id)
+
+    def get_document_by_index(self, index):
+        doc_id = self.vector_store.index_to_docstore_id[index]
+        document = self.vector_store.docstore.search(doc_id)
+        return document
+
+    def get_similar_embeddings(self, embedding_list, k):
+        num_vectors, dim = embedding_list.shape
+
+        # 2. Normalize for cosine similarity
+        faiss.normalize_L2(embedding_list)
+
+        # 3. Build FAISS index
+        index = faiss.IndexFlatIP(dim)
+        index.add(embedding_list)
+
+        # 4. Query top-k+1 to exclude self-match
+        # k = 4  # Request top 4, so we can drop self and keep 3
+        D, I = index.search(embedding_list, k+1)
+
+        # 5. Prepare output arrays
+        top_k_ids_list = []
+        top_k_avg_embeddings = []
+
+        # id_list = desc_df['drugbank_id'].tolist()
+
+        for i in range(num_vectors):
+            indices = I[i]
+
+            # Exclude self (assume it's the first match)
+            filtered = [idx for idx in indices if idx != i][:k]
+
+            # top_ids = [id_list[j] for j in filtered]
+            top_embeds = embedding_list[filtered]
+
+            avg_embed = np.mean(top_embeds, axis=0) if len(top_embeds) > 0 else np.zeros(dim)
+
+            # top_k_ids_list.append(top_ids)
+            top_k_ids_list.append(filtered)
+            top_k_avg_embeddings.append(avg_embed)
+        return top_k_ids_list, top_k_avg_embeddings
+
+    def get_similar_docs(self, embedding, filter, top_k=3):
+        # Perform similarity search
+        results = self.vector_store.similarity_search_with_score_by_vector(
+            embedding,
+            k=top_k,  # Fetch more in case original sneaks in
+            filter=filter
+        )
+
+        # Extract top-k drugbank_ids
+        # top_k_ids = [doc.metadata.get("drugbank_id") for doc, score in results[:top_k]]
+        # return top_k_ids
+        return results[:top_k]
+
+
+def custom_formatter(document: Document, vector: np.ndarray) -> Dict[str, Any]:
+    return {
+        "drugbank_id": document.metadata.get("drugbank_id", None),
+        "type": document.metadata.get("type", None),
+        "embedding": vector
+    }
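A comparable sketch for the FAISS-backed VectorStoreManager, again with a placeholder embedding class and sample documents; it exercises the new handle_empty option, the save/load round trip, and the bundled custom_formatter. The local path "faiss_index" is arbitrary.

# Hypothetical usage sketch; ToyEmbeddings and the sample Documents are placeholders.
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings

from ddi_fw.langchain.faiss_storage import VectorStoreManager, custom_formatter

class ToyEmbeddings(Embeddings):
    # Deterministic stand-in for a real embedding model (dimension 8).
    def embed_query(self, text):
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return rng.random(8).tolist()
    def embed_documents(self, texts):
        return [self.embed_query(t) for t in texts]

docs = [
    Document(page_content="Lepirudin is a recombinant hirudin ...",
             metadata={"drugbank_id": "DB00001", "type": "description"}),
    Document(page_content="",  # blank content -> zero vector under handle_empty='zero'
             metadata={"drugbank_id": "DB00002", "type": "description"}),
]

manager = VectorStoreManager(embeddings=ToyEmbeddings())
manager.generate_vector_store(docs, handle_empty="zero")

# Round-trip the index through FAISS.save_local / load_local.
manager.save("faiss_index")
manager.load("faiss_index")

# Flatten vectors plus selected metadata into a DataFrame via custom_formatter.
df = manager.as_dataframe(formatter_fn=custom_formatter)
print(df[["drugbank_id", "type"]])

# L2 similarity search against a query embedding.
hits = manager.get_similar_docs(ToyEmbeddings().embed_query("anticoagulant"), filter=None, top_k=1)
for doc, score in hits:
    print(doc.metadata["drugbank_id"], score)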
{ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.239
+Version: 0.0.240
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
{ddi_fw-0.0.239 → ddi_fw-0.0.240}/src/ddi_fw.egg-info/SOURCES.txt
@@ -11,7 +11,9 @@ src/ddi_fw/datasets/dataset_splitter.py
 src/ddi_fw/datasets/db_utils.py
 src/ddi_fw/datasets/setup_._py
 src/ddi_fw/langchain/__init__.py
+src/ddi_fw/langchain/chroma_storage.py
 src/ddi_fw/langchain/embeddings.py
+src/ddi_fw/langchain/faiss_storage.py
 src/ddi_fw/langchain/sentence_splitter.py
 src/ddi_fw/langchain/storage.py
 src/ddi_fw/ml/__init__.py