ddi-fw 0.0.239__py3-none-any.whl → 0.0.240__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/chroma_storage.py +243 -0
- ddi_fw/langchain/faiss_storage.py +223 -0
- {ddi_fw-0.0.239.dist-info → ddi_fw-0.0.240.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.239.dist-info → ddi_fw-0.0.240.dist-info}/RECORD +6 -4
- {ddi_fw-0.0.239.dist-info → ddi_fw-0.0.240.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.239.dist-info → ddi_fw-0.0.240.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/chroma_storage.py
@@ -0,0 +1,243 @@
import pandas as pd
from langchain.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.documents import Document
from langchain.text_splitter import TextSplitter
from typing import Callable, Optional, Dict, Any, List
import numpy as np

from ddi_fw.langchain.faiss_storage import BaseVectorStoreManager
from langchain.document_loaders import DataFrameLoader


def split_dataframe(df, min_size=512):
    total_size = len(df)
    # If the dataframe is smaller than min_size, return the dataframe as a whole
    if total_size <= min_size:
        return [df]

    # List to store partial DataFrames
    partial_dfs = []
    start_idx = 0

    # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
    num_chunks = total_size // min_size
    remaining_rows = total_size
    # Split into chunks
    for i in range(num_chunks):
        # If there are fewer rows left than the size of the chunk, adjust the chunk size
        chunk_size = min_size
        if (remaining_rows - chunk_size) < min_size:
            chunk_size = remaining_rows  # Last chunk takes all remaining rows

        partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])

        # Update the start index and remaining rows
        start_idx += chunk_size
        remaining_rows -= chunk_size

    # If there are any remaining rows left after the loop, they should form the last chunk
    if remaining_rows > 0:
        partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])

    return partial_dfs


def split_dataframe_indices(df, min_size=512):
    total_size = len(df)

    # If the dataframe is smaller than min_size, return the entire range
    if total_size <= min_size:
        return [(0, total_size - 1)]

    # List to store the start and end indices of each chunk
    chunk_indices = []
    start_idx = 0

    # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
    num_chunks = total_size // min_size
    remaining_rows = total_size

    # Split into chunks
    for i in range(num_chunks):
        chunk_size = min_size
        if (remaining_rows - chunk_size) < min_size:
            chunk_size = remaining_rows  # Last chunk takes all remaining rows

        # Calculate the ending index of the chunk (inclusive, hence chunk_size - 1)
        end_idx = start_idx + chunk_size - 1
        chunk_indices.append((start_idx, end_idx))

        # Update the start index and remaining rows
        start_idx += chunk_size
        remaining_rows -= chunk_size

    # If there are any remaining rows after the loop, they should form the last chunk
    if remaining_rows > 0:
        end_idx = start_idx + remaining_rows - 1
        chunk_indices.append((start_idx, end_idx))

    return chunk_indices

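# [Editor's note, not part of the released file] Worked example of the chunking logic
# above, assuming a hypothetical 1300-row DataFrame and the default min_size=512:
#   num_chunks = 1300 // 512 = 2
#   chunk 1 -> rows 0..511    (512 rows)
#   chunk 2 -> rows 512..1299 (788 rows; splitting again would leave fewer than 512 rows)
# split_dataframe returns these as partial DataFrames, while split_dataframe_indices
# returns the same split as inclusive (start, end) pairs: [(0, 511), (512, 1299)].
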
class ChromaVectorStoreManager(BaseVectorStoreManager):
    def __init__(
        self,
        embeddings: Embeddings,
        collection_name: str,
        persist_directory: str,
        text_splitter: TextSplitter,
        batch_size: int = 1024
    ):
        super().__init__(embeddings)
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.text_splitter = text_splitter
        self.batch_size = batch_size

    # def __split_docs(self, documents):
    #     docs = self.text_splitter.split_documents(documents)
    #     return docs

    # def __split_list(self, input_list, batch_size):
    #     # for i in range(0, len(input_list), batch_size):
    #     batch_size = len(input_list) if batch_size == None else batch_size
    #     for s, e in split_dataframe_indices(input_list, batch_size):
    #         yield input_list[s:e+1]

    # def store_documents(self, df, columns, page_content_columns, partial_df_size=None):
    #     """
    #     Core function that processes the documents and adds them to the vector database.
    #     """
    #     for page_content_column in page_content_columns:
    #         copy_columns = columns.copy()
    #         copy_columns.append(page_content_column)
    #         col_df = df[copy_columns].copy()
    #         col_df.dropna(subset=[page_content_column], inplace=True)
    #         col_df['type'] = page_content_column  # Set the type column
    #         if partial_df_size:
    #             total = 0
    #             partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
    #             for partial_df in partial_dfs:
    #                 # import torch

    #                 documents = []
    #                 loader = DataFrameLoader(
    #                     data_frame=partial_df, page_content_column=page_content_column)
    #                 loaded_docs = loader.load()
    #                 # print(loaded_docs)
    #                 documents.extend(self.__split_docs(loaded_docs))
    #                 split_docs_chunked = self.__split_list(
    #                     documents, self.batch_size)
    #                 for split_docs_chunk in split_docs_chunked:
    #                     print("entered chunks")
    #                     self.vector_store.add_documents(split_docs_chunk)
    #                     self.vector_store.persist()
    #                 total += len(partial_df)
    #                 print(f"{page_content_column}: {total}/{len(col_df)}")
    #         else:
    #             documents = []
    #             print(col_df.shape)
    #             loader = DataFrameLoader(
    #                 data_frame=col_df, page_content_column=page_content_column)
    #             loaded_docs = loader.load()
    #             documents.extend(self.__split_docs(loaded_docs))
    #             print(f"Documents size: {len(loaded_docs)}")
    #             split_docs_chunked = self.__split_list(
    #                 documents, self.batch_size)
    #             for split_docs_chunk in split_docs_chunked:
    #                 # import torch
    #                 # torch.cuda.empty_cache()
    #                 self.vector_store.add_documents(split_docs_chunk)
    #                 self.vector_store.persist()
    #                 print(f"{page_content_column}, size:{len(split_docs_chunk)}")

    def generate_vector_store(self, docs: List[Document]):
        self.vector_store = Chroma(
            collection_name=self.collection_name,
            persist_directory=self.persist_directory,
            embedding_function=self.embeddings
        )
        if self.text_splitter:
            docs = self.text_splitter.split_documents(docs)
        # Chunk docs for batch processing
        for i in range(0, len(docs), self.batch_size):
            chunk = docs[i:i+self.batch_size]
            self.vector_store.add_documents(chunk)
            self.vector_store.persist()
        print(f"✅ Chroma vector store created with {len(docs)} documents.")

    def save(self, path):
        # Chroma persists automatically, but you can copy files if needed
        print("ChromaDB persists automatically. No explicit save needed.")

    def load(self, path):
        self.vector_store = Chroma(
            collection_name=self.collection_name,
            persist_directory=path,
            embedding_function=self.embeddings
        )

    def as_dataframe(
        self,
        formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None
    ) -> pd.DataFrame:
        # Chroma does not expose direct vector access, so we fetch all docs and embeddings
        results = self.vector_store.get()
        docs = results['documents']
        metadatas = results['metadatas']
        embeddings = results['embeddings']
        items = []
        for doc, meta, emb in zip(docs, metadatas, embeddings):
            document = Document(page_content=doc, metadata=meta)
            if formatter_fn:
                item = formatter_fn(document, np.array(emb))
            else:
                item = {"embedding": emb, **meta}
            items.append(item)
        return pd.DataFrame(items)

    def get_data(self, id):
        # Chroma does not use integer IDs, but document IDs (UUIDs)
        results = self.vector_store.get(ids=[id])
        if not results['documents']:
            raise ValueError("Document not found.")
        return {
            "doc_id": id,
            "document": Document(page_content=results['documents'][0], metadata=results['metadatas'][0]),
            "vector": np.array(results['embeddings'][0])
        }

    def get_all_vectors(self):
        results = self.vector_store.get()
        return np.array(results['embeddings'])

    def get_vector_by_id(self, id):
        results = self.vector_store.get(ids=[id])
        if not results['embeddings']:
            raise ValueError("Vector not found.")
        return np.array(results['embeddings'][0])

    def get_document_by_index(self, index):
        results = self.vector_store.get()
        docs = results['documents']
        metadatas = results['metadatas']
        if index >= len(docs):
            raise IndexError("Index out of range.")
        return Document(page_content=docs[index], metadata=metadatas[index])

    def get_similar_embeddings(self, embedding_list, k):
        # Chroma does not provide direct similarity search on arbitrary embeddings
        # You can use vector_store.similarity_search_by_vector for a single embedding
        raise NotImplementedError("Chroma does not support batch similarity search by embedding list.")

    def get_similar_docs(self, embedding, filter=None, top_k=3):
        results = self.vector_store.similarity_search_by_vector(
            embedding, k=top_k, filter=filter
        )
        return results[:top_k]
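For orientation, a minimal usage sketch of the new ChromaVectorStoreManager follows. It is not part of the package diff: the all-MiniLM-L6-v2 embedding model, the collection name, the ./chroma_db path, and the sample document are illustrative assumptions, and the sketch presumes a langchain/chromadb combination in which langchain.vectorstores.Chroma still exposes persist().

# Illustrative sketch only (not part of ddi_fw); model, names, and paths are assumptions.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from ddi_fw.langchain.chroma_storage import ChromaVectorStoreManager

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)

manager = ChromaVectorStoreManager(
    embeddings=embeddings,
    collection_name="drug_descriptions",   # assumed collection name
    persist_directory="./chroma_db",       # assumed path
    text_splitter=splitter,
    batch_size=256,
)

docs = [Document(page_content="Aspirin is a salicylate used to treat pain ...",
                 metadata={"drugbank_id": "DB00945", "type": "description"})]
manager.generate_vector_store(docs)

# Query with a raw embedding vector; the filter uses Chroma's metadata filter syntax.
query_vec = embeddings.embed_query("salicylate analgesic")
hits = manager.get_similar_docs(query_vec, filter={"type": "description"}, top_k=3)
for doc in hits:
    print(doc.metadata.get("drugbank_id"), doc.page_content[:60])

get_similar_docs() simply forwards the query vector to Chroma's similarity_search_by_vector, so it returns plain Document objects rather than (document, score) pairs.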
ddi_fw/langchain/faiss_storage.py
@@ -0,0 +1,223 @@
import faiss
import pandas as pd
from uuid import uuid4
from langchain_community.vectorstores.faiss import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from typing import Callable, Optional, Dict, Any
from langchain_core.documents import Document
import numpy as np  # optional, if you're using NumPy vectors
from langchain_core.embeddings import Embeddings

class BaseVectorStoreManager:
    def __init__(self, embeddings: Embeddings):
        self.embeddings = embeddings

    def generate_vector_store(self, docs):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def save(self, path):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def load(self, path):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def as_dataframe(self, formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None) -> pd.DataFrame:
        raise NotImplementedError("This method should be implemented by subclasses.")

class VectorStoreManager:
    def __init__(self, embeddings: Embeddings):
        self.embeddings = embeddings
        self.index = None
        self.vector_store = None

    # def generate_vector_store(self, docs):
    #     dimension = len(self.embeddings.embed_query("hello world"))
    #     self.index = faiss.IndexFlatL2(dimension)
    #     index_to_docstore_id = {}

    #     self.vector_store = FAISS(
    #         embedding_function=self.embeddings,
    #         index=self.index,
    #         docstore=InMemoryDocstore(),
    #         index_to_docstore_id=index_to_docstore_id,
    #     )

    #     uuids = [str(uuid4()) for _ in range(len(docs))]
    #     self.vector_store.add_documents(documents=docs, ids=uuids)

    def generate_vector_store(self, docs, handle_empty='zero'):
        """
        Generate a FAISS vector store from documents.

        Parameters:
            docs (list[Document]): List of LangChain Document objects.
            handle_empty (str): How to handle empty docs. Options:
                - 'zero': assign zero-vector
                - 'skip': skip the document
                - 'error': raise ValueError
        """

        # Step 1: Get embedding dimension from a sample input
        sample_embedding = self.embeddings.embed_query("hello world")
        dimension = len(sample_embedding)
        zero_vector = np.zeros(dimension, dtype=np.float32)

        self.index = faiss.IndexFlatL2(dimension)
        index_to_docstore_id = {}
        docstore = InMemoryDocstore()
        self.vector_store = FAISS(
            embedding_function=self.embeddings,
            index=self.index,
            docstore=docstore,
            index_to_docstore_id=index_to_docstore_id,
        )

        valid_docs = []
        valid_ids = []

        for doc in docs:
            content = doc.page_content if hasattr(doc, 'page_content') else ""
            if content and content.strip():
                valid_docs.append(doc)
                valid_ids.append(str(uuid4()))
            else:
                if handle_empty == 'skip':
                    continue
                elif handle_empty == 'zero':
                    # Assign zero vector manually
                    doc_id = str(uuid4())
                    index_to_docstore_id[len(docstore._dict)] = doc_id
                    docstore._dict[doc_id] = doc
                    self.index.add(np.array([zero_vector]))
                elif handle_empty == 'error':
                    raise ValueError("Document has empty or blank content.")
                else:
                    raise ValueError(f"Unknown handle_empty mode: {handle_empty}")

        # Step 2: Embed and add valid documents
        if valid_docs:
            self.vector_store.add_documents(documents=valid_docs, ids=valid_ids)
        elif handle_empty != 'zero':
            raise ValueError("No valid documents to embed.")

        print(f"✅ Vector store created with {self.index.ntotal} vectors.")

    def save(self, path):
        if self.vector_store:
            self.vector_store.save_local(path)
        else:
            raise ValueError("No vector store to save.")

    def load(self, path):
        self.vector_store = FAISS.load_local(
            path, self.embeddings, allow_dangerous_deserialization=True
        )
        self.index = self.vector_store.index

    def as_dataframe(
        self,
        formatter_fn: Optional[Callable[[Document, np.ndarray], Dict[str, Any]]] = None
    ) -> pd.DataFrame:

        if not self.index or not self.vector_store:
            raise ValueError("Index or vector store not initialized.")

        vector_dict = {}
        for i in range(self.index.ntotal):
            vector = self.index.reconstruct(i)
            doc_id = self.vector_store.index_to_docstore_id[i]
            document = self.vector_store.docstore.search(doc_id)

            if formatter_fn:
                item = formatter_fn(document, vector)
            else:
                item = {
                    "embedding": vector,
                    **document.metadata
                }

            vector_dict[i] = item

        return pd.DataFrame.from_dict(vector_dict, orient='index')

    def get_data(self, id):
        if not self.index or not self.vector_store:
            raise ValueError("Index or vector store not initialized.")

        vector = self.index.reconstruct(id)
        doc_id = self.vector_store.index_to_docstore_id[id]
        document = self.vector_store.docstore.search(doc_id)
        return {"doc_id": doc_id, "document": document, "vector": vector}

    def get_all_vectors(self):
        if not self.index:
            raise ValueError("Index not initialized.")
        return self.index.reconstruct_n(0, self.index.ntotal)

    def get_vector_by_id(self, id):
        if not self.index:
            raise ValueError("Index not initialized.")
        return self.index.reconstruct(id)

    def get_document_by_index(self, index):
        doc_id = self.vector_store.index_to_docstore_id[index]
        document = self.vector_store.docstore.search(doc_id)
        return document

    def get_similar_embeddings(self, embedding_list, k):
        num_vectors, dim = embedding_list.shape

        # 2. Normalize for cosine similarity
        faiss.normalize_L2(embedding_list)

        # 3. Build FAISS index
        index = faiss.IndexFlatIP(dim)
        index.add(embedding_list)

        # 4. Query top-k+1 to exclude self-match
        # k = 4  # Request top 4, so we can drop self and keep 3
        D, I = index.search(embedding_list, k+1)

        # 5. Prepare output arrays
        top_k_ids_list = []
        top_k_avg_embeddings = []

        # id_list = desc_df['drugbank_id'].tolist()

        for i in range(num_vectors):
            indices = I[i]

            # Exclude self (assume it's the first match)
            filtered = [idx for idx in indices if idx != i][:k]

            # top_ids = [id_list[j] for j in filtered]
            top_embeds = embedding_list[filtered]

            avg_embed = np.mean(top_embeds, axis=0) if len(top_embeds) > 0 else np.zeros(dim)

            # top_k_ids_list.append(top_ids)
            top_k_ids_list.append(filtered)
            top_k_avg_embeddings.append(avg_embed)
        return top_k_ids_list, top_k_avg_embeddings

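    # [Editor's note, not in the released file] In get_similar_embeddings above,
    # faiss.normalize_L2 rescales the rows of embedding_list in place (it expects a
    # C-contiguous float32 array of shape (n, dim)), so the inner products computed by
    # IndexFlatIP are cosine similarities. k+1 neighbours are requested so that the
    # self-match can be dropped before averaging the remaining k embeddings.
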
    def get_similar_docs(self, embedding, filter, top_k=3):
        # Perform similarity search
        results = self.vector_store.similarity_search_with_score_by_vector(
            embedding,
            k=top_k,  # Fetch more in case original sneaks in
            filter=filter
        )

        # Extract top-k drugbank_ids
        # top_k_ids = [doc.metadata.get("drugbank_id") for doc, score in results[:top_k]]
        # return top_k_ids
        return results[:top_k]


def custom_formatter(document: Document, vector: np.ndarray) -> Dict[str, Any]:
    return {
        "drugbank_id": document.metadata.get("drugbank_id", None),
        "type": document.metadata.get("type", None),
        "embedding": vector
    }
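A corresponding sketch for the FAISS-backed VectorStoreManager and the module-level custom_formatter is below. Again, this is illustrative only and not part of the diff: the embedding model, the sample documents, and the "faiss_store" path are assumptions; save() and load() delegate to FAISS.save_local / FAISS.load_local, which write and read an index file plus a pickled docstore under the given directory.

# Illustrative sketch only (not part of ddi_fw); model, documents, and paths are assumptions.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from ddi_fw.langchain.faiss_storage import VectorStoreManager, custom_formatter

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
manager = VectorStoreManager(embeddings)

docs = [
    Document(page_content="Warfarin is an anticoagulant ...",
             metadata={"drugbank_id": "DB00682", "type": "description"}),
    Document(page_content="",  # blank content exercises handle_empty='zero'
             metadata={"drugbank_id": "DB00001", "type": "description"}),
]
manager.generate_vector_store(docs, handle_empty="zero")

manager.save("faiss_store")
manager.load("faiss_store")

# Flatten the store into a DataFrame with drugbank_id / type / embedding columns
df = manager.as_dataframe(formatter_fn=custom_formatter)
print(df.head())

With handle_empty="zero", the blank document is kept in the store but mapped to an all-zero vector instead of being passed to the embedding model.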
{ddi_fw-0.0.239.dist-info → ddi_fw-0.0.240.dist-info}/RECORD
@@ -4,7 +4,9 @@ ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl
 ddi_fw/datasets/db_utils.py,sha256=xRj28U_uXTRPHcz3yIICczFUHXUPiAOZtAj5BM6kH44,6465
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
+ddi_fw/langchain/chroma_storage.py,sha256=wzJoGixoUHfAbuockB6CGoI0eXsFXRl4Xzl2x8PDz0E,9927
 ddi_fw/langchain/embeddings.py,sha256=eEWy4okcjdhUJHi4N48Wd8XauPXyeaQVLUdNWEvtEcY,6754
+ddi_fw/langchain/faiss_storage.py,sha256=lE2TKtDBp2Pi0sRozARxlT40_lFq_LJxl0N__yuHIQw,8389
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=FteYEawCkVQOaK-cTv2VrHZ2ZnfeFr31BD6VucO7_DQ,268
@@ -36,7 +38,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=QQQGhCti653BdU343Ag1bH_g1fzi2hlic7dgNy7otjE,7694
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.240.dist-info/METADATA,sha256=tS_nqwTtm_g2aqgtzYXH-irscCiJjgSCWs3lbmWhGco,2632
+ddi_fw-0.0.240.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ddi_fw-0.0.240.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.240.dist-info/RECORD,,
File without changes
|
File without changes
|