PyPI - ddi-fw - Versions diffs - 0.0.105__py3-none-any.whl → 0.0.106__py3-none-any.whl - Mend

ddi-fw 0.0.105__py3-none-any.whl → 0.0.106__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

ddi_fw/langchain/storage.py CHANGED Viewed

@@ -10,6 +10,7 @@ from langchain.docstore.document import Document
 from langchain.document_loaders import DataFrameLoader
 from langchain.text_splitter import TextSplitter
+import numpy as np
 # from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
 from ddi_fw.langchain.embeddings import SBertEmbeddings
@@ -24,6 +25,46 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
+def split_dataframe(df, min_size=512, max_size=1024):
+    # Ensure the total size of the DataFrame is larger than the desired split size
+    total_size = len(df)
+    # Check if the dataframe is large enough
+    if total_size <= min_size:
+       return df
+    # List to store partial DataFrames
+    partial_dfs = []
+    # Start splitting the DataFrame
+    start_idx = 0
+    while start_idx < total_size:
+        # Calculate the size of the next chunk: it should be between min_size and max_size
+        chunk_size = np.random.randint(min_size, max_size + 1)
+        # Ensure that the chunk size does not exceed the remaining data
+        chunk_size = min(chunk_size, total_size - start_idx)
+        # Create the partial DataFrame and append to the list
+        partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
+        # Update the start index for the next chunk
+        start_idx += chunk_size
+    return partial_dfs
+# Example Usage:
+# Assuming df is your large DataFrame
+# df = pd.read_csv('your_large_file.csv')  # or your DataFrame
+# Split the DataFrame into smaller chunks
+partial_dfs = split_dataframe(df)
+# Verify the sizes of the partial DataFrames
+for i, partial_df in enumerate(partial_dfs):
+    print(f"Partial DataFrame {i+1} size: {len(partial_df)}")
 class DataFrameToVectorDB:
     def __init__(self,
@@ -83,11 +124,15 @@ class DataFrameToVectorDB:
         Store function to handle both full and partial dataframe processing.
         """
         if partial_df_size:
-            # Process the dataframe in chunks if partial_df_size is provided
-            for i in range(0, len(df), partial_df_size):
-                batch = df[i: i + partial_df_size]
-                self.store_documents(df=batch, columns=columns,
+            partial_dfs  = split_dataframe(df, min_size = partial_df_size)
+            for partial_df in partial_dfs:
+                self.store_documents(df=partial_df, columns=columns,
                                      page_content_columns=page_content_columns)
+            # Process the dataframe in chunks if partial_df_size is provided
+            # for i in range(0, len(df), partial_df_size):
+            #     batch = df[i: i + partial_df_size]
+            #     self.store_documents(df=batch, columns=columns,
+            #                          page_content_columns=page_content_columns)
         else:
             # Process the entire dataframe if no partial_df_size is specified
             self.store_documents(df=df, columns=columns,

{ddi_fw-0.0.105.dist-info → ddi_fw-0.0.106.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.105
+Version: 0.0.106
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>

{ddi_fw-0.0.105.dist-info → ddi_fw-0.0.106.dist-info}/RECORD RENAMED Viewed

@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=F7mvNxLQV2wRh2C__qyY4jpJK9udzkETEvQDXlqKbxk,7505
+ddi_fw/langchain/storage.py,sha256=SUhwEXH3OWl-6mmTJsAsHwuvR6XWhiWjrVqi5kW6q3U,9171
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.105.dist-info/METADATA,sha256=lALwj-QIiaHuYlqdeT1L5wp23Lz7FI9ZeihBHHfUCos,1967
-ddi_fw-0.0.105.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-ddi_fw-0.0.105.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.105.dist-info/RECORD,,
+ddi_fw-0.0.106.dist-info/METADATA,sha256=P1ygEaW4bV2r8dXkyfzSmratZdacfPzOzr35O-cujGQ,1967
+ddi_fw-0.0.106.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.106.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.106.dist-info/RECORD,,

{ddi_fw-0.0.105.dist-info → ddi_fw-0.0.106.dist-info}/WHEEL RENAMED Viewed

File without changes

{ddi_fw-0.0.105.dist-info → ddi_fw-0.0.106.dist-info}/top_level.txt RENAMED Viewed

File without changes

ddi-fw 0.0.105__py3-none-any.whl → 0.0.106__py3-none-any.whl

ddi-fw 0.0.105__py3-none-any.whl → 0.0.106__py3-none-any.whl