ddi-fw 0.0.107__py3-none-any.whl → 0.0.109__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/langchain/storage.py +57 -17
- {ddi_fw-0.0.107.dist-info → ddi_fw-0.0.109.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.107.dist-info → ddi_fw-0.0.109.dist-info}/RECORD +5 -5
- {ddi_fw-0.0.107.dist-info → ddi_fw-0.0.109.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.107.dist-info → ddi_fw-0.0.109.dist-info}/top_level.txt +0 -0
ddi_fw/langchain/storage.py
CHANGED
@@ -25,35 +25,74 @@ def load_configuration(config_file):
|
|
25
25
|
config = json.load(f)
|
26
26
|
return config
|
27
27
|
|
28
|
-
def split_dataframe(df, min_size=512
|
29
|
-
# Ensure the total size of the DataFrame is larger than the desired split size
|
28
|
+
def split_dataframe(df, min_size=512):
    """Split *df* into consecutive row chunks of at least *min_size* rows.

    A frame that cannot yield two chunks of ``min_size`` is returned whole
    as a single-element list. Otherwise the frame is cut into
    ``len(df) // min_size`` pieces of ``min_size`` rows, with the final
    piece absorbing any remainder so no chunk ever falls below ``min_size``.

    :param df: pandas DataFrame (anything supporting ``len`` and ``.iloc``).
    :param min_size: minimum number of rows per chunk.
    :return: list of DataFrame slices covering *df* in order.
    """
    n_rows = len(df)

    # Too small to split: hand back the whole frame.
    if n_rows <= min_size:
        return [df]

    chunks = []
    cursor = 0
    left = n_rows

    for _ in range(n_rows // min_size):
        # The last piece takes everything left; earlier pieces are fixed-size.
        take = left if left - min_size < min_size else min_size
        chunks.append(df.iloc[cursor:cursor + take])
        cursor += take
        left -= take

    # Defensive: emit any rows the loop did not consume (unreachable in
    # practice, since the final iteration absorbs the remainder).
    if left:
        chunks.append(df.iloc[cursor:cursor + left])

    return chunks
|
56
59
|
|
60
|
+
def split_dataframe_indices(df, min_size=512):
    """Compute inclusive (start, end) row-index pairs for min-size chunks.

    Mirrors ``split_dataframe`` but returns positional index ranges instead
    of slices, so it works on anything with a ``len``. Each pair covers at
    least ``min_size`` rows except when the whole input is smaller than
    ``min_size``, in which case a single pair spanning everything is
    returned.

    :param df: any sized object (DataFrame, list, ...).
    :param min_size: minimum number of rows per chunk.
    :return: list of ``(start_idx, end_idx)`` tuples, end index inclusive.
    """
    n_rows = len(df)

    # Too small to split: one range covering the whole input.
    if n_rows <= min_size:
        return [(0, n_rows - 1)]

    bounds = []
    cursor = 0
    left = n_rows

    for _ in range(n_rows // min_size):
        # The last piece takes everything left; earlier pieces are fixed-size.
        take = left if left - min_size < min_size else min_size
        # End index is inclusive, hence the -1.
        bounds.append((cursor, cursor + take - 1))
        cursor += take
        left -= take

    # Defensive: cover any rows the loop did not consume (unreachable in
    # practice, since the final iteration absorbs the remainder).
    if left:
        bounds.append((cursor, cursor + left - 1))

    return bounds
|
95
|
+
|
57
96
|
class DataFrameToVectorDB:
|
58
97
|
def __init__(self,
|
59
98
|
collection_name,
|
@@ -75,8 +114,9 @@ class DataFrameToVectorDB:
|
|
75
114
|
return docs
|
76
115
|
|
77
116
|
def __split_list(self, input_list, batch_size):
    """Yield consecutive batches of *input_list* for vector-store insertion.

    Delegates chunk sizing to ``split_dataframe_indices`` so each batch
    holds at least *batch_size* items, with the final batch absorbing any
    remainder.

    Fix: the 0.0.109 revision dropped *batch_size* and always chunked by
    the helper's default of 512, silently ignoring the caller's argument;
    pass it through as ``min_size`` so the parameter is honoured again.

    :param input_list: any sized, sliceable sequence of documents.
    :param batch_size: minimum number of items per yielded batch.
    """
    for start, end in split_dataframe_indices(input_list, batch_size):
        # Returned indices are inclusive, hence the +1 on the upper bound.
        yield input_list[start:end + 1]
|
80
120
|
|
81
121
|
def store_documents(self, df, columns, page_content_columns):
|
82
122
|
"""
|
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
|
|
58
58
|
ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
|
59
59
|
ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
|
60
60
|
ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
|
61
|
-
ddi_fw/langchain/storage.py,sha256=
|
61
|
+
ddi_fw/langchain/storage.py,sha256=02cIFOrPxo2Nav5ufkKLOh8cfVY8QupxiO6rzRjNB7I,10399
|
62
62
|
ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
|
63
63
|
ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
|
64
64
|
ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
|
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
|
|
91
91
|
ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
|
92
92
|
ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
|
93
93
|
ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
|
94
|
-
ddi_fw-0.0.
|
95
|
-
ddi_fw-0.0.
|
96
|
-
ddi_fw-0.0.
|
97
|
-
ddi_fw-0.0.
|
94
|
+
ddi_fw-0.0.109.dist-info/METADATA,sha256=0kpJYBb0pe7lXlETKf2vSuK0_dOizijcPX0N4h4aBN8,1967
|
95
|
+
ddi_fw-0.0.109.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
96
|
+
ddi_fw-0.0.109.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
97
|
+
ddi_fw-0.0.109.dist-info/RECORD,,
|
File without changes
|
File without changes
|