ddi-fw 0.0.106__py3-none-any.whl → 0.0.108__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,47 +25,38 @@ def load_configuration(config_file):
25
25
  config = json.load(f)
26
26
  return config
27
27
 
28
- def split_dataframe(df, min_size=512, max_size=1024):
29
- # Ensure the total size of the DataFrame is larger than the desired split size
28
+ def split_dataframe(df, min_size=512):
30
29
  total_size = len(df)
31
-
32
- # Check if the dataframe is large enough
30
+ # If the dataframe is smaller than min_size, return the dataframe as a whole
33
31
  if total_size <= min_size:
34
- return df
32
+ return [df]
35
33
 
36
34
  # List to store partial DataFrames
37
35
  partial_dfs = []
38
-
39
- # Start splitting the DataFrame
40
36
  start_idx = 0
41
37
 
42
- while start_idx < total_size:
43
- # Calculate the size of the next chunk: it should be between min_size and max_size
44
- chunk_size = np.random.randint(min_size, max_size + 1)
38
+ # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
39
+ num_chunks = total_size // min_size
40
+ remaining_rows = total_size
41
+ # Split into chunks
42
+ for i in range(num_chunks):
43
+ # If there are fewer rows left than the size of the chunk, adjust the chunk size
44
+ chunk_size = min_size
45
+ if (remaining_rows - chunk_size) < min_size:
46
+ chunk_size = remaining_rows # Last chunk takes all remaining rows
45
47
 
46
- # Ensure that the chunk size does not exceed the remaining data
47
- chunk_size = min(chunk_size, total_size - start_idx)
48
-
49
- # Create the partial DataFrame and append to the list
50
48
  partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
51
49
 
52
- # Update the start index for the next chunk
50
+ # Update the start index and remaining rows
53
51
  start_idx += chunk_size
52
+ remaining_rows -= chunk_size
53
+
54
+ # If there are any remaining rows left after the loop, they should form the last chunk
55
+ if remaining_rows > 0:
56
+ partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
54
57
 
55
58
  return partial_dfs
56
59
 
57
- # Example Usage:
58
- # Assuming df is your large DataFrame
59
- # df = pd.read_csv('your_large_file.csv') # or your DataFrame
60
-
61
- # Split the DataFrame into smaller chunks
62
- partial_dfs = split_dataframe(df)
63
-
64
- # Verify the sizes of the partial DataFrames
65
- for i, partial_df in enumerate(partial_dfs):
66
- print(f"Partial DataFrame {i+1} size: {len(partial_df)}")
67
-
68
-
69
60
  class DataFrameToVectorDB:
70
61
  def __init__(self,
71
62
  collection_name,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.106
3
+ Version: 0.0.108
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
58
58
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
59
59
  ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
60
60
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
61
- ddi_fw/langchain/storage.py,sha256=SUhwEXH3OWl-6mmTJsAsHwuvR6XWhiWjrVqi5kW6q3U,9171
61
+ ddi_fw/langchain/storage.py,sha256=LHbrN9QJ6-aV6jaxYHCcJ2shgdrgH4Y4vCuFMSxYvrw,9028
62
62
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
63
63
  ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
64
64
  ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
91
91
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
92
92
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
93
93
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
94
- ddi_fw-0.0.106.dist-info/METADATA,sha256=P1ygEaW4bV2r8dXkyfzSmratZdacfPzOzr35O-cujGQ,1967
95
- ddi_fw-0.0.106.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
- ddi_fw-0.0.106.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
- ddi_fw-0.0.106.dist-info/RECORD,,
94
+ ddi_fw-0.0.108.dist-info/METADATA,sha256=g_nsmA5N7aVJkBiWJNZAbkdcoCXGxBxnspfpVRg_n-0,1967
95
+ ddi_fw-0.0.108.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
+ ddi_fw-0.0.108.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
+ ddi_fw-0.0.108.dist-info/RECORD,,