ddi-fw 0.0.105__py3-none-any.whl → 0.0.106__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,7 @@ from langchain.docstore.document import Document
10
10
  from langchain.document_loaders import DataFrameLoader
11
11
 
12
12
  from langchain.text_splitter import TextSplitter
13
+ import numpy as np
13
14
 
14
15
  # from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
15
16
  from ddi_fw.langchain.embeddings import SBertEmbeddings
@@ -24,6 +25,46 @@ def load_configuration(config_file):
24
25
  config = json.load(f)
25
26
  return config
26
27
 
28
def split_dataframe(df, min_size=512, max_size=1024):
    """Split a DataFrame into consecutive row chunks of random size.

    Each chunk length is drawn uniformly from ``[min_size, max_size]``
    (inclusive) with ``np.random.randint``. The final chunk is whatever
    remains, so it may be shorter than ``min_size``. Chunks are positional
    ``iloc`` slices taken in order, so concatenating them reproduces ``df``.

    Args:
        df: The DataFrame to split.
        min_size: Smallest chunk length to draw. Must be at least 1.
        max_size: Largest chunk length to draw. Values below ``min_size``
            are raised to ``min_size``, which yields fixed-size chunks.

    Returns:
        A list of DataFrames. A frame with at most ``min_size`` rows is
        returned as a single-element list so callers can always iterate
        over the result.

    Raises:
        ValueError: If ``min_size`` is smaller than 1.

    Example:
        partial_dfs = split_dataframe(df)
        for i, partial_df in enumerate(partial_dfs):
            print(f"Partial DataFrame {i+1} size: {len(partial_df)}")
    """
    if min_size < 1:
        # A zero or negative draw would emit empty chunks or move the
        # cursor backwards instead of advancing through the frame.
        raise ValueError("min_size must be a positive integer")

    # Callers may pass only min_size; keep the sampling range valid when
    # that value exceeds max_size (randint requires low < high).
    max_size = max(max_size, min_size)

    total_size = len(df)

    # Always hand back a list: iterating a bare DataFrame walks its column
    # labels, not its rows or chunks.
    if total_size <= min_size:
        return [df]

    partial_dfs = []
    start_idx = 0

    while start_idx < total_size:
        # randint's upper bound is exclusive, hence the + 1.
        chunk_size = np.random.randint(min_size, max_size + 1)

        # Never read past the end of the frame.
        chunk_size = min(chunk_size, total_size - start_idx)

        partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
        start_idx += chunk_size

    return partial_dfs
67
+
27
68
 
28
69
  class DataFrameToVectorDB:
29
70
  def __init__(self,
@@ -83,11 +124,15 @@ class DataFrameToVectorDB:
83
124
  Store function to handle both full and partial dataframe processing.
84
125
  """
85
126
  if partial_df_size:
86
- # Process the dataframe in chunks if partial_df_size is provided
87
- for i in range(0, len(df), partial_df_size):
88
- batch = df[i: i + partial_df_size]
89
- self.store_documents(df=batch, columns=columns,
127
+ partial_dfs = split_dataframe(df, min_size = partial_df_size)
128
+ for partial_df in partial_dfs:
129
+ self.store_documents(df=partial_df, columns=columns,
90
130
  page_content_columns=page_content_columns)
131
+ # Process the dataframe in chunks if partial_df_size is provided
132
+ # for i in range(0, len(df), partial_df_size):
133
+ # batch = df[i: i + partial_df_size]
134
+ # self.store_documents(df=batch, columns=columns,
135
+ # page_content_columns=page_content_columns)
91
136
  else:
92
137
  # Process the entire dataframe if no partial_df_size is specified
93
138
  self.store_documents(df=df, columns=columns,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ddi_fw
3
- Version: 0.0.105
3
+ Version: 0.0.106
4
4
  Summary: Do not use :)
5
5
  Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
6
6
  Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
58
58
  ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
59
59
  ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
60
60
  ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
61
- ddi_fw/langchain/storage.py,sha256=F7mvNxLQV2wRh2C__qyY4jpJK9udzkETEvQDXlqKbxk,7505
61
+ ddi_fw/langchain/storage.py,sha256=SUhwEXH3OWl-6mmTJsAsHwuvR6XWhiWjrVqi5kW6q3U,9171
62
62
  ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
63
63
  ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
64
64
  ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
91
91
  ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
92
92
  ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
93
93
  ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
94
- ddi_fw-0.0.105.dist-info/METADATA,sha256=lALwj-QIiaHuYlqdeT1L5wp23Lz7FI9ZeihBHHfUCos,1967
95
- ddi_fw-0.0.105.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
- ddi_fw-0.0.105.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
- ddi_fw-0.0.105.dist-info/RECORD,,
94
+ ddi_fw-0.0.106.dist-info/METADATA,sha256=P1ygEaW4bV2r8dXkyfzSmratZdacfPzOzr35O-cujGQ,1967
95
+ ddi_fw-0.0.106.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
96
+ ddi_fw-0.0.106.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
97
+ ddi_fw-0.0.106.dist-info/RECORD,,