ddi-fw 0.0.108__py3-none-any.whl → 0.0.110__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -25,16 +25,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config
 
+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,19 +45,57 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows  # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs
 
+
+def split_dataframe_indices(df, min_size=512):
+    total_size = len(df)
+
+    # If the dataframe is smaller than min_size, return the entire range
+    if total_size <= min_size:
+        return [(0, total_size - 1)]
+
+    # List to store the start and end indices of each chunk
+    chunk_indices = []
+    start_idx = 0
+
+    # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
+    num_chunks = total_size // min_size
+    remaining_rows = total_size
+
+    # Split into chunks
+    for i in range(num_chunks):
+        chunk_size = min_size
+        if (remaining_rows - chunk_size) < min_size:
+            chunk_size = remaining_rows  # Last chunk takes all remaining rows
+
+        # Calculate the ending index of the chunk (inclusive, hence chunk_size - 1)
+        end_idx = start_idx + chunk_size - 1
+        chunk_indices.append((start_idx, end_idx))
+
+        # Update the start index and remaining rows
+        start_idx += chunk_size
+        remaining_rows -= chunk_size
+
+    # If there are any remaining rows after the loop, they should form the last chunk
+    if remaining_rows > 0:
+        end_idx = start_idx + remaining_rows - 1
+        chunk_indices.append((start_idx, end_idx))
+
+    return chunk_indices
+
+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
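
The added split_dataframe_indices mirrors split_dataframe but returns inclusive (start, end) index pairs instead of materialized slices, which is what the revised __split_list below consumes. A minimal usage sketch, assuming pandas is installed and both helpers are in scope; the toy DataFrame is purely illustrative:

import pandas as pd

# Toy frame of 1,300 rows with the default min_size of 512.
df = pd.DataFrame({"text": [f"row {i}" for i in range(1300)]})

chunks = split_dataframe(df, min_size=512)
print([len(c) for c in chunks])   # [512, 788]: the short tail folds into the last chunk

pairs = split_dataframe_indices(df, min_size=512)
print(pairs)                      # [(0, 511), (512, 1299)]: inclusive bounds

# The two views agree: iloc over each inclusive pair reproduces the chunks.
for (s, e), chunk in zip(pairs, chunks):
    assert chunk.equals(df.iloc[s:e + 1])
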
@@ -78,8 +117,9 @@ class DataFrameToVectorDB:
         return docs
 
     def __split_list(self, input_list, batch_size):
-        for i in range(0, len(input_list), batch_size):
-            yield input_list[i:i + batch_size]
+        # for i in range(0, len(input_list), batch_size):
+        for s, e in split_dataframe_indices(input_list):
+            yield input_list[s:e+1]
 
     def store_documents(self, df, columns, page_content_columns):
         """
@@ -91,6 +131,26 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column  # Set the type column
+            # if partial_df_size:
+            #     documents = []
+
+            #     partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+            #     for partial_df in partial_dfs:
+            #         print(f"{page_content_column}, size:{len(partial_df)}")
+            #         print(partial_df.count())
+            #         loader = DataFrameLoader(
+            #             data_frame=partial_df, page_content_column=page_content_column)
+            #         loaded_docs = loader.load()
+            #         documents.extend(self.__split_docs(loaded_docs))
+            #         print(f"Documents size: {len(documents)}")
+            #     split_docs_chunked = self.__split_list(
+            #         documents, self.batch_size)
+
+            #     for split_docs_chunk in split_docs_chunked:
+            #         print(f"Split docs size: {len(split_docs_chunk)}")
+            #         self.vectordb.add_documents(split_docs_chunk)
+            #         self.vectordb.persist()
+            # else:
             documents = []
 
             loader = DataFrameLoader(
@@ -98,37 +158,13 @@ class DataFrameToVectorDB:
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))
 
-            split_docs_chunked = self.__split_list(documents, self.batch_size)
+            split_docs_chunked = self.__split_list(
+                documents, self.batch_size)
 
             for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                #     collection_name=collection_name,
-                #     documents=split_docs_chunk,
-                #     embedding=embeddings,
-                #     persist_directory=persist_directory,
-                # )
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()
 
-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            #     batch = df[i: i + partial_df_size]
-            #     self.store_documents(df=batch, columns=columns,
-            #                          page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
-
 
 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
     """
@@ -159,11 +195,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         #     print(f"Configuration for collection {id} not found.")
         #     continue
 
-        embedding_model_type = collection_config['embedding_model_type']
-        text_splitters_types = collection_config['text_splitters_types']
-        batch_size = collection_config['batch_size']
-        columns = collection_config['columns']
-        page_content_columns = collection_config['page_content_columns']
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'
 
         # Load the embedding model and text splitter dynamically
@@ -171,8 +207,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
 
         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config['model_kwargs']
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -183,7 +218,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(text_splitter_type.get("type"))
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -204,5 +240,5 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
                                                embeddings=model,
                                                text_splitter=text_splitter,
                                                batch_size=1024)
-            to_vector_db.store(
-                df, columns, page_content_columns, partial_df_size=batch_size)
+            to_vector_db.store_documents(
+                df, columns, page_content_columns)
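
generate_embeddings now reads every per-collection setting with collection_config.get(...), so a missing key yields None rather than raising a KeyError. The configuration file's overall layout is not visible in this diff; the sketch below only illustrates a per-collection entry with the keys the code actually reads, and the class paths shown are assumptions, not values taken from the package:

# Hypothetical per-collection entry; only the key names read via .get() come
# from the code above, the wrapper structure and class paths are illustrative.
collection_config = {
    "embedding_model_type": "langchain_community.embeddings.HuggingFaceEmbeddings",
    "model_kwargs": {},
    "batch_size": 1024,
    "columns": ["name"],
    "page_content_columns": ["description"],
    "text_splitters_types": [
        {
            "type": "langchain.text_splitter.RecursiveCharacterTextSplitter",
            "params": {"chunk_size": 1000, "chunk_overlap": 100},
            "suffix": "recursive",
        }
    ],
}
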
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.108
+Version: 0.0.110
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=LHbrN9QJ6-aV6jaxYHCcJ2shgdrgH4Y4vCuFMSxYvrw,9028
+ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.108.dist-info/METADATA,sha256=g_nsmA5N7aVJkBiWJNZAbkdcoCXGxBxnspfpVRg_n-0,1967
-ddi_fw-0.0.108.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-ddi_fw-0.0.108.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.108.dist-info/RECORD,,
+ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
+ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.110.dist-info/RECORD,,