ddi-fw 0.0.109__py3-none-any.whl → 0.0.110__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/langchain/storage.py

@@ -25,16 +25,17 @@ def load_configuration(config_file):
         config = json.load(f)
     return config

+
 def split_dataframe(df, min_size=512):
     total_size = len(df)
     # If the dataframe is smaller than min_size, return the dataframe as a whole
     if total_size <= min_size:
         return [df]
-
+
     # List to store partial DataFrames
     partial_dfs = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks we need to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
@@ -44,55 +45,57 @@ def split_dataframe(df, min_size=512):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows # Last chunk takes all remaining rows
-
+
         partial_dfs.append(df.iloc[start_idx:start_idx + chunk_size])
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows left after the loop, they should form the last chunk
     if remaining_rows > 0:
         partial_dfs.append(df.iloc[start_idx:start_idx + remaining_rows])
-
+
     return partial_dfs

+
 def split_dataframe_indices(df, min_size=512):
     total_size = len(df)
-
+
     # If the dataframe is smaller than min_size, return the entire range
     if total_size <= min_size:
         return [(0, total_size - 1)]
-
+
     # List to store the start and end indices of each chunk
     chunk_indices = []
     start_idx = 0
-
+
     # Calculate the minimum number of chunks needed to ensure each chunk has at least min_size
     num_chunks = total_size // min_size
     remaining_rows = total_size
-
+
     # Split into chunks
     for i in range(num_chunks):
         chunk_size = min_size
         if (remaining_rows - chunk_size) < min_size:
             chunk_size = remaining_rows # Last chunk takes all remaining rows
-
+
         # Calculate the ending index of the chunk (exclusive, hence chunk_size - 1)
         end_idx = start_idx + chunk_size - 1
         chunk_indices.append((start_idx, end_idx))
-
+
         # Update the start index and remaining rows
         start_idx += chunk_size
         remaining_rows -= chunk_size
-
+
     # If there are any remaining rows after the loop, they should form the last chunk
     if remaining_rows > 0:
         end_idx = start_idx + remaining_rows - 1
         chunk_indices.append((start_idx, end_idx))
-
+
     return chunk_indices

+
 class DataFrameToVectorDB:
     def __init__(self,
                  collection_name,
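
The two helpers above carve a DataFrame into chunks of at least min_size rows, folding any remainder into the last chunk. A minimal sketch of the expected behaviour (illustrative only; it assumes pandas is installed and the helpers are importable from ddi_fw.langchain.storage):

import pandas as pd

from ddi_fw.langchain.storage import split_dataframe, split_dataframe_indices

# Hypothetical 1200-row frame with min_size=512: 1200 // 512 = 2 chunks,
# and the 176 leftover rows are absorbed by the second chunk.
df = pd.DataFrame({"text": [f"row {i}" for i in range(1200)]})

print([len(chunk) for chunk in split_dataframe(df, min_size=512)])
# -> [512, 688]

print(split_dataframe_indices(df, min_size=512))
# -> [(0, 511), (512, 1199)]  (inclusive start/end row positions)
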
@@ -115,7 +118,7 @@ class DataFrameToVectorDB:

     def __split_list(self, input_list, batch_size):
         # for i in range(0, len(input_list), batch_size):
-        for s,e in split_dataframe_indices(input_list):
+        for s, e in split_dataframe_indices(input_list):
             yield input_list[s:e+1]

     def store_documents(self, df, columns, page_content_columns):
@@ -128,6 +131,26 @@ class DataFrameToVectorDB:
             col_df = df[copy_columns].copy()
             col_df.dropna(subset=[page_content_column], inplace=True)
             col_df['type'] = page_content_column # Set the type column
+            # if partial_df_size:
+            #     documents = []
+
+            #     partial_dfs = split_dataframe(col_df, min_size=partial_df_size)
+            #     for partial_df in partial_dfs:
+            #         print(f"{page_content_column}, size:{len(partial_df)}")
+            #         print(partial_df.count())
+            #         loader = DataFrameLoader(
+            #             data_frame=partial_df, page_content_column=page_content_column)
+            #         loaded_docs = loader.load()
+            #         documents.extend(self.__split_docs(loaded_docs))
+            #     print(f"Documents size: {len(documents)}")
+            #     split_docs_chunked = self.__split_list(
+            #         documents, self.batch_size)
+
+            #     for split_docs_chunk in split_docs_chunked:
+            #         print(f"Split docs size: {len(split_docs_chunk)}")
+            #         self.vectordb.add_documents(split_docs_chunk)
+            #         self.vectordb.persist()
+            # else:
             documents = []

             loader = DataFrameLoader(
@@ -135,37 +158,13 @@ class DataFrameToVectorDB:
             loaded_docs = loader.load()
             documents.extend(self.__split_docs(loaded_docs))

-            split_docs_chunked = self.__split_list(documents, self.batch_size)
+            split_docs_chunked = self.__split_list(
+                documents, self.batch_size)

             for split_docs_chunk in split_docs_chunked:
-                # vectordb = Chroma.from_documents(
-                #     collection_name=collection_name,
-                #     documents=split_docs_chunk,
-                #     embedding=embeddings,
-                #     persist_directory=persist_directory,
-                # )
                 self.vectordb.add_documents(split_docs_chunk)
                 self.vectordb.persist()

-    def store(self, df, columns, page_content_columns, partial_df_size=None):
-        """
-        Store function to handle both full and partial dataframe processing.
-        """
-        if partial_df_size:
-            partial_dfs = split_dataframe(df, min_size = partial_df_size)
-            for partial_df in partial_dfs:
-                self.store_documents(df=partial_df, columns=columns,
-                                     page_content_columns=page_content_columns)
-            # Process the dataframe in chunks if partial_df_size is provided
-            # for i in range(0, len(df), partial_df_size):
-            #     batch = df[i: i + partial_df_size]
-            #     self.store_documents(df=batch, columns=columns,
-            #                          page_content_columns=page_content_columns)
-        else:
-            # Process the entire dataframe if no partial_df_size is specified
-            self.store_documents(df=df, columns=columns,
-                                 page_content_columns=page_content_columns)
-

 def generate_embeddings(df, config_file, new_model_names, collections=None, persist_directory="embeddings"):
     """
@@ -196,11 +195,11 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         # print(f"Configuration for collection {id} not found.")
         # continue

-        embedding_model_type = collection_config['embedding_model_type']
-        text_splitters_types = collection_config['text_splitters_types']
-        batch_size = collection_config['batch_size']
-        columns = collection_config['columns']
-        page_content_columns = collection_config['page_content_columns']
+        embedding_model_type = collection_config.get('embedding_model_type')
+        text_splitters_types = collection_config.get('text_splitters_types')
+        batch_size = collection_config.get('batch_size')
+        columns = collection_config.get('columns')
+        page_content_columns = collection_config.get('page_content_columns')
         persist_directory = f'{persist_directory}/{id}'

         # Load the embedding model and text splitter dynamically
@@ -208,8 +207,7 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers

         # Assuming the classes for the embeddings and splitters are available
         try:
-            model_kwargs = collection_config['model_kwargs']
-            SBertEmbeddings(model_name="a", model_config={})
+            model_kwargs = collection_config.get('model_kwargs')
             model = get_import(embedding_model_type)(
                 model_name=name, **model_kwargs)
         except:
@@ -220,7 +218,8 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
         text_splitters_suffixes = []
         for text_splitter_type in text_splitters_types:
             try:
-                type_of_text_splitter = get_import(text_splitter_type.get("type"))
+                type_of_text_splitter = get_import(
+                    text_splitter_type.get("type"))
                 kwargs = text_splitter_type.get("params")
                 suffix = text_splitter_type.get("suffix")
                 if kwargs:
@@ -241,5 +240,5 @@ def generate_embeddings(df, config_file, new_model_names, collections=None, pers
                 embeddings=model,
                 text_splitter=text_splitter,
                 batch_size=1024)
-            to_vector_db.store(
-                df, columns, page_content_columns, partial_df_size=batch_size)
+            to_vector_db.store_documents(
+                df, columns, page_content_columns)
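
Net effect of the storage.py changes: the store() wrapper and its partial_df_size chunking path are removed (the equivalent logic survives only as commented-out code inside store_documents), generate_embeddings now calls store_documents() directly, and collection settings are read with dict.get(), which yields None for a missing key instead of raising KeyError. A rough sketch of the difference, using a made-up collection entry (key names follow the code above; the values are assumptions, not taken from a real config file):

# Illustrative collection entry as generate_embeddings now reads it.
collection_config = {
    "embedding_model_type": "langchain_huggingface.HuggingFaceEmbeddings",  # hypothetical value
    "batch_size": 1024,
    "columns": ["name"],
    "page_content_columns": ["description"],
}

# 0.0.109 behaviour: a missing key stopped the run immediately.
# collection_config['model_kwargs']  -> raises KeyError

# 0.0.110 behaviour: a missing key comes back as None and only surfaces later,
# e.g. when **model_kwargs is unpacked.
model_kwargs = collection_config.get("model_kwargs")  # -> None

# 0.0.110 call path (sketch): the whole DataFrame is stored in one pass.
# to_vector_db = DataFrameToVectorDB(..., batch_size=1024)
# to_vector_db.store_documents(df, columns, page_content_columns)
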
ddi_fw-0.0.109.dist-info/METADATA → ddi_fw-0.0.110.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ddi_fw
-Version: 0.0.109
+Version: 0.0.110
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
ddi_fw-0.0.109.dist-info/RECORD → ddi_fw-0.0.110.dist-info/RECORD

@@ -58,7 +58,7 @@ ddi_fw/drugbank/event_extractor.py,sha256=6odoZohhK7OdLF-LF0l-5BFq0_NMG_5jrFJbHr
 ddi_fw/langchain/__init__.py,sha256=zS0CQrakWEP19biSRewFJGcBT8WBZq4899HrEKiMqUY,269
 ddi_fw/langchain/embeddings.py,sha256=lU64a5AZ62jP8U3hTSwK0kXt7gThbwPACLfJMZ1baPA,7538
 ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSzu5L2PXzg,280
-ddi_fw/langchain/storage.py,sha256=02cIFOrPxo2Nav5ufkKLOh8cfVY8QupxiO6rzRjNB7I,10399
+ddi_fw/langchain/storage.py,sha256=gIQfpRG1t8SY1r3jWZGp-MOblQ_-8EgqZ55A1ZQ8kBg,10047
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=o4-w5Xa3t4olLW4ymx_8L-Buhe5wfQEmT2bh4Zz544c,13066
 ddi_fw/ml/ml_helper.py,sha256=fySjIAFzkeEOvaLJhDwtCOgRhgYQ7H106eqaP16GhDY,4489
@@ -91,7 +91,7 @@ ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
 ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,5567
-ddi_fw-0.0.109.dist-info/METADATA,sha256=0kpJYBb0pe7lXlETKf2vSuK0_dOizijcPX0N4h4aBN8,1967
-ddi_fw-0.0.109.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
-ddi_fw-0.0.109.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.109.dist-info/RECORD,,
+ddi_fw-0.0.110.dist-info/METADATA,sha256=bozy0pU7E9nOOqwWvqXof28dNdfr9UQqoyb1OVKyfHw,1967
+ddi_fw-0.0.110.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+ddi_fw-0.0.110.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.110.dist-info/RECORD,,