ddi-fw 0.0.185__py3-none-any.whl → 0.0.187__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -60,10 +60,10 @@ class BaseDataset(BaseModel):
     dataset_splitter_type: Type[DatasetSplitter]
     class_column: str = 'class'
     dataframe: Optional[pd.DataFrame] = None
-    X_train: Optional[pd.DataFrame | np.ndarray] = None
-    X_test: Optional[pd.DataFrame | np.ndarray] = None
-    y_train: Optional[pd.Series | np.ndarray] = None
-    y_test: Optional[pd.Series | np.ndarray] = None
+    X_train: Optional[np.ndarray] = None
+    X_test: Optional[np.ndarray] = None
+    y_train: Optional[np.ndarray] = None
+    y_test: Optional[np.ndarray] = None
     train_indexes: Optional[pd.Index] = None
     test_indexes: Optional[pd.Index] = None
     train_idx_arr: Optional[List[np.ndarray]] = None
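
Note: the four split fields are narrowed here from "DataFrame or ndarray" to ndarray only, so every consumer can assume numpy semantics. Since BaseDataset is a pydantic model, ndarray-typed fields only validate when arbitrary types are allowed, as the Pipeline section further down does with its own Config. A minimal sketch of the narrowed shape (the class name and imports are illustrative, not from the package):

    from typing import Optional
    import numpy as np
    from pydantic import BaseModel

    class SplitHolder(BaseModel):
        # ndarray is not a pydantic-native type; without this config the
        # model definition raises a schema error for these fields
        class Config:
            arbitrary_types_allowed = True

        X_train: Optional[np.ndarray] = None
        X_test: Optional[np.ndarray] = None
        y_train: Optional[np.ndarray] = None
        y_test: Optional[np.ndarray] = None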
@@ -81,7 +81,7 @@ class BaseDataset(BaseModel):
             self.y_train), np.array(self.y_test)
 
         if self.columns is None or len(self.columns) == 0:
-            items.append([f'defaukt', np.nan_to_num(self.X_train),
+            items.append([f'default', np.nan_to_num(self.X_train),
                           y_train_label, np.nan_to_num(self.X_test), y_test_label])
         else:
             for index, column in enumerate(self.columns):
@@ -127,11 +127,12 @@ class BaseDataset(BaseModel):
         Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
         skip deriving them. Otherwise, derive them from the dataframe and indices.
         """
-        if self.X_train is not None and self.y_train is not None and self.X_test is not None and self.y_test is not None:
+        if self.X_train and self.y_train and self.X_test and self.y_test:
             # Data is already provided, no need to calculate
             logging.info(
                 "X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
-            return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
+            return
+            # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
 
         self.prep()
 
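
A caveat on the rewritten guard: it switches from "is not None" to plain truthiness, but with the fields now typed as np.ndarray, evaluating a multi-element array in boolean context raises ValueError. This is general numpy behaviour, not specific to this package; a minimal demonstration:

    import numpy as np

    x = np.array([1, 2, 3])

    if x is not None:   # identity check: always unambiguous
        pass

    try:
        if x:           # truthiness of a multi-element array
            pass
    except ValueError as exc:
        # "The truth value of an array with more than one element is ambiguous..."
        print(exc)

So callers that pre-populate real (multi-element) arrays will hit the ValueError rather than the skip path.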
@@ -150,24 +151,26 @@ class BaseDataset(BaseModel):
 
         train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
         test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
+        X_train = train.drop(self.class_column, axis=1)
+        X_train = train.drop(self.class_column, axis=1)
+        y_train = train[self.class_column]
+        X_test = test.drop(self.class_column, axis=1)
+        y_test = test[self.class_column]
+
+        self.X_train = np.array(X_train)
+        self.y_train = np.array(y_train)
+        self.X_test = np.array(X_test)
+        self.y_test = np.array(y_test)
 
-        self.X_train = train.drop(self.class_column, axis=1)
-        self.y_train = train[self.class_column]
-        self.X_test = test.drop(self.class_column, axis=1)
-        self.y_test = test[self.class_column]
-
-        self.train_indexes = self.X_train.index
-        self.test_indexes = self.X_test.index
+        self.train_indexes = X_train.index
+        self.test_indexes = X_test.index
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr
 
         # Dataframe to numpy array conversion
-        self.X_train = np.array(self.X_train)
-        self.y_train = np.array(self.y_train)
-        self.X_test = np.array(self.X_test)
-        self.y_test = np.array(self.y_test)
+
 
-        return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
+        # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
 
     def __get_indexes__(self, path):
         train_index_path = path+'/train_indexes.txt'
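
With the return statement commented out, load() now communicates purely through attributes. A short usage sketch of the new calling convention (SomeDataset stands in for a concrete BaseDataset subclass and is hypothetical):

    dataset = SomeDataset(dataset_splitter_type=DatasetSplitter)
    dataset.load()  # populates the attributes in place; the return value is no longer used

    X_train, y_train = dataset.X_train, dataset.y_train
    X_test, y_test = dataset.X_test, dataset.y_test
    train_idx_arr, val_idx_arr = dataset.train_idx_arr, dataset.val_idx_arr

The NerParameterSearch hunk further down makes exactly this migration on the caller side.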
@@ -219,21 +222,16 @@ class BaseDataset(BaseModel):
 
         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
             X=X, y=y)
-        self.X_train = X_train
-        self.X_test = X_test
-        self.y_train = y_train
-        self.y_test = y_test
+        self.X_train = np.array(self.X_train)
+        self.X_test = np.array(self.X_test)
+        self.y_train = np.array(y_train.tolist())
+        self.y_test = np.array(y_test.tolist())
         self.train_indexes = X_train.index
         self.test_indexes = X_test.index
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr
 
-        # Dataframe to numpy array conversion
-        self.X_train = np.array(self.X_train)
-        self.y_train = np.array(self.y_train.tolist())
-        self.X_test = np.array(self.X_test)
-        self.y_test = np.array(self.y_test.tolist())
-
+
         if save_indexes:
             # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
             self.__save_indexes__(
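
Two details of this hunk are worth spelling out. First, the y-side conversions go through .tolist() before np.array; that matters when a label column holds vectors (e.g. multi-hot encodings) rather than scalars, because numpy treats a Series of lists differently from the flattened list-of-lists. Whether the labels here are vectors is an assumption, but the mechanics are general:

    import numpy as np
    import pandas as pd

    y = pd.Series([[0, 1], [1, 0], [1, 1]])  # label cells that are vectors

    a = np.array(y)           # 1-D object array of lists, shape (3,)
    b = np.array(y.tolist())  # proper 2-D numeric array, shape (3, 2)

    print(a.shape, a.dtype)   # (3,) object
    print(b.shape, b.dtype)   # (3, 2) int64 (platform-dependent integer width)

Second, note the asymmetry on the X side: the new lines wrap self.X_train and self.X_test, which are no longer assigned from the freshly unpacked locals X_train and X_test, while the y side does use the locals.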
ddi_fw/ml/ml_helper.py CHANGED
@@ -32,7 +32,7 @@ import ddi_fw.utils as utils
 
 class MultiModalRunner:
     # todo model related parameters to config
-    def __init__(self, library, multi_modal, use_mlflow=True):
+    def __init__(self, library, multi_modal, use_mlflow=False):
         self.library = library
         self.multi_modal = multi_modal
         self.use_mlflow = use_mlflow
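
With this default flip (mirrored by use_mlflow on Pipeline below), MLflow tracking becomes opt-in rather than opt-out. A usage sketch under that assumption (the multi_modal value is a placeholder for the per-modality configuration the runner expects):

    from ddi_fw.ml.ml_helper import MultiModalRunner

    multi_modal_config = [...]  # placeholder: per-modality model settings

    runner = MultiModalRunner(library='tensorflow',
                              multi_modal=multi_modal_config,
                              use_mlflow=True)  # tracking must now be requested explicitly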
@@ -111,44 +111,6 @@ class MultiModalRunner:
         if self.use_mlflow:
             with mlflow.start_run(run_name=self.prefix, description="***") as run:
                 self.__predict(single_results)
-                # self.level_0_run_id = run.info.run_id
-                # item_dict = {t[0]: t for t in self.items}
-                # print("multi_modal")
-                # print(self.multi_modal)
-                # print(item_dict.keys())
-
-                # for m in self.multi_modal:
-                #     name = m.get('name')
-                #     input_type = m.get('input_type')
-                #     input = m.get('input')
-                #     inputs = m.get('inputs')
-                #     model_type = get_import(m.get("model_type"))
-                #     kwargs = m.get('params')
-                #     T = self.__create_model(self.library)
-                #     single_modal = T(self.date, name, model_type, **kwargs)
-                #     if input_type == '1D':
-                #         item = item_dict[input]
-                #         single_modal.set_data(
-                #             self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
-                #     elif input_type == '2D':
-                #         # check keys
-                #         filtered_dict = {k: item_dict[k]
-                #                          for k in inputs if k in item_dict}
-                #         print(filtered_dict.keys())
-                #         first_input = next(iter(filtered_dict.values()))
-                #         train_data_list = [f[1] for f in filtered_dict.values()]
-                #         test_data_list = [f[3] for f in filtered_dict.values()]
-                #         train_data = np.stack(train_data_list, axis=1)
-                #         test_data = np.stack(test_data_list, axis=1)
-                #         train_label = first_input[2]
-                #         test_label = first_input[4]
-                #         single_modal.set_data(
-                #             self.train_idx_arr, self.val_idx_arr, train_data, train_label, test_data, test_label)
-                #     else:
-                #         raise Exception("check configurations")
-                #     logs, metrics, prediction = single_modal.fit_and_evaluate()
-                #     self.result.add_metric(name, metrics)
-                #     single_results[name] = prediction
         else:
             self.__predict(single_results)
         if combinations:
ddi_fw/ml/tensorflow_wrapper.py CHANGED
@@ -120,7 +120,7 @@ class TFModelWrapper(ModelWrapper):
         print(self.train_data.shape)
         models = {}
         models_val_acc = {}
-        if self.train_idx_arr is not None and self.val_idx_arr is not None:
+        if self.train_idx_arr and self.val_idx_arr:
             for i, (train_idx, val_idx) in enumerate(zip(self.train_idx_arr, self.val_idx_arr)):
                 print(f"Validation {i}")
 
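
Unlike the ndarray-typed fields in core.py, the fold containers here are plain lists of index arrays (Optional[List[np.ndarray]] on BaseDataset), so truthiness is well-defined. The practical difference from the old "is not None" checks is that empty fold lists now fall through to the single-fit branch instead of entering a zero-iteration loop:

    train_idx_arr = []            # no cross-validation folds supplied

    if train_idx_arr is not None:
        pass                      # old check: taken even for an empty list

    if train_idx_arr:
        pass                      # new check: skipped for both None and []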
@@ -155,7 +155,8 @@ class TFModelWrapper(ModelWrapper):
                 self.train_data, self.train_label, None, None)
             models[self.descriptor] = model
             models_val_acc[self.descriptor] = checkpoint.best
-
+        if models_val_acc == {}:
+            return model, None
         best_model_key = max(models_val_acc, key=lambda k: models_val_acc[k])
         # best_model_key = max(models_val_acc, key=models_val_acc.get)
         best_model = models[best_model_key]
ddi_fw/pipeline/ner_pipeline.py CHANGED
@@ -106,7 +106,7 @@ class NerParameterSearch:
             **kwargs)
 
         # computing train_idx_arr and val_idx_arr once is actually sufficient
-        X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = dataset.load()
+        dataset.load()
         group_items = dataset.produce_inputs()
         for item in group_items:
             # item[0] = f'threshold_{threshold}_{item[0]}'
@@ -115,8 +115,8 @@ class NerParameterSearch:
 
         self.items.extend(group_items)
         self.y_test_label = self.items[0][4]
-        self.train_idx_arr = train_idx_arr
-        self.val_idx_arr = val_idx_arr
+        self.train_idx_arr = dataset.train_idx_arr
+        self.val_idx_arr = dataset.val_idx_arr
 
     def run(self, model_func, batch_size=128, epochs=100):
         mlflow.set_tracking_uri(self.tracking_uri)
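
This is the caller-side half of the load() signature change: the fold indices are now read off the dataset object instead of the old eight-element return tuple. Restated as a sketch, with the item layout taken from produce_inputs in core.py:

    dataset.load()                    # populates attributes; returns nothing useful here
    items = dataset.produce_inputs()  # each item: [name, X_train, y_train, X_test, y_test]

    y_test_label = items[0][4]
    train_idx_arr = dataset.train_idx_arr
    val_idx_arr = dataset.val_idx_arr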
ddi_fw/pipeline/pipeline.py CHANGED
@@ -36,7 +36,7 @@ class Pipeline(BaseModel):
     combinations: Optional[List[str]] = None
     model: Optional[Any] = None
     multi_modal: Optional[Any] = None
-    use_mlflow: bool = True
+    use_mlflow: bool = False
     _items:List=[]
     _train_idx_arr:List|None=[]
     _val_idx_arr:List|None=[]
@@ -53,45 +53,7 @@ class Pipeline(BaseModel):
 
     class Config:
         arbitrary_types_allowed = True
-
-    # class Pipeline:
-    #     def __init__(self,
-    #                  library='tensorflow',
-    #                  experiment_name=None,
-    #                  experiment_description=None,
-    #                  experiment_tags=None,
-    #                  artifact_location=None,
-    #                  tracking_uri=None,
-    #                  dataset_type: BaseDataset = None,
-    #                  columns=None,
-    #                  embedding_dict=None,
-    #                  column_embedding_configs=None,
-    #                  vector_db_persist_directory=None,
-    #                  vector_db_collection_name=None,
-    #                  embedding_pooling_strategy_type: PoolingStrategy = None,
-    #                  ner_data_file=None,
-    #                  ner_threshold=None,
-    #                  combinations=None,
-    #                  model=None,
-    #                  multi_modal = None ):
-    #         self.library = library
-    #         self.experiment_name = experiment_name
-    #         self.experiment_description = experiment_description
-    #         self.experiment_tags = experiment_tags
-    #         self.artifact_location = artifact_location
-    #         self.tracking_uri = tracking_uri
-    #         self.dataset_type = dataset_type
-    #         self.columns = columns
-    #         self.embedding_dict = embedding_dict
-    #         self.column_embedding_configs = column_embedding_configs
-    #         self.vector_db_persist_directory = vector_db_persist_directory
-    #         self.vector_db_collection_name = vector_db_collection_name
-    #         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-    #         self.ner_data_file = ner_data_file
-    #         self.ner_threshold = ner_threshold
-    #         self.combinations = combinations
-    #         self.model = model
-    #         self.multi_modal = multi_modal
+
 
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
@@ -194,35 +156,27 @@ class Pipeline(BaseModel):
                 embedding_dict=embedding_dict,
                 embedding_size=embedding_size,
                 embeddings_pooling_strategy=pooling_strategy,
-                dataset_splitter = dataset_splitter,
+                dataset_splitter_type = self.dataset_splitter_type,
+                **kwargs)
+        elif self.dataset_type == BaseDataset:
+            dataset = self.dataset_type(
+                dataset_splitter_type = self.dataset_splitter_type,
                 **kwargs)
         else:
             dataset = self.dataset_type(**kwargs)
 
         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
         dataset.load()
-
         dataframe = dataset.dataframe
+        b = not( dataset.X_train and dataset.y_train and dataset.X_test and dataset.y_test)
+        c = dataframe is None or dataframe.empty
 
-        if dataframe is None: # if the dataframe is None, it means that the dataset is not loaded
+        if b or c:
             raise ValueError("The dataset is not loaded")
-
-        # dataframe.dropna()
-        # X_train = dataset.X_train
-        # X_test = dataset.X_test
-        # y_train = dataset.y_train
-        # y_test = dataset.y_test
-        # self._train_idx_arr = dataset.train_idx_arr
-        # self._val_idx_arr = dataset.val_idx_arr
-        # Logic to set up the experiment
+
         # column name, train data, train label, test data, test label
         self._items = dataset.produce_inputs()
-
-        # unique_classes = pd.unique(dataframe[dataset.class_column])
-        # event_num = len(unique_classes)
-        # droprate = 0.3
-        # vector_size = self.dataset.drugs_df.shape[0]
-
+
         print("Building the experiment with the following settings:")
         print(
             f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
@@ -244,7 +198,7 @@ class Pipeline(BaseModel):
 
         y_test_label = self.items[0][4]
         multi_modal_runner = MultiModalRunner(
-            library=self.library, multi_modal=self.multi_modal)
+            library=self.library, multi_modal=self.multi_modal, use_mlflow=self.use_mlflow)
         # multi_modal_runner = MultiModalRunner(
         #     library=self.library, model_func=model_func, batch_size=batch_size, epochs=epochs)
         # multi_modal = TFMultiModal(
ddi_fw-0.0.185.dist-info/METADATA → ddi_fw-0.0.187.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.185
+Version: 0.0.187
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
ddi_fw-0.0.185.dist-info/RECORD → ddi_fw-0.0.187.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=FYFKzKpaPqRXl5UqLoD6DNZEaJEAgxLlfCPtHULjc_s,10865
+ddi_fw/datasets/core.py,sha256=mZcGqP3Ukx5FbYSMi08uq4vYDr7jbHR3xg1qOPJmU0s,10640
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
@@ -74,18 +74,18 @@ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSz
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
-ddi_fw/ml/ml_helper.py,sha256=vT_k-m0wGWQOufBYP4qLGdJThMkFh9046UqjoaEJ3Pc,8549
+ddi_fw/ml/ml_helper.py,sha256=l1ZLYL3x5bHxD2bh2ezEgWDlV0ni8zGZGgj07x7KR40,6310
 ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
 ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
-ddi_fw/ml/tensorflow_wrapper.py,sha256=xX_rP6nzB2yQiNvGP9_PbbQt1bXiPPWEozIzpbV6Ens,12911
+ddi_fw/ml/tensorflow_wrapper.py,sha256=-zcbd0LBg9QNMF9K1I-JC379cS3rTO7ibgsDIOnMsoc,12951
 ddi_fw/ner/__init__.py,sha256=JwhGXrepomxPSsGsg2b_xPRC72AjvxOIn2CW5Mvscn0,26
 ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6859
 ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
 ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
 ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
-ddi_fw/pipeline/ner_pipeline.py,sha256=kNGtkg5rNX5MDywzvRxmvyk-DxXAjEbYzZkp8pNlAZo,6023
-ddi_fw/pipeline/pipeline.py,sha256=11CgBgNxzo1KqKWudezSM2iFruoUVG-JMNbwznvt1KA,11362
+ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
+ddi_fw/pipeline/pipeline.py,sha256=VSILkxot_O1DJMWPavzFUH3le4zVKQydcH32SbuHZlQ,9355
 ddi_fw/utils/__init__.py,sha256=bqIC0YjbD0YSHtO0nWUkRs4w5nu7qBV0yU72sRzwCj8,475
 ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
@@ -98,7 +98,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.185.dist-info/METADATA,sha256=cafBi3CO83LDvyYHTP5qXVRTkBYXTdHKhDu_r5Fki5E,2542
-ddi_fw-0.0.185.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-ddi_fw-0.0.185.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.185.dist-info/RECORD,,
+ddi_fw-0.0.187.dist-info/METADATA,sha256=dzH9YAqsPxQcvuS9h0JRNx5qtd8vGNr-1c5f0uE3c7M,2542
+ddi_fw-0.0.187.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.187.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.187.dist-info/RECORD,,