ddi-fw 0.0.186-py3-none-any.whl → 0.0.188-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ddi_fw/datasets/core.py CHANGED
@@ -60,10 +60,10 @@ class BaseDataset(BaseModel):
     dataset_splitter_type: Type[DatasetSplitter]
     class_column: str = 'class'
     dataframe: Optional[pd.DataFrame] = None
-    X_train: Optional[pd.DataFrame | np.ndarray] = None
-    X_test: Optional[pd.DataFrame | np.ndarray] = None
-    y_train: Optional[pd.Series | np.ndarray] = None
-    y_test: Optional[pd.Series | np.ndarray] = None
+    X_train: Optional[np.ndarray] = None
+    X_test: Optional[np.ndarray] = None
+    y_train: Optional[np.ndarray] = None
+    y_test: Optional[np.ndarray] = None
     train_indexes: Optional[pd.Index] = None
     test_indexes: Optional[pd.Index] = None
     train_idx_arr: Optional[List[np.ndarray]] = None
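
Note on this hunk: the split fields are narrowed from `pd.DataFrame | np.ndarray` / `pd.Series | np.ndarray` to plain `np.ndarray`, so the model now commits to NumPy arrays after loading. A minimal standalone sketch of the pattern (the `SplitData` model below is hypothetical, not the package's class):

    # Hypothetical model illustrating NumPy-typed pydantic fields.
    from typing import Optional

    import numpy as np
    from pydantic import BaseModel

    class SplitData(BaseModel):
        X_train: Optional[np.ndarray] = None
        y_train: Optional[np.ndarray] = None

        class Config:
            # Required: ndarray is not a native pydantic type.
            arbitrary_types_allowed = True

    data = SplitData(X_train=np.zeros((4, 2)), y_train=np.array([0, 1, 0, 1]))
    print(data.X_train.shape)  # (4, 2)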
@@ -81,7 +81,7 @@ class BaseDataset(BaseModel):
             self.y_train), np.array(self.y_test)

         if self.columns is None or len(self.columns) == 0:
-            items.append([f'defaukt', np.nan_to_num(self.X_train),
+            items.append([f'default', np.nan_to_num(self.X_train),
                           y_train_label, np.nan_to_num(self.X_test), y_test_label])
         else:
             for index, column in enumerate(self.columns):
@@ -127,11 +127,12 @@ class BaseDataset(BaseModel):
         Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
         skip deriving them. Otherwise, derive them from the dataframe and indices.
         """
-        if self.X_train is not None and self.y_train is not None and self.X_test is not None and self.y_test is not None:
+        if self.X_train and self.y_train and self.X_test and self.y_test:
             # Data is already provided, no need to calculate
             logging.info(
                 "X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
-            return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
+            return
+            # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr

         self.prep()

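One behavioral detail of the new guard: `is not None` and plain truthiness are not interchangeable once these fields hold NumPy arrays, because truth-testing a multi-element array raises. A quick standalone illustration:

    import numpy as np

    x = np.array([1.0, 2.0])
    print(x is not None)  # True: the old, explicit check

    try:
        bool(x)  # what `if self.X_train and ...` evaluates per operand
    except ValueError as e:
        print(e)  # "The truth value of an array with more than one element is ambiguous..."
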
@@ -150,24 +151,26 @@ class BaseDataset(BaseModel):

         train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
         test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
+        X_train = train.drop(self.class_column, axis=1)
+        X_train = train.drop(self.class_column, axis=1)
+        y_train = train[self.class_column]
+        X_test = test.drop(self.class_column, axis=1)
+        y_test = test[self.class_column]
+
+        self.X_train = np.array(X_train)
+        self.y_train = np.array(y_train)
+        self.X_test = np.array(X_test)
+        self.y_test = np.array(y_test)

-        self.X_train = train.drop(self.class_column, axis=1)
-        self.y_train = train[self.class_column]
-        self.X_test = test.drop(self.class_column, axis=1)
-        self.y_test = test[self.class_column]
-
-        self.train_indexes = self.X_train.index
-        self.test_indexes = self.X_test.index
+        self.train_indexes = X_train.index
+        self.test_indexes = X_test.index
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr

         # Dataframe to numpy array conversion
-        self.X_train = np.array(self.X_train)
-        self.y_train = np.array(self.y_train)
-        self.X_test = np.array(self.X_test)
-        self.y_test = np.array(self.y_test)
+

-        return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
+        # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr

     def __get_indexes__(self, path):
         train_index_path = path+'/train_indexes.txt'
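
The reordering keeps the pandas objects alive long enough to read their indexes before the fields are overwritten with arrays. A small sketch of the same derive-then-convert pattern, using a hypothetical toy frame in place of `self.dataframe`:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"feat": [0.1, 0.2, 0.3], "class": [0, 1, 0]}, index=[10, 11, 12])
    class_column = "class"

    X_train = df.drop(class_column, axis=1)   # features as a DataFrame
    y_train = df[class_column]                # labels as a Series

    train_indexes = X_train.index             # captured while still pandas
    X_train_arr = np.array(X_train)           # then converted to arrays
    y_train_arr = np.array(y_train)
    print(X_train_arr.shape, list(train_indexes))  # (3, 1) [10, 11, 12]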
@@ -219,21 +222,16 @@ class BaseDataset(BaseModel):

         X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
             X=X, y=y)
-        self.X_train = X_train
-        self.X_test = X_test
-        self.y_train = y_train
-        self.y_test = y_test
+        self.X_train = np.array(X_train)
+        self.X_test = np.array(X_test)
+        self.y_train = np.array(y_train.tolist())
+        self.y_test = np.array(y_test.tolist())
         self.train_indexes = X_train.index
         self.test_indexes = X_test.index
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr

-        # Dataframe to numpy array conversion
-        self.X_train = np.array(self.X_train)
-        self.y_train = np.array(self.y_train.tolist())
-        self.X_test = np.array(self.X_test)
-        self.y_test = np.array(self.y_test.tolist())
-
+
         if save_indexes:
             # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
             self.__save_indexes__(
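
The `tolist()` round-trip matters when the label column holds sequence-valued cells (e.g., per-row one-hot vectors): `np.array(series)` alone yields a 1-D object array, while `np.array(series.tolist())` stacks the rows into a proper 2-D array. A hypothetical illustration:

    import numpy as np
    import pandas as pd

    y = pd.Series([[1, 0], [0, 1], [1, 0]])       # hypothetical one-hot labels

    print(np.array(y).dtype, np.array(y).shape)   # object (3,)
    print(np.array(y.tolist()).shape)             # (3, 2)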
ddi_fw/ml/ml_helper.py CHANGED
@@ -111,44 +111,6 @@ class MultiModalRunner:
         if self.use_mlflow:
             with mlflow.start_run(run_name=self.prefix, description="***") as run:
                 self.__predict(single_results)
-                # self.level_0_run_id = run.info.run_id
-                # item_dict = {t[0]: t for t in self.items}
-                # print("multi_modal")
-                # print(self.multi_modal)
-                # print(item_dict.keys())
-
-                # for m in self.multi_modal:
-                #     name = m.get('name')
-                #     input_type = m.get('input_type')
-                #     input = m.get('input')
-                #     inputs = m.get('inputs')
-                #     model_type = get_import(m.get("model_type"))
-                #     kwargs = m.get('params')
-                #     T = self.__create_model(self.library)
-                #     single_modal = T(self.date, name, model_type, **kwargs)
-                #     if input_type == '1D':
-                #         item = item_dict[input]
-                #         single_modal.set_data(
-                #             self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
-                #     elif input_type == '2D':
-                #         # check keys
-                #         filtered_dict = {k: item_dict[k]
-                #                          for k in inputs if k in item_dict}
-                #         print(filtered_dict.keys())
-                #         first_input = next(iter(filtered_dict.values()))
-                #         train_data_list = [f[1] for f in filtered_dict.values()]
-                #         test_data_list = [f[3] for f in filtered_dict.values()]
-                #         train_data = np.stack(train_data_list, axis=1)
-                #         test_data = np.stack(test_data_list, axis=1)
-                #         train_label = first_input[2]
-                #         test_label = first_input[4]
-                #         single_modal.set_data(
-                #             self.train_idx_arr, self.val_idx_arr, train_data, train_label, test_data, test_label)
-                #     else:
-                #         raise Exception("check configurations")
-                #     logs, metrics, prediction = single_modal.fit_and_evaluate()
-                #     self.result.add_metric(name, metrics)
-                #     single_results[name] = prediction
         else:
             self.__predict(single_results)
         if combinations:
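
With the commented-out block removed, the method reduces to an optional-tracking pattern: run the prediction inside an MLflow run only when tracking is enabled. A minimal sketch of that control flow, with a hypothetical `predict` callable standing in for `self.__predict`:

    import mlflow

    def run_with_optional_tracking(predict, use_mlflow: bool, run_name: str):
        # Wrap the work in an MLflow run only when tracking is on.
        if use_mlflow:
            with mlflow.start_run(run_name=run_name, description="***"):
                return predict()
        return predict()

    # run_with_optional_tracking(lambda: {"acc": 0.9}, use_mlflow=False, run_name="demo")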
ddi_fw/pipeline/pipeline.py CHANGED
@@ -36,63 +36,26 @@ class Pipeline(BaseModel):
     combinations: Optional[List[str]] = None
     model: Optional[Any] = None
     multi_modal: Optional[Any] = None
-    use_mlflow: bool = True
-    _items:List=[]
-    _train_idx_arr:List|None=[]
-    _val_idx_arr:List|None=[]
-
+    use_mlflow: bool = False
+    _items: List = []
+    _train_idx_arr: List | None = []
+    _val_idx_arr: List | None = []
+
     @property
     def items(self) -> List:
         return self._items
+
     @property
-    def train_idx_arr(self) -> List|None:
+    def train_idx_arr(self) -> List | None:
         return self._train_idx_arr
+
     @property
-    def val_idx_arr(self) -> List|None:
+    def val_idx_arr(self) -> List | None:
         return self._val_idx_arr

     class Config:
         arbitrary_types_allowed = True

-    # class Pipeline:
-    #     def __init__(self,
-    #                  library='tensorflow',
-    #                  experiment_name=None,
-    #                  experiment_description=None,
-    #                  experiment_tags=None,
-    #                  artifact_location=None,
-    #                  tracking_uri=None,
-    #                  dataset_type: BaseDataset = None,
-    #                  columns=None,
-    #                  embedding_dict=None,
-    #                  column_embedding_configs=None,
-    #                  vector_db_persist_directory=None,
-    #                  vector_db_collection_name=None,
-    #                  embedding_pooling_strategy_type: PoolingStrategy = None,
-    #                  ner_data_file=None,
-    #                  ner_threshold=None,
-    #                  combinations=None,
-    #                  model=None,
-    #                  multi_modal = None ):
-    #         self.library = library
-    #         self.experiment_name = experiment_name
-    #         self.experiment_description = experiment_description
-    #         self.experiment_tags = experiment_tags
-    #         self.artifact_location = artifact_location
-    #         self.tracking_uri = tracking_uri
-    #         self.dataset_type = dataset_type
-    #         self.columns = columns
-    #         self.embedding_dict = embedding_dict
-    #         self.column_embedding_configs = column_embedding_configs
-    #         self.vector_db_persist_directory = vector_db_persist_directory
-    #         self.vector_db_collection_name = vector_db_collection_name
-    #         self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
-    #         self.ner_data_file = ner_data_file
-    #         self.ner_threshold = ner_threshold
-    #         self.combinations = combinations
-    #         self.model = model
-    #         self.multi_modal = multi_modal
-
     def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
         """
         Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
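
Because `use_mlflow` now defaults to `False`, tracking is opt-in: callers that relied on the old `True` default must pass it explicitly. An illustrative construction, where `MyDataset` and `MySplitter` are hypothetical placeholders for a project's concrete `BaseDataset` and `DatasetSplitter` subclasses:

    pipeline = Pipeline(
        experiment_name="ddi-demo",          # hypothetical experiment name
        dataset_type=MyDataset,              # placeholder dataset class
        dataset_splitter_type=MySplitter,    # placeholder splitter class
        use_mlflow=True,                     # tracking must now be enabled explicitly
    )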
@@ -184,45 +147,45 @@ class Pipeline(BaseModel):
         #     filename=self.ner_data_file) if self.ner_data_file else None

         dataset_splitter = self.dataset_splitter_type()
-
+
         if issubclass(self.dataset_type, TextDatasetMixin):
             key, value = next(iter(embedding_dict.items()))
             embedding_size = value[next(iter(value))][0].shape[0]
-            pooling_strategy = self.embedding_pooling_strategy_type() if self.embedding_pooling_strategy_type else None
+            pooling_strategy = self.embedding_pooling_strategy_type(
+            ) if self.embedding_pooling_strategy_type else None

             dataset = self.dataset_type(
                 embedding_dict=embedding_dict,
                 embedding_size=embedding_size,
                 embeddings_pooling_strategy=pooling_strategy,
-                dataset_splitter = dataset_splitter,
+                dataset_splitter_type=self.dataset_splitter_type,
+                **kwargs)
+        elif self.dataset_type == BaseDataset:
+            dataset = self.dataset_type(
+                dataset_splitter_type=self.dataset_splitter_type,
                 **kwargs)
         else:
             dataset = self.dataset_type(**kwargs)

         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
         dataset.load()
-
         dataframe = dataset.dataframe

-        if dataframe is None:  # if the dataframe is None, it means that the dataset is not loaded
+        # Check if any of the arrays are None or empty
+        is_data_valid = (dataset.X_train is not None and dataset.X_train.size > 0 and
+                         dataset.y_train is not None and dataset.y_train.size > 0 and
+                         dataset.X_test is not None and dataset.X_test.size > 0 and
+                         dataset.y_test is not None and dataset.y_test.size > 0)
+
+        # Check if the dataframe is None or empty
+        is_dataframe_valid = dataframe is not None and not dataframe.empty
+
+        if not (is_data_valid and is_dataframe_valid):
             raise ValueError("The dataset is not loaded")

-        # dataframe.dropna()
-        # X_train = dataset.X_train
-        # X_test = dataset.X_test
-        # y_train = dataset.y_train
-        # y_test = dataset.y_test
-        # self._train_idx_arr = dataset.train_idx_arr
-        # self._val_idx_arr = dataset.val_idx_arr
-        # Logic to set up the experiment
         # column name, train data, train label, test data, test label
         self._items = dataset.produce_inputs()

-        # unique_classes = pd.unique(dataframe[dataset.class_column])
-        # event_num = len(unique_classes)
-        # droprate = 0.3
-        # vector_size = self.dataset.drugs_df.shape[0]
-
         print("Building the experiment with the following settings:")
         print(
             f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
ddi_fw-0.0.186.dist-info/METADATA → ddi_fw-0.0.188.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ddi_fw
-Version: 0.0.186
+Version: 0.0.188
 Summary: Do not use :)
 Author-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
 Maintainer-email: Kıvanç Bayraktar <bayraktarkivanc@gmail.com>
ddi_fw-0.0.186.dist-info/RECORD → ddi_fw-0.0.188.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=FYFKzKpaPqRXl5UqLoD6DNZEaJEAgxLlfCPtHULjc_s,10865
+ddi_fw/datasets/core.py,sha256=eKPbntiDhqpqaV1SlrPmuSUq_9i_5INlnJuAlwj61Nk,10630
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
@@ -74,7 +74,7 @@ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSz
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
-ddi_fw/ml/ml_helper.py,sha256=MSxdr3UpS5qFJN7TWdXDaNwBfYjzMXp7cHs2PWTpX6o,8550
+ddi_fw/ml/ml_helper.py,sha256=l1ZLYL3x5bHxD2bh2ezEgWDlV0ni8zGZGgj07x7KR40,6310
 ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
 ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
 ddi_fw/ml/tensorflow_wrapper.py,sha256=-zcbd0LBg9QNMF9K1I-JC379cS3rTO7ibgsDIOnMsoc,12951
@@ -85,7 +85,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
 ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
 ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
 ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
-ddi_fw/pipeline/pipeline.py,sha256=uMpkZnqEzH5rQDkgySdDKPzfMKfXNyO0QCsFVKUfrJ4,11390
+ddi_fw/pipeline/pipeline.py,sha256=dCXZuXOlW74ZO0e_OhS9OX0dqI9abj7CQz_lkKrDIWY,9787
 ddi_fw/utils/__init__.py,sha256=bqIC0YjbD0YSHtO0nWUkRs4w5nu7qBV0yU72sRzwCj8,475
 ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
@@ -98,7 +98,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.186.dist-info/METADATA,sha256=a8oR_ifI2j--Lmx1432hOdmHWwDvXI-j-fJ-301IWVE,2542
-ddi_fw-0.0.186.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-ddi_fw-0.0.186.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
-ddi_fw-0.0.186.dist-info/RECORD,,
+ddi_fw-0.0.188.dist-info/METADATA,sha256=SRAoTA4fu0suxghXx5okr-RsfC512VEotrkTCUeXBck,2542
+ddi_fw-0.0.188.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.188.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.188.dist-info/RECORD,,