ddi-fw 0.0.186__py3-none-any.whl → 0.0.187__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +27 -29
- ddi_fw/ml/ml_helper.py +0 -38
- ddi_fw/pipeline/pipeline.py +12 -58
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.187.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.187.dist-info}/RECORD +7 -7
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.187.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.187.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -60,10 +60,10 @@ class BaseDataset(BaseModel):
|
|
60
60
|
dataset_splitter_type: Type[DatasetSplitter]
|
61
61
|
class_column: str = 'class'
|
62
62
|
dataframe: Optional[pd.DataFrame] = None
|
63
|
-
X_train: Optional[
|
64
|
-
X_test: Optional[
|
65
|
-
y_train: Optional[
|
66
|
-
y_test: Optional[
|
63
|
+
X_train: Optional[np.ndarray] = None
|
64
|
+
X_test: Optional[np.ndarray] = None
|
65
|
+
y_train: Optional[np.ndarray] = None
|
66
|
+
y_test: Optional[np.ndarray] = None
|
67
67
|
train_indexes: Optional[pd.Index] = None
|
68
68
|
test_indexes: Optional[pd.Index] = None
|
69
69
|
train_idx_arr: Optional[List[np.ndarray]] = None
|
@@ -81,7 +81,7 @@ class BaseDataset(BaseModel):
|
|
81
81
|
self.y_train), np.array(self.y_test)
|
82
82
|
|
83
83
|
if self.columns is None or len(self.columns) == 0:
|
84
|
-
items.append([f'
|
84
|
+
items.append([f'default', np.nan_to_num(self.X_train),
|
85
85
|
y_train_label, np.nan_to_num(self.X_test), y_test_label])
|
86
86
|
else:
|
87
87
|
for index, column in enumerate(self.columns):
|
@@ -127,11 +127,12 @@ class BaseDataset(BaseModel):
|
|
127
127
|
Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
|
128
128
|
skip deriving them. Otherwise, derive them from the dataframe and indices.
|
129
129
|
"""
|
130
|
-
if self.X_train
|
130
|
+
if self.X_train and self.y_train and self.X_test and self.y_test :
|
131
131
|
# Data is already provided, no need to calculate
|
132
132
|
logging.info(
|
133
133
|
"X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
|
134
|
-
return
|
134
|
+
return
|
135
|
+
# return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
135
136
|
|
136
137
|
self.prep()
|
137
138
|
|
@@ -150,24 +151,26 @@ class BaseDataset(BaseModel):
|
|
150
151
|
|
151
152
|
train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
|
152
153
|
test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
|
154
|
+
X_train = train.drop(self.class_column, axis=1)
|
155
|
+
X_train = train.drop(self.class_column, axis=1)
|
156
|
+
y_train = train[self.class_column]
|
157
|
+
X_test = test.drop(self.class_column, axis=1)
|
158
|
+
y_test = test[self.class_column]
|
159
|
+
|
160
|
+
self.X_train = np.array(X_train)
|
161
|
+
self.y_train = np.array(y_train)
|
162
|
+
self.X_test = np.array(X_test)
|
163
|
+
self.y_test = np.array(y_test)
|
153
164
|
|
154
|
-
self.
|
155
|
-
self.
|
156
|
-
self.X_test = test.drop(self.class_column, axis=1)
|
157
|
-
self.y_test = test[self.class_column]
|
158
|
-
|
159
|
-
self.train_indexes = self.X_train.index
|
160
|
-
self.test_indexes = self.X_test.index
|
165
|
+
self.train_indexes = X_train.index
|
166
|
+
self.test_indexes = X_test.index
|
161
167
|
self.train_idx_arr = train_idx_arr
|
162
168
|
self.val_idx_arr = val_idx_arr
|
163
169
|
|
164
170
|
# Dataframe to numpy array conversion
|
165
|
-
|
166
|
-
self.y_train = np.array(self.y_train)
|
167
|
-
self.X_test = np.array(self.X_test)
|
168
|
-
self.y_test = np.array(self.y_test)
|
171
|
+
|
169
172
|
|
170
|
-
return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
173
|
+
# return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
171
174
|
|
172
175
|
def __get_indexes__(self, path):
|
173
176
|
train_index_path = path+'/train_indexes.txt'
|
@@ -219,21 +222,16 @@ class BaseDataset(BaseModel):
|
|
219
222
|
|
220
223
|
X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
|
221
224
|
X=X, y=y)
|
222
|
-
self.X_train = X_train
|
223
|
-
self.X_test = X_test
|
224
|
-
self.y_train =
|
225
|
-
self.y_test = y_test
|
225
|
+
self.X_train = np.array(self.X_train)
|
226
|
+
self.X_test = np.array(self.X_test)
|
227
|
+
self.y_train = np.array(y_train.tolist())
|
228
|
+
self.y_test = np.array(y_test.tolist())
|
226
229
|
self.train_indexes = X_train.index
|
227
230
|
self.test_indexes = X_test.index
|
228
231
|
self.train_idx_arr = train_idx_arr
|
229
232
|
self.val_idx_arr = val_idx_arr
|
230
233
|
|
231
|
-
|
232
|
-
self.X_train = np.array(self.X_train)
|
233
|
-
self.y_train = np.array(self.y_train.tolist())
|
234
|
-
self.X_test = np.array(self.X_test)
|
235
|
-
self.y_test = np.array(self.y_test.tolist())
|
236
|
-
|
234
|
+
|
237
235
|
if save_indexes:
|
238
236
|
# train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
|
239
237
|
self.__save_indexes__(
|
ddi_fw/ml/ml_helper.py
CHANGED
@@ -111,44 +111,6 @@ class MultiModalRunner:
|
|
111
111
|
if self.use_mlflow:
|
112
112
|
with mlflow.start_run(run_name=self.prefix, description="***") as run:
|
113
113
|
self.__predict(single_results)
|
114
|
-
# self.level_0_run_id = run.info.run_id
|
115
|
-
# item_dict = {t[0]: t for t in self.items}
|
116
|
-
# print("multi_modal")
|
117
|
-
# print(self.multi_modal)
|
118
|
-
# print(item_dict.keys())
|
119
|
-
|
120
|
-
# for m in self.multi_modal:
|
121
|
-
# name = m.get('name')
|
122
|
-
# input_type = m.get('input_type')
|
123
|
-
# input = m.get('input')
|
124
|
-
# inputs = m.get('inputs')
|
125
|
-
# model_type = get_import(m.get("model_type"))
|
126
|
-
# kwargs = m.get('params')
|
127
|
-
# T = self.__create_model(self.library)
|
128
|
-
# single_modal = T(self.date, name, model_type, **kwargs)
|
129
|
-
# if input_type == '1D':
|
130
|
-
# item = item_dict[input]
|
131
|
-
# single_modal.set_data(
|
132
|
-
# self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
|
133
|
-
# elif input_type == '2D':
|
134
|
-
# # check keys
|
135
|
-
# filtered_dict = {k: item_dict[k]
|
136
|
-
# for k in inputs if k in item_dict}
|
137
|
-
# print(filtered_dict.keys())
|
138
|
-
# first_input = next(iter(filtered_dict.values()))
|
139
|
-
# train_data_list = [f[1] for f in filtered_dict.values()]
|
140
|
-
# test_data_list = [f[3] for f in filtered_dict.values()]
|
141
|
-
# train_data = np.stack(train_data_list, axis=1)
|
142
|
-
# test_data = np.stack(test_data_list, axis=1)
|
143
|
-
# train_label = first_input[2]
|
144
|
-
# test_label = first_input[4]
|
145
|
-
# single_modal.set_data(
|
146
|
-
# self.train_idx_arr, self.val_idx_arr, train_data, train_label, test_data, test_label)
|
147
|
-
# else:
|
148
|
-
# raise Exception("check configurations")
|
149
|
-
# logs, metrics, prediction = single_modal.fit_and_evaluate()
|
150
|
-
# self.result.add_metric(name, metrics)
|
151
|
-
# single_results[name] = prediction
|
152
114
|
else:
|
153
115
|
self.__predict(single_results)
|
154
116
|
if combinations:
|
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -36,7 +36,7 @@ class Pipeline(BaseModel):
|
|
36
36
|
combinations: Optional[List[str]] = None
|
37
37
|
model: Optional[Any] = None
|
38
38
|
multi_modal: Optional[Any] = None
|
39
|
-
use_mlflow: bool =
|
39
|
+
use_mlflow: bool = False
|
40
40
|
_items:List=[]
|
41
41
|
_train_idx_arr:List|None=[]
|
42
42
|
_val_idx_arr:List|None=[]
|
@@ -53,45 +53,7 @@ class Pipeline(BaseModel):
|
|
53
53
|
|
54
54
|
class Config:
|
55
55
|
arbitrary_types_allowed = True
|
56
|
-
|
57
|
-
# class Pipeline:
|
58
|
-
# def __init__(self,
|
59
|
-
# library='tensorflow',
|
60
|
-
# experiment_name=None,
|
61
|
-
# experiment_description=None,
|
62
|
-
# experiment_tags=None,
|
63
|
-
# artifact_location=None,
|
64
|
-
# tracking_uri=None,
|
65
|
-
# dataset_type: BaseDataset = None,
|
66
|
-
# columns=None,
|
67
|
-
# embedding_dict=None,
|
68
|
-
# column_embedding_configs=None,
|
69
|
-
# vector_db_persist_directory=None,
|
70
|
-
# vector_db_collection_name=None,
|
71
|
-
# embedding_pooling_strategy_type: PoolingStrategy = None,
|
72
|
-
# ner_data_file=None,
|
73
|
-
# ner_threshold=None,
|
74
|
-
# combinations=None,
|
75
|
-
# model=None,
|
76
|
-
# multi_modal = None ):
|
77
|
-
# self.library = library
|
78
|
-
# self.experiment_name = experiment_name
|
79
|
-
# self.experiment_description = experiment_description
|
80
|
-
# self.experiment_tags = experiment_tags
|
81
|
-
# self.artifact_location = artifact_location
|
82
|
-
# self.tracking_uri = tracking_uri
|
83
|
-
# self.dataset_type = dataset_type
|
84
|
-
# self.columns = columns
|
85
|
-
# self.embedding_dict = embedding_dict
|
86
|
-
# self.column_embedding_configs = column_embedding_configs
|
87
|
-
# self.vector_db_persist_directory = vector_db_persist_directory
|
88
|
-
# self.vector_db_collection_name = vector_db_collection_name
|
89
|
-
# self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
|
90
|
-
# self.ner_data_file = ner_data_file
|
91
|
-
# self.ner_threshold = ner_threshold
|
92
|
-
# self.combinations = combinations
|
93
|
-
# self.model = model
|
94
|
-
# self.multi_modal = multi_modal
|
56
|
+
|
95
57
|
|
96
58
|
def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
|
97
59
|
"""
|
@@ -194,35 +156,27 @@ class Pipeline(BaseModel):
|
|
194
156
|
embedding_dict=embedding_dict,
|
195
157
|
embedding_size=embedding_size,
|
196
158
|
embeddings_pooling_strategy=pooling_strategy,
|
197
|
-
|
159
|
+
dataset_splitter_type = self.dataset_splitter_type,
|
160
|
+
**kwargs)
|
161
|
+
elif self.dataset_type == BaseDataset:
|
162
|
+
dataset = self.dataset_type(
|
163
|
+
dataset_splitter_type = self.dataset_splitter_type,
|
198
164
|
**kwargs)
|
199
165
|
else:
|
200
166
|
dataset = self.dataset_type(**kwargs)
|
201
167
|
|
202
168
|
# X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
|
203
169
|
dataset.load()
|
204
|
-
|
205
170
|
dataframe = dataset.dataframe
|
171
|
+
b = not( dataset.X_train and dataset.y_train and dataset.X_test and dataset.y_test)
|
172
|
+
c = dataframe is None or dataframe.empty
|
206
173
|
|
207
|
-
if
|
174
|
+
if b or c:
|
208
175
|
raise ValueError("The dataset is not loaded")
|
209
|
-
|
210
|
-
# dataframe.dropna()
|
211
|
-
# X_train = dataset.X_train
|
212
|
-
# X_test = dataset.X_test
|
213
|
-
# y_train = dataset.y_train
|
214
|
-
# y_test = dataset.y_test
|
215
|
-
# self._train_idx_arr = dataset.train_idx_arr
|
216
|
-
# self._val_idx_arr = dataset.val_idx_arr
|
217
|
-
# Logic to set up the experiment
|
176
|
+
|
218
177
|
# column name, train data, train label, test data, test label
|
219
178
|
self._items = dataset.produce_inputs()
|
220
|
-
|
221
|
-
# unique_classes = pd.unique(dataframe[dataset.class_column])
|
222
|
-
# event_num = len(unique_classes)
|
223
|
-
# droprate = 0.3
|
224
|
-
# vector_size = self.dataset.drugs_df.shape[0]
|
225
|
-
|
179
|
+
|
226
180
|
print("Building the experiment with the following settings:")
|
227
181
|
print(
|
228
182
|
f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
|
@@ -1,5 +1,5 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=mZcGqP3Ukx5FbYSMi08uq4vYDr7jbHR3xg1qOPJmU0s,10640
|
3
3
|
ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
|
4
4
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
5
5
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
@@ -74,7 +74,7 @@ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSz
|
|
74
74
|
ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
|
75
75
|
ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
|
76
76
|
ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
|
77
|
-
ddi_fw/ml/ml_helper.py,sha256=
|
77
|
+
ddi_fw/ml/ml_helper.py,sha256=l1ZLYL3x5bHxD2bh2ezEgWDlV0ni8zGZGgj07x7KR40,6310
|
78
78
|
ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
|
79
79
|
ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
|
80
80
|
ddi_fw/ml/tensorflow_wrapper.py,sha256=-zcbd0LBg9QNMF9K1I-JC379cS3rTO7ibgsDIOnMsoc,12951
|
@@ -85,7 +85,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
|
|
85
85
|
ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
|
86
86
|
ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
|
87
87
|
ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
|
88
|
-
ddi_fw/pipeline/pipeline.py,sha256=
|
88
|
+
ddi_fw/pipeline/pipeline.py,sha256=VSILkxot_O1DJMWPavzFUH3le4zVKQydcH32SbuHZlQ,9355
|
89
89
|
ddi_fw/utils/__init__.py,sha256=bqIC0YjbD0YSHtO0nWUkRs4w5nu7qBV0yU72sRzwCj8,475
|
90
90
|
ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
|
91
91
|
ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
@@ -98,7 +98,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
|
|
98
98
|
ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
|
99
99
|
ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
|
100
100
|
ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
101
|
-
ddi_fw-0.0.
|
102
|
-
ddi_fw-0.0.
|
103
|
-
ddi_fw-0.0.
|
104
|
-
ddi_fw-0.0.
|
101
|
+
ddi_fw-0.0.187.dist-info/METADATA,sha256=dzH9YAqsPxQcvuS9h0JRNx5qtd8vGNr-1c5f0uE3c7M,2542
|
102
|
+
ddi_fw-0.0.187.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
103
|
+
ddi_fw-0.0.187.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
104
|
+
ddi_fw-0.0.187.dist-info/RECORD,,
|
File without changes
|
File without changes
|