ddi-fw 0.0.186__py3-none-any.whl → 0.0.188__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +27 -29
- ddi_fw/ml/ml_helper.py +0 -38
- ddi_fw/pipeline/pipeline.py +27 -64
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.188.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.188.dist-info}/RECORD +7 -7
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.188.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.186.dist-info → ddi_fw-0.0.188.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -60,10 +60,10 @@ class BaseDataset(BaseModel):
|
|
60
60
|
dataset_splitter_type: Type[DatasetSplitter]
|
61
61
|
class_column: str = 'class'
|
62
62
|
dataframe: Optional[pd.DataFrame] = None
|
63
|
-
X_train: Optional[
|
64
|
-
X_test: Optional[
|
65
|
-
y_train: Optional[
|
66
|
-
y_test: Optional[
|
63
|
+
X_train: Optional[np.ndarray] = None
|
64
|
+
X_test: Optional[np.ndarray] = None
|
65
|
+
y_train: Optional[np.ndarray] = None
|
66
|
+
y_test: Optional[np.ndarray] = None
|
67
67
|
train_indexes: Optional[pd.Index] = None
|
68
68
|
test_indexes: Optional[pd.Index] = None
|
69
69
|
train_idx_arr: Optional[List[np.ndarray]] = None
|
@@ -81,7 +81,7 @@ class BaseDataset(BaseModel):
|
|
81
81
|
self.y_train), np.array(self.y_test)
|
82
82
|
|
83
83
|
if self.columns is None or len(self.columns) == 0:
|
84
|
-
items.append([f'
|
84
|
+
items.append([f'default', np.nan_to_num(self.X_train),
|
85
85
|
y_train_label, np.nan_to_num(self.X_test), y_test_label])
|
86
86
|
else:
|
87
87
|
for index, column in enumerate(self.columns):
|
@@ -127,11 +127,12 @@ class BaseDataset(BaseModel):
|
|
127
127
|
Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
|
128
128
|
skip deriving them. Otherwise, derive them from the dataframe and indices.
|
129
129
|
"""
|
130
|
-
if self.X_train
|
130
|
+
if self.X_train and self.y_train and self.X_test and self.y_test :
|
131
131
|
# Data is already provided, no need to calculate
|
132
132
|
logging.info(
|
133
133
|
"X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
|
134
|
-
return
|
134
|
+
return
|
135
|
+
# return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
135
136
|
|
136
137
|
self.prep()
|
137
138
|
|
@@ -150,24 +151,26 @@ class BaseDataset(BaseModel):
|
|
150
151
|
|
151
152
|
train = self.dataframe[self.dataframe.index.isin(train_idx_all)]
|
152
153
|
test = self.dataframe[self.dataframe.index.isin(test_idx_all)]
|
154
|
+
X_train = train.drop(self.class_column, axis=1)
|
155
|
+
X_train = train.drop(self.class_column, axis=1)
|
156
|
+
y_train = train[self.class_column]
|
157
|
+
X_test = test.drop(self.class_column, axis=1)
|
158
|
+
y_test = test[self.class_column]
|
159
|
+
|
160
|
+
self.X_train = np.array(X_train)
|
161
|
+
self.y_train = np.array(y_train)
|
162
|
+
self.X_test = np.array(X_test)
|
163
|
+
self.y_test = np.array(y_test)
|
153
164
|
|
154
|
-
self.
|
155
|
-
self.
|
156
|
-
self.X_test = test.drop(self.class_column, axis=1)
|
157
|
-
self.y_test = test[self.class_column]
|
158
|
-
|
159
|
-
self.train_indexes = self.X_train.index
|
160
|
-
self.test_indexes = self.X_test.index
|
165
|
+
self.train_indexes = X_train.index
|
166
|
+
self.test_indexes = X_test.index
|
161
167
|
self.train_idx_arr = train_idx_arr
|
162
168
|
self.val_idx_arr = val_idx_arr
|
163
169
|
|
164
170
|
# Dataframe to numpy array conversion
|
165
|
-
|
166
|
-
self.y_train = np.array(self.y_train)
|
167
|
-
self.X_test = np.array(self.X_test)
|
168
|
-
self.y_test = np.array(self.y_test)
|
171
|
+
|
169
172
|
|
170
|
-
return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
173
|
+
# return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
|
171
174
|
|
172
175
|
def __get_indexes__(self, path):
|
173
176
|
train_index_path = path+'/train_indexes.txt'
|
@@ -219,21 +222,16 @@ class BaseDataset(BaseModel):
|
|
219
222
|
|
220
223
|
X_train, X_test, y_train, y_test, X_train.index, X_test.index, train_idx_arr, val_idx_arr = self.dataset_splitter.split(
|
221
224
|
X=X, y=y)
|
222
|
-
self.X_train = X_train
|
223
|
-
self.X_test = X_test
|
224
|
-
self.y_train =
|
225
|
-
self.y_test = y_test
|
225
|
+
self.X_train = np.array(X_train)
|
226
|
+
self.X_test = np.array(X_test)
|
227
|
+
self.y_train = np.array(y_train.tolist())
|
228
|
+
self.y_test = np.array(y_test.tolist())
|
226
229
|
self.train_indexes = X_train.index
|
227
230
|
self.test_indexes = X_test.index
|
228
231
|
self.train_idx_arr = train_idx_arr
|
229
232
|
self.val_idx_arr = val_idx_arr
|
230
233
|
|
231
|
-
|
232
|
-
self.X_train = np.array(self.X_train)
|
233
|
-
self.y_train = np.array(self.y_train.tolist())
|
234
|
-
self.X_test = np.array(self.X_test)
|
235
|
-
self.y_test = np.array(self.y_test.tolist())
|
236
|
-
|
234
|
+
|
237
235
|
if save_indexes:
|
238
236
|
# train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
|
239
237
|
self.__save_indexes__(
|
ddi_fw/ml/ml_helper.py
CHANGED
@@ -111,44 +111,6 @@ class MultiModalRunner:
|
|
111
111
|
if self.use_mlflow:
|
112
112
|
with mlflow.start_run(run_name=self.prefix, description="***") as run:
|
113
113
|
self.__predict(single_results)
|
114
|
-
# self.level_0_run_id = run.info.run_id
|
115
|
-
# item_dict = {t[0]: t for t in self.items}
|
116
|
-
# print("multi_modal")
|
117
|
-
# print(self.multi_modal)
|
118
|
-
# print(item_dict.keys())
|
119
|
-
|
120
|
-
# for m in self.multi_modal:
|
121
|
-
# name = m.get('name')
|
122
|
-
# input_type = m.get('input_type')
|
123
|
-
# input = m.get('input')
|
124
|
-
# inputs = m.get('inputs')
|
125
|
-
# model_type = get_import(m.get("model_type"))
|
126
|
-
# kwargs = m.get('params')
|
127
|
-
# T = self.__create_model(self.library)
|
128
|
-
# single_modal = T(self.date, name, model_type, **kwargs)
|
129
|
-
# if input_type == '1D':
|
130
|
-
# item = item_dict[input]
|
131
|
-
# single_modal.set_data(
|
132
|
-
# self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
|
133
|
-
# elif input_type == '2D':
|
134
|
-
# # check keys
|
135
|
-
# filtered_dict = {k: item_dict[k]
|
136
|
-
# for k in inputs if k in item_dict}
|
137
|
-
# print(filtered_dict.keys())
|
138
|
-
# first_input = next(iter(filtered_dict.values()))
|
139
|
-
# train_data_list = [f[1] for f in filtered_dict.values()]
|
140
|
-
# test_data_list = [f[3] for f in filtered_dict.values()]
|
141
|
-
# train_data = np.stack(train_data_list, axis=1)
|
142
|
-
# test_data = np.stack(test_data_list, axis=1)
|
143
|
-
# train_label = first_input[2]
|
144
|
-
# test_label = first_input[4]
|
145
|
-
# single_modal.set_data(
|
146
|
-
# self.train_idx_arr, self.val_idx_arr, train_data, train_label, test_data, test_label)
|
147
|
-
# else:
|
148
|
-
# raise Exception("check configurations")
|
149
|
-
# logs, metrics, prediction = single_modal.fit_and_evaluate()
|
150
|
-
# self.result.add_metric(name, metrics)
|
151
|
-
# single_results[name] = prediction
|
152
114
|
else:
|
153
115
|
self.__predict(single_results)
|
154
116
|
if combinations:
|
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -36,63 +36,26 @@ class Pipeline(BaseModel):
|
|
36
36
|
combinations: Optional[List[str]] = None
|
37
37
|
model: Optional[Any] = None
|
38
38
|
multi_modal: Optional[Any] = None
|
39
|
-
use_mlflow: bool =
|
40
|
-
_items:List=[]
|
41
|
-
_train_idx_arr:List|None=[]
|
42
|
-
_val_idx_arr:List|None=[]
|
43
|
-
|
39
|
+
use_mlflow: bool = False
|
40
|
+
_items: List = []
|
41
|
+
_train_idx_arr: List | None = []
|
42
|
+
_val_idx_arr: List | None = []
|
43
|
+
|
44
44
|
@property
|
45
45
|
def items(self) -> List:
|
46
46
|
return self._items
|
47
|
+
|
47
48
|
@property
|
48
|
-
def train_idx_arr(self) -> List|None:
|
49
|
+
def train_idx_arr(self) -> List | None:
|
49
50
|
return self._train_idx_arr
|
51
|
+
|
50
52
|
@property
|
51
|
-
def val_idx_arr(self) -> List|None:
|
53
|
+
def val_idx_arr(self) -> List | None:
|
52
54
|
return self._val_idx_arr
|
53
55
|
|
54
56
|
class Config:
|
55
57
|
arbitrary_types_allowed = True
|
56
58
|
|
57
|
-
# class Pipeline:
|
58
|
-
# def __init__(self,
|
59
|
-
# library='tensorflow',
|
60
|
-
# experiment_name=None,
|
61
|
-
# experiment_description=None,
|
62
|
-
# experiment_tags=None,
|
63
|
-
# artifact_location=None,
|
64
|
-
# tracking_uri=None,
|
65
|
-
# dataset_type: BaseDataset = None,
|
66
|
-
# columns=None,
|
67
|
-
# embedding_dict=None,
|
68
|
-
# column_embedding_configs=None,
|
69
|
-
# vector_db_persist_directory=None,
|
70
|
-
# vector_db_collection_name=None,
|
71
|
-
# embedding_pooling_strategy_type: PoolingStrategy = None,
|
72
|
-
# ner_data_file=None,
|
73
|
-
# ner_threshold=None,
|
74
|
-
# combinations=None,
|
75
|
-
# model=None,
|
76
|
-
# multi_modal = None ):
|
77
|
-
# self.library = library
|
78
|
-
# self.experiment_name = experiment_name
|
79
|
-
# self.experiment_description = experiment_description
|
80
|
-
# self.experiment_tags = experiment_tags
|
81
|
-
# self.artifact_location = artifact_location
|
82
|
-
# self.tracking_uri = tracking_uri
|
83
|
-
# self.dataset_type = dataset_type
|
84
|
-
# self.columns = columns
|
85
|
-
# self.embedding_dict = embedding_dict
|
86
|
-
# self.column_embedding_configs = column_embedding_configs
|
87
|
-
# self.vector_db_persist_directory = vector_db_persist_directory
|
88
|
-
# self.vector_db_collection_name = vector_db_collection_name
|
89
|
-
# self.embedding_pooling_strategy_type = embedding_pooling_strategy_type
|
90
|
-
# self.ner_data_file = ner_data_file
|
91
|
-
# self.ner_threshold = ner_threshold
|
92
|
-
# self.combinations = combinations
|
93
|
-
# self.model = model
|
94
|
-
# self.multi_modal = multi_modal
|
95
|
-
|
96
59
|
def __create_or_update_embeddings__(self, embedding_dict, vector_db_persist_directory, vector_db_collection_name, column=None):
|
97
60
|
"""
|
98
61
|
Fetch embeddings and metadata from a persistent Chroma vector database and update the provided embedding_dict.
|
@@ -184,45 +147,45 @@ class Pipeline(BaseModel):
|
|
184
147
|
# filename=self.ner_data_file) if self.ner_data_file else None
|
185
148
|
|
186
149
|
dataset_splitter = self.dataset_splitter_type()
|
187
|
-
|
150
|
+
|
188
151
|
if issubclass(self.dataset_type, TextDatasetMixin):
|
189
152
|
key, value = next(iter(embedding_dict.items()))
|
190
153
|
embedding_size = value[next(iter(value))][0].shape[0]
|
191
|
-
pooling_strategy = self.embedding_pooling_strategy_type(
|
154
|
+
pooling_strategy = self.embedding_pooling_strategy_type(
|
155
|
+
) if self.embedding_pooling_strategy_type else None
|
192
156
|
|
193
157
|
dataset = self.dataset_type(
|
194
158
|
embedding_dict=embedding_dict,
|
195
159
|
embedding_size=embedding_size,
|
196
160
|
embeddings_pooling_strategy=pooling_strategy,
|
197
|
-
|
161
|
+
dataset_splitter_type=self.dataset_splitter_type,
|
162
|
+
**kwargs)
|
163
|
+
elif self.dataset_type == BaseDataset:
|
164
|
+
dataset = self.dataset_type(
|
165
|
+
dataset_splitter_type=self.dataset_splitter_type,
|
198
166
|
**kwargs)
|
199
167
|
else:
|
200
168
|
dataset = self.dataset_type(**kwargs)
|
201
169
|
|
202
170
|
# X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
|
203
171
|
dataset.load()
|
204
|
-
|
205
172
|
dataframe = dataset.dataframe
|
206
173
|
|
207
|
-
|
174
|
+
# Check if any of the arrays are None or empty
|
175
|
+
is_data_valid = (dataset.X_train is not None and dataset.X_train.size > 0 and
|
176
|
+
dataset.y_train is not None and dataset.y_train.size > 0 and
|
177
|
+
dataset.X_test is not None and dataset.X_test.size > 0 and
|
178
|
+
dataset.y_test is not None and dataset.y_test.size > 0)
|
179
|
+
|
180
|
+
# Check if the dataframe is None or empty
|
181
|
+
is_dataframe_valid = dataframe is not None and not dataframe.empty
|
182
|
+
|
183
|
+
if not (is_data_valid and is_dataframe_valid):
|
208
184
|
raise ValueError("The dataset is not loaded")
|
209
185
|
|
210
|
-
# dataframe.dropna()
|
211
|
-
# X_train = dataset.X_train
|
212
|
-
# X_test = dataset.X_test
|
213
|
-
# y_train = dataset.y_train
|
214
|
-
# y_test = dataset.y_test
|
215
|
-
# self._train_idx_arr = dataset.train_idx_arr
|
216
|
-
# self._val_idx_arr = dataset.val_idx_arr
|
217
|
-
# Logic to set up the experiment
|
218
186
|
# column name, train data, train label, test data, test label
|
219
187
|
self._items = dataset.produce_inputs()
|
220
188
|
|
221
|
-
# unique_classes = pd.unique(dataframe[dataset.class_column])
|
222
|
-
# event_num = len(unique_classes)
|
223
|
-
# droprate = 0.3
|
224
|
-
# vector_size = self.dataset.drugs_df.shape[0]
|
225
|
-
|
226
189
|
print("Building the experiment with the following settings:")
|
227
190
|
print(
|
228
191
|
f"Name: {self.experiment_name}, Dataset: {dataset}, Model: {self.model}")
|
@@ -1,5 +1,5 @@
|
|
1
1
|
ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
|
2
|
-
ddi_fw/datasets/core.py,sha256=
|
2
|
+
ddi_fw/datasets/core.py,sha256=eKPbntiDhqpqaV1SlrPmuSUq_9i_5INlnJuAlwj61Nk,10630
|
3
3
|
ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
|
4
4
|
ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
|
5
5
|
ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
|
@@ -74,7 +74,7 @@ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSz
|
|
74
74
|
ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
|
75
75
|
ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
|
76
76
|
ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
|
77
|
-
ddi_fw/ml/ml_helper.py,sha256=
|
77
|
+
ddi_fw/ml/ml_helper.py,sha256=l1ZLYL3x5bHxD2bh2ezEgWDlV0ni8zGZGgj07x7KR40,6310
|
78
78
|
ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
|
79
79
|
ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
|
80
80
|
ddi_fw/ml/tensorflow_wrapper.py,sha256=-zcbd0LBg9QNMF9K1I-JC379cS3rTO7ibgsDIOnMsoc,12951
|
@@ -85,7 +85,7 @@ ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,2
|
|
85
85
|
ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
|
86
86
|
ddi_fw/pipeline/multi_pipeline.py,sha256=NfcH4Ze5U-JRiH3lrxEDWj-VPxYQYtp7tq6bLCImBzs,5550
|
87
87
|
ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
|
88
|
-
ddi_fw/pipeline/pipeline.py,sha256=
|
88
|
+
ddi_fw/pipeline/pipeline.py,sha256=dCXZuXOlW74ZO0e_OhS9OX0dqI9abj7CQz_lkKrDIWY,9787
|
89
89
|
ddi_fw/utils/__init__.py,sha256=bqIC0YjbD0YSHtO0nWUkRs4w5nu7qBV0yU72sRzwCj8,475
|
90
90
|
ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
|
91
91
|
ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
|
@@ -98,7 +98,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
|
|
98
98
|
ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
|
99
99
|
ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
|
100
100
|
ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
|
101
|
-
ddi_fw-0.0.
|
102
|
-
ddi_fw-0.0.
|
103
|
-
ddi_fw-0.0.
|
104
|
-
ddi_fw-0.0.
|
101
|
+
ddi_fw-0.0.188.dist-info/METADATA,sha256=SRAoTA4fu0suxghXx5okr-RsfC512VEotrkTCUeXBck,2542
|
102
|
+
ddi_fw-0.0.188.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
103
|
+
ddi_fw-0.0.188.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
|
104
|
+
ddi_fw-0.0.188.dist-info/RECORD,,
|
File without changes
|
File without changes
|