ddi-fw 0.0.189__py3-none-any.whl → 0.0.191__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ddi_fw/datasets/core.py +6 -7
- ddi_fw/ml/ml_helper.py +40 -29
- ddi_fw/pipeline/multi_pipeline.py +2 -0
- ddi_fw/pipeline/pipeline.py +9 -1
- ddi_fw/utils/__init__.py +2 -1
- ddi_fw/utils/numpy_utils.py +27 -0
- {ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/METADATA +1 -1
- {ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/RECORD +10 -9
- {ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/WHEEL +0 -0
- {ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/top_level.txt +0 -0
ddi_fw/datasets/core.py
CHANGED
@@ -73,6 +73,7 @@ class BaseDataset(BaseModel):
     class Config:
         arbitrary_types_allowed = True
 
+    # TODO: if no columns are given, all features are taken; how should this be handled in the pipeline?
     def produce_inputs(self):
         items = []
         if self.X_train is None or self.X_test is None:
@@ -127,15 +128,15 @@ class BaseDataset(BaseModel):
         Load the dataset. If X_train, y_train, X_test, and y_test are already provided,
         skip deriving them. Otherwise, derive them from the dataframe and indices.
         """
-
+        self.prep()
+
+        if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
             # Data is already provided, no need to calculate
             logging.info(
                 "X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
             return
         # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
 
-        self.prep()
-
         if self.index_path is None:
             raise Exception(
                 "There is no index path. Please call split_dataset or provide indices.")
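The net effect of this hunk: prep() now runs unconditionally before the provided-data early return, so prepared features exist even when the arrays are supplied up front. A skeleton of the resulting control flow, distilled from the hunk (the derivation body is elided):

    def load(self):
        self.prep()

        if self.X_train is not None or self.y_train is not None or self.X_test is not None or self.y_test is not None:
            logging.info(
                "X_train, y_train, X_test, and y_test are already provided. Skipping calculation.")
            return

        if self.index_path is None:
            raise Exception(
                "There is no index path. Please call split_dataset or provide indices.")
        # ... otherwise derive X_train/y_train/X_test/y_test from the dataframe and indices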
@@ -156,7 +157,7 @@ class BaseDataset(BaseModel):
             y_train = train[self.class_column]
             X_test = test.drop(self.class_column, axis=1)
             y_test = test[self.class_column]
-
+
             self.X_train = np.array(X_train)
             # self.y_train = np.array(y_train)
             self.y_train = np.array(y_train.tolist())
@@ -170,7 +171,6 @@ class BaseDataset(BaseModel):
             self.val_idx_arr = val_idx_arr
 
         # Dataframe to numpy array conversion
-
 
         # return self.X_train, self.X_test, self.y_train, self.y_test, self.train_indexes, self.test_indexes, self.train_idx_arr, self.val_idx_arr
 
@@ -226,14 +226,13 @@ class BaseDataset(BaseModel):
             X=X, y=y)
         self.X_train = np.array(X_train)
         self.X_test = np.array(X_test)
-        self.y_train =
+        self.y_train = np.array(y_train.tolist())
         self.y_test = np.array(y_test.tolist())
         self.train_indexes = X_train.index
         self.test_indexes = X_test.index
         self.train_idx_arr = train_idx_arr
         self.val_idx_arr = val_idx_arr
 
-
         if save_indexes:
             # train_pairs = [row['id1'].join(',').row['id2'] for index, row in X_train.iterrows()]
             self.__save_indexes__(
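The repaired assignment above mirrors the pattern already used for y_test. A likely reason for np.array(y_train.tolist()) rather than np.array(y_train): when the class column holds per-row label vectors (e.g. one-hot lists), converting the pandas Series directly produces a 1-D object array, while round-tripping through tolist() stacks the rows into a proper 2-D numeric array. A minimal, self-contained illustration (the data here is hypothetical, not from the package):

    import numpy as np
    import pandas as pd

    y_train = pd.Series([[1, 0, 0], [0, 1, 0], [0, 0, 1]])

    direct = np.array(y_train)            # 1-D array of Python lists
    stacked = np.array(y_train.tolist())  # 2-D numeric array

    print(direct.shape, direct.dtype)     # (3,) object
    print(stacked.shape, stacked.dtype)   # (3, 3) and an integer dtype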
ddi_fw/ml/ml_helper.py
CHANGED
@@ -57,42 +57,53 @@ class MultiModalRunner:
             raise ValueError(
                 "Unsupported library type. Choose 'tensorflow' or 'pytorch'.")
 
+    # TODO check single_results, 1d,2d ...
     def __predict(self, single_results):
         item_dict = {t[0]: t for t in self.items}
         print("multi_modal")
         print(self.multi_modal)
         print(item_dict.keys())
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.multi_modal:
+            for m in self.multi_modal:
+                name = m.get('name')
+                input_type = m.get('input_type')
+                input = m.get('input')
+                inputs = m.get('inputs')
+                model_type = get_import(m.get("model_type"))
+                kwargs = m.get('params')
+                T = self.__create_model(self.library)
+                single_modal = T(self.date, name, model_type,
+                                 use_mlflow=self.use_mlflow, **kwargs)
+
+                if input is not None and inputs is not None:
+                    raise Exception("input and inputs should not be used together")
+
+                if input_type == '1D':
+                    item = item_dict[input]
+                    single_modal.set_data(
+                        self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
+                elif input_type == '2D':
+                    # check keys
+                    filtered_dict = {k: item_dict[k]
+                                     for k in inputs if k in item_dict}
+                    print(filtered_dict.keys())
+                    first_input = next(iter(filtered_dict.values()))
+                    train_data_list = [f[1] for f in filtered_dict.values()]
+                    test_data_list = [f[3] for f in filtered_dict.values()]
+                    train_data = np.stack(train_data_list, axis=1)
+                    test_data = np.stack(test_data_list, axis=1)
+                    train_label = first_input[2]
+                    test_label = first_input[4]
+                    single_modal.set_data(
+                        self.train_idx_arr, self.val_idx_arr, train_data, train_label, test_data, test_label)
+                else:
+                    raise Exception("check configurations")
         else:
-
+            item = self.items[0]
+            single_modal.set_data(
+                self.train_idx_arr, self.val_idx_arr, item[1], item[2], item[3], item[4])
+
         logs, metrics, prediction = single_modal.fit_and_evaluate()
         self.result.add_metric(name, metrics)
         single_results[name] = prediction
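Based only on the keys this rewrite reads from each entry (name, input_type, input, inputs, model_type, params), a configuration for the new __predict plausibly looks like the sketch below; the model path and feature names are hypothetical placeholders:

    multi_modal = [
        {   # '1D': a single item from item_dict feeds the model as-is
            "name": "smiles_only",
            "input_type": "1D",
            "input": "smiles",
            "model_type": "ddi_fw.ml.SomeModel",  # hypothetical import path
            "params": {"epochs": 10},
        },
        {   # '2D': several items are stacked along a new axis
            "name": "smiles_plus_targets",
            "input_type": "2D",
            "inputs": ["smiles", "targets"],
            "model_type": "ddi_fw.ml.SomeModel",  # hypothetical import path
            "params": {"epochs": 10},
        },
    ]

The shape effect of the 2D branch follows directly from np.stack(..., axis=1): stacking k feature matrices of shape (n, d) yields one (n, k, d) tensor per split. A quick check:

    import numpy as np

    train_data = np.stack([np.zeros((100, 64)), np.ones((100, 64))], axis=1)
    print(train_data.shape)  # (100, 2, 64)

Note that the 2D branch requires every listed input to share the same shape, and labels are taken from the first input only.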
ddi_fw/pipeline/multi_pipeline.py
CHANGED
@@ -50,6 +50,7 @@ class MultiPipeline():
         type = config.get("type")
         library = config.get("library")
 
+        use_mlflow = config.get("use_mlflow")
         experiment_name = config.get("experiment_name")
         experiment_description = config.get("experiment_description")
         experiment_tags = config.get("experiment_tags")
@@ -84,6 +85,7 @@ class MultiPipeline():
         if type == "general":
             pipeline = Pipeline(
                 library=library,
+                use_mlflow=use_mlflow,
                 experiment_name=experiment_name,
                 experiment_description=experiment_description,
                 experiment_tags=experiment_tags,
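The new key slots into the existing per-pipeline config dict; a sketch with placeholder values (only the keys are taken from the surrounding diff). Note that config.get("use_mlflow") yields None when the key is absent, and None is then forwarded to Pipeline's bool field, so configs presumably set it explicitly:

    config = {
        "type": "general",
        "library": "tensorflow",
        "use_mlflow": False,  # newly read here and forwarded to Pipeline
        "experiment_name": "ddi-experiment",
        "experiment_description": "example run",
        "experiment_tags": {"stage": "dev"},
    }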
ddi_fw/pipeline/pipeline.py
CHANGED
@@ -37,10 +37,15 @@ class Pipeline(BaseModel):
     model: Optional[Any] = None
     multi_modal: Optional[Any] = None
     use_mlflow: bool = False
+    _dataset: BaseDataset = []
     _items: List = []
     _train_idx_arr: List | None = []
     _val_idx_arr: List | None = []
 
+    @property
+    def dataset(self) -> BaseDataset:
+        return self._dataset
+
     @property
     def items(self) -> List:
         return self._items
@@ -168,7 +173,10 @@ class Pipeline(BaseModel):
         dataset = self.dataset_type(**kwargs)
 
         # X_train, X_test, y_train, y_test, train_indexes, test_indexes, train_idx_arr, val_idx_arr = dataset.load()
+
         dataset.load()
+        self._dataset = dataset
+
         dataframe = dataset.dataframe
 
         # Check if any of the arrays are None or empty
@@ -180,7 +188,7 @@ class Pipeline(BaseModel):
         # Check if the dataframe is None or empty
         is_dataframe_valid = dataframe is not None and not dataframe.empty
 
-        if not (is_data_valid
+        if not (is_data_valid or is_dataframe_valid):
             raise ValueError("The dataset is not loaded")
 
         # column name, train data, train label, test data, test label
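With the new property, the loaded dataset object stays reachable after the pipeline's data-loading step; a short usage sketch (the method that triggers dataset.load() is not named in this diff, so the call site below is illustrative only):

    # after the pipeline step that calls dataset.load() has run:
    ds = pipeline.dataset              # the BaseDataset stored via self._dataset
    print(ds.X_train.shape, ds.X_test.shape)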
ddi_fw/utils/__init__.py
CHANGED
@@ -4,4 +4,5 @@ from .py7zr_helper import Py7ZipHelper
 from .enums import UMLSCodeTypes, DrugBankTextDataTypes
 from .package_helper import get_import
 from .kaggle import create_kaggle_dataset
-from .categorical_data_encoding_checker import is_one_hot_encoded, is_binary_encoded, is_binary_vector,is_label_encoded
+from .categorical_data_encoding_checker import is_one_hot_encoded, is_binary_encoded, is_binary_vector,is_label_encoded
+from .numpy_utils import adjust_array_dims
ddi_fw/utils/numpy_utils.py
ADDED
@@ -0,0 +1,27 @@
+import numpy as np
+
+def adjust_array_dims(arr, final_ndim=2):
+    # Add axes if array has fewer dimensions than final_ndim
+    while arr.ndim < final_ndim:
+        arr = arr[:, np.newaxis]  # Add a new axis
+
+    # Drop axes if array has more dimensions than final_ndim
+    while arr.ndim > final_ndim:
+        arr = np.squeeze(arr, axis=-1)  # Remove the last axis
+
+    return arr
+
+
+
+# # Example usage
+# arr_1d = np.array([1, 2, 3, 4, 5])
+
+# # Convert to a 3D array (iteratively adds axes)
+# arr_3d = adjust_array_dims(arr_1d, final_ndim=3)
+# print(arr_3d)
+# print("Shape of arr_3d:", arr_3d.shape)
+
+# # Convert to a 2D array (iteratively drops axes)
+# arr_2d = adjust_array_dims(arr_3d, final_ndim=2)
+# print(arr_2d)
+# print("Shape of arr_2d:", arr_2d.shape)
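Two properties of adjust_array_dims worth noting: new axes are always inserted at position 1 (arr[:, np.newaxis]), and np.squeeze(arr, axis=-1) raises ValueError when the trailing axis is larger than one, so only size-1 axes can be dropped. A quick check, importing via the updated ddi_fw/utils/__init__.py above:

    import numpy as np
    from ddi_fw.utils import adjust_array_dims

    a = np.zeros((5, 3, 1))
    print(adjust_array_dims(a, final_ndim=2).shape)  # (5, 3)

    b = np.zeros(4)
    print(adjust_array_dims(b, final_ndim=3).shape)  # (4, 1, 1)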
{ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
 ddi_fw/datasets/__init__.py,sha256=_I3iDHARwzmg7_EL5XKtB_TgG1yAkLSOVTujLL9Wz9Q,280
-ddi_fw/datasets/core.py,sha256=
+ddi_fw/datasets/core.py,sha256=IaEk4T7f590rAAAG7Nc45mofeutX85mNxhikFs7mzpE,10839
 ddi_fw/datasets/dataset_splitter.py,sha256=8H8uZTAf8N9LUZeSeHOMawtJFJhnDgUUqFcnl7dquBQ,1672
 ddi_fw/datasets/db_utils.py,sha256=OTsa3d-Iic7z3HmzSQK9UigedRbHDxYChJk0s4GfLnw,6191
 ddi_fw/datasets/setup_._py,sha256=khYVJuW5PlOY_i_A16F3UbSZ6s6o_ljw33Byw3C-A8E,1047
@@ -74,7 +74,7 @@ ddi_fw/langchain/sentence_splitter.py,sha256=h_bYElx4Ud1mwDNJfL7mUwvgadwKX3GKlSz
 ddi_fw/langchain/storage.py,sha256=OizKyWm74Js7T6Q9kez-ulUoBGzIMFo4R46h4kjUyIM,11200
 ddi_fw/ml/__init__.py,sha256=tIxiW0g6q1VsmDYVXR_ovvHQR3SCir8g2bKxx_CrS7s,221
 ddi_fw/ml/evaluation_helper.py,sha256=2-7CLSgGTqLEk4HkgCVIOt-GxfLAn6SBozJghAtHb5M,11581
-ddi_fw/ml/ml_helper.py,sha256=
+ddi_fw/ml/ml_helper.py,sha256=ENjdpu6stRxGxqhFweZLIglCnLREtMh6ypwanf0qMGc,6940
 ddi_fw/ml/model_wrapper.py,sha256=kabPXuo7S8tGkp9a00V04n4rXDmv7dD8wYGMjotISRc,1050
 ddi_fw/ml/pytorch_wrapper.py,sha256=pe6UsjP2XeTgLxDnIUiodoyhJTGCxV27wD4Cjxysu2Q,8553
 ddi_fw/ml/tensorflow_wrapper.py,sha256=lNJvg3odqMKmILecOMdcOCAOrwzWZDzxB0DWGcYWsPg,12952
@@ -83,14 +83,15 @@ ddi_fw/ner/mmlrestclient.py,sha256=NZta7m2Qm6I_qtVguMZhqtAUjVBmmXn0-TMnsNp0jpg,6
 ddi_fw/ner/ner.py,sha256=FHyyX53Xwpdw8Hec261dyN88yD7Z9LmJua2mIrQLguI,17967
 ddi_fw/pipeline/__init__.py,sha256=tKDM_rW4vPjlYTeOkNgi9PujDzb4e9O3LK1w5wqnebw,212
 ddi_fw/pipeline/multi_modal_combination_strategy.py,sha256=JSyuP71b1I1yuk0s2ecCJZTtCED85jBtkpwTUxibJvI,1706
-ddi_fw/pipeline/multi_pipeline.py,sha256=
+ddi_fw/pipeline/multi_pipeline.py,sha256=fYyvwIOscUahjXd3QO5RSFrp1LliGR7RzOZyAXrXXz4,5637
 ddi_fw/pipeline/ner_pipeline.py,sha256=Bp6BA6nozfWFaMHH6jKlzesnCGO6qiMkzdGy_ed6nh0,5947
-ddi_fw/pipeline/pipeline.py,sha256=
-ddi_fw/utils/__init__.py,sha256=
+ddi_fw/pipeline/pipeline.py,sha256=GMMauyp0GvdaQLyQ5dPBffDDxFK28hdDtPUzdFX9-Yk,9961
+ddi_fw/utils/__init__.py,sha256=HC32XkYQTYH_9vt0eX6tqQngEFG-R70hGrYkT-BcHCk,519
 ddi_fw/utils/categorical_data_encoding_checker.py,sha256=gzb_vUDBrCMUhBxY1fBYTe8hmK72p0_uw3DTga8cqP8,1580
 ddi_fw/utils/enums.py,sha256=19eJ3fX5eRK_xPvkYcukmug144jXPH4X9zQqtsFBj5A,671
 ddi_fw/utils/json_helper.py,sha256=BVU6wmJgdXPxyqLPu3Ck_9Es5RrP1PDanKvE-OSj1D4,571
 ddi_fw/utils/kaggle.py,sha256=wKRJ18KpQ6P-CubpZklEgsDtyFpR9RUL1_HyyF6ttEE,2425
+ddi_fw/utils/numpy_utils.py,sha256=gd1WNq5NpWD2MBEMTtFuS5I0h8B6FAUNcq6BVOlxdhY,797
 ddi_fw/utils/package_helper.py,sha256=erl8_onmhK-41zQoaED2qyDUV9GQxmT9sdoyRp9_q5I,1056
 ddi_fw/utils/py7zr_helper.py,sha256=gOqaFIyJvTjUM-btO2x9AQ69jZOS8PoKN0wetYIckJw,4747
 ddi_fw/utils/utils.py,sha256=szwnxMTDRrZoeNRyDuf3aCbtzriwtaRk4mHSH3asLdA,4301
@@ -98,7 +99,7 @@ ddi_fw/utils/zip_helper.py,sha256=YRZA4tKZVBJwGQM0_WK6L-y5MoqkKoC-nXuuHK6CU9I,55
 ddi_fw/vectorization/__init__.py,sha256=LcJOpLVoLvHPDw9phGFlUQGeNcST_zKV-Oi1Pm5h_nE,110
 ddi_fw/vectorization/feature_vector_generation.py,sha256=Z1A_DOBqDFPqLN4YB-3oYlOQWJK-X6Oes6UFjpzR47Q,4760
 ddi_fw/vectorization/idf_helper.py,sha256=_Gd1dtDSLaw8o-o0JugzSKMt9FpeXewTh4wGEaUd4VQ,2571
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
-ddi_fw-0.0.
+ddi_fw-0.0.191.dist-info/METADATA,sha256=xrlNEz8W_iIfKVZidKR8kNa_WtrDr4EnVQwbtex_sbQ,2542
+ddi_fw-0.0.191.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ddi_fw-0.0.191.dist-info/top_level.txt,sha256=PMwHICFZTZtcpzQNPV4UQnfNXYIeLR_Ste-Wfc1h810,7
+ddi_fw-0.0.191.dist-info/RECORD,,
{ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/WHEEL
File without changes
{ddi_fw-0.0.189.dist-info → ddi_fw-0.0.191.dist-info}/top_level.txt
File without changes