dragon-ml-toolbox 7.0.0__py3-none-any.whl → 8.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic. Click here for more details.
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/METADATA +2 -1
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/RECORD +14 -12
- ml_tools/ML_datasetmaster.py +165 -116
- ml_tools/ML_evaluation.py +5 -2
- ml_tools/ML_evaluation_multi.py +296 -0
- ml_tools/ML_inference.py +232 -34
- ml_tools/ML_models.py +0 -4
- ml_tools/ML_trainer.py +168 -71
- ml_tools/_ML_optimization_multi.py +231 -0
- ml_tools/data_exploration.py +80 -2
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-7.0.0.dist-info → dragon_ml_toolbox-8.1.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 7.0.0
|
|
3
|
+
Version: 8.1.0
|
|
4
4
|
Summary: A collection of tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Loza <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -147,6 +147,7 @@ ensemble_learning
|
|
|
147
147
|
ETL_engineering
|
|
148
148
|
ML_callbacks
|
|
149
149
|
ML_datasetmaster
|
|
150
|
+
ML_evaluation_multi
|
|
150
151
|
ML_evaluation
|
|
151
152
|
ML_inference
|
|
152
153
|
ML_models
|
|
@@ -1,25 +1,27 @@
|
|
|
1
|
-
dragon_ml_toolbox-7.0.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
-
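dragon_ml_toolbox-7.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837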
|
|
1
|
+
dragon_ml_toolbox-8.1.0.dist-info/licenses/LICENSE,sha256=2uUFNy7D0TLgHim1K5s3DIJ4q_KvxEXVilnU20cWliY,1066
|
|
2
|
+
dragon_ml_toolbox-8.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=lY4_rJPnLnMu7YBQaY-_iz1JRDcLdQzNCyeLAF1glJY,1837
|
|
3
3
|
ml_tools/ETL_engineering.py,sha256=4wwZXi9_U7xfCY70jGBaKniOeZ0m75ppxWpQBd_DmLc,39369
|
|
4
4
|
ml_tools/GUI_tools.py,sha256=n4ZZ5kEjwK5rkOCFJE41HeLFfjhpJVLUSzk9Kd9Kr_0,45410
|
|
5
5
|
ml_tools/MICE_imputation.py,sha256=oFHg-OytOzPYTzBR_wIRHhP71cMn3aupDeT59ABsXlQ,11576
|
|
6
6
|
ml_tools/ML_callbacks.py,sha256=noedVMmHZ72Odbg28zqx5wkhhvX2v-jXicKE_NCAiqU,13838
|
|
7
|
-
ml_tools/ML_datasetmaster.py,sha256=
|
|
8
|
-
ml_tools/ML_evaluation.py,sha256=
|
|
9
|
-
ml_tools/
|
|
10
|
-
ml_tools/
|
|
7
|
+
ml_tools/ML_datasetmaster.py,sha256=tN-GBPEwXRWFBT8r8K0v9b3Bd77DhqSH5FkjDP6BHTw,28847
|
|
8
|
+
ml_tools/ML_evaluation.py,sha256=BER5dOvSTySNzO92gm8tIpqJ5vT-s0iHMmaoly1uUH8,16018
|
|
9
|
+
ml_tools/ML_evaluation_multi.py,sha256=uVtKGYWgOLv34Xj_jz6E_HAYzNb0HwRbMwA8oFZWpUk,12395
|
|
10
|
+
ml_tools/ML_inference.py,sha256=hwtAdyDCE1xtqLgJgyOTAPck0eTmkOCJK1cM_IJSdck,22824
|
|
11
|
+
ml_tools/ML_models.py,sha256=xZiSFh7S6eitl-VjjvNpsikojDvurK8n_ueLEh6_5pM,27979
|
|
11
12
|
ml_tools/ML_optimization.py,sha256=GX-qZ2mCI3gWRCTP5w7lXrZpfGle3J_mE0O68seIoio,13475
|
|
12
13
|
ml_tools/ML_scaler.py,sha256=pGkp1nUpeuoBvbq5hUkieQdxex6kNef1mEbeS_HUCJs,7471
|
|
13
|
-
ml_tools/ML_trainer.py,sha256=
|
|
14
|
+
ml_tools/ML_trainer.py,sha256=6JSmEQaCPSo-S_5plNBTPw-SYgzZpyMNwiqpShJf7qU,23726
|
|
14
15
|
ml_tools/PSO_optimization.py,sha256=9Y074d-B5h4Wvp9YPiy6KAeXM-Yv6Il3gWalKvOLVgo,22705
|
|
15
16
|
ml_tools/RNN_forecast.py,sha256=2CyjBLSYYc3xLHxwLXUmP5Qv8AmV1OB_EndETNX1IBk,1956
|
|
16
17
|
ml_tools/SQL.py,sha256=bkSTmMV4CtEqa67hApYWaRxTqwAlKIc5_b28P1bnDwg,10475
|
|
17
18
|
ml_tools/VIF_factor.py,sha256=2nUMupfUoogf8o6ghoFZk_OwWhFXU0R3C9Gj0HOlI14,10415
|
|
19
|
+
ml_tools/_ML_optimization_multi.py,sha256=DrNG3Vf1uUw-3CpYfXREgSGuR4dTpLWY1F3R9j-PYqQ,9816
|
|
18
20
|
ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
|
|
19
21
|
ml_tools/_logger.py,sha256=TpgYguxO-CWYqqgLW0tqFjtwZ58PE_W2OCfWNGZr0n0,1175
|
|
20
22
|
ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
|
|
21
23
|
ml_tools/custom_logger.py,sha256=nyLRxaRxkqYOFdSjI0X2BWXB8C2IU18QfmqIFKqSedI,5820
|
|
22
|
-
ml_tools/data_exploration.py,sha256=
|
|
24
|
+
ml_tools/data_exploration.py,sha256=RuMHWagXrSQi1MzAMlYeBeVg7UxhVvEq8gJ9bIam2BM,27103
|
|
23
25
|
ml_tools/ensemble_evaluation.py,sha256=wnqoTPg4WYWf2A8z5XT0eSlW4snEuLCXQVj88sZKzQ4,24683
|
|
24
26
|
ml_tools/ensemble_inference.py,sha256=rtU7eUaQne615n2g7IHZCJI-OvrBCcjxbTkEIvtCGFQ,9414
|
|
25
27
|
ml_tools/ensemble_learning.py,sha256=dAyFgSTyvxJWjc_enJ_8EUoWwiekBeoNyJNxVY-kcUU,21868
|
|
@@ -28,7 +30,7 @@ ml_tools/keys.py,sha256=HtPG8-MWh89C32A7eIlfuuA-DLwkxGkoDfwR2TGN9CQ,1074
|
|
|
28
30
|
ml_tools/optimization_tools.py,sha256=EL5tgNFwRo-82pbRE1CFVy9noNhULD7wprWuKadPheg,5090
|
|
29
31
|
ml_tools/path_manager.py,sha256=Z8e7w3MPqQaN8xmTnKuXZS6CIW59BFwwqGhGc00sdp4,13692
|
|
30
32
|
ml_tools/utilities.py,sha256=LqXXTovaHbA5AOKRk6Ru6DgAPAM0wPfYU70kUjYBryo,19231
|
|
31
|
-
dragon_ml_toolbox-
|
|
32
|
-
dragon_ml_toolbox-7.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
33
|
-
dragon_ml_toolbox-7.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
34
|
-
dragon_ml_toolbox-7.0.0.dist-info/RECORD,,
|
|
33
|
+
dragon_ml_toolbox-8.1.0.dist-info/METADATA,sha256=qGTl4__H1ZsbyJHtExcDt14i8ziWXpEy2WaRAELPmTI,6778
|
|
34
|
+
dragon_ml_toolbox-8.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
35
|
+
dragon_ml_toolbox-8.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
|
|
36
|
+
dragon_ml_toolbox-8.1.0.dist-info/RECORD,,
|
ml_tools/ML_datasetmaster.py
CHANGED
|
@@ -18,6 +18,7 @@ from .ML_scaler import PytorchScaler
|
|
|
18
18
|
|
|
19
19
|
__all__ = [
|
|
20
20
|
"DatasetMaker",
|
|
21
|
+
"DatasetMakerMulti",
|
|
21
22
|
"VisionDatasetMaker",
|
|
22
23
|
"SequenceMaker",
|
|
23
24
|
"ResizeAspectFill",
|
|
@@ -57,71 +58,26 @@ class _PytorchDataset(Dataset):
|
|
|
57
58
|
return self.features[index], self.labels[index]
|
|
58
59
|
|
|
59
60
|
|
|
60
|
-
#
|
|
61
|
-
|
|
61
|
+
# --- Abstract Base Class (New) ---
|
|
62
|
+
# --- Abstract Base Class (Corrected) ---
|
|
63
|
+
class _BaseDatasetMaker(ABC):
|
|
62
64
|
"""
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
This class takes a DataFrame, automatically splits it into training and
|
|
66
|
-
testing sets, and converts them into PyTorch Datasets. It assumes the
|
|
67
|
-
target variable is the last column. It can also create, apply, and
|
|
68
|
-
save a PytorchScaler for standardizing continuous features.
|
|
69
|
-
|
|
70
|
-
Attributes:
|
|
71
|
-
`scaler` -> PytorchScaler | None
|
|
72
|
-
`train_dataset` -> PyTorch Dataset
|
|
73
|
-
`test_dataset` -> PyTorch Dataset
|
|
74
|
-
`feature_names` -> list[str]
|
|
75
|
-
`target_name` -> str
|
|
76
|
-
`id` -> str | None
|
|
77
|
-
|
|
78
|
-
The ID can be manually set to any string if needed, it is `None` by default.
|
|
65
|
+
Abstract base class for dataset makers. Contains shared logic for
|
|
66
|
+
splitting, scaling, and accessing datasets to reduce code duplication.
|
|
79
67
|
"""
|
|
80
|
-
def __init__(self
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
random_state: int = 42,
|
|
85
|
-
scaler: Optional[PytorchScaler] = None,
|
|
86
|
-
continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
|
|
87
|
-
"""
|
|
88
|
-
Args:
|
|
89
|
-
pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
|
|
90
|
-
kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
|
|
91
|
-
test_size (float): The proportion of the dataset to allocate to the test split.
|
|
92
|
-
random_state (int): The seed for the random number generator for reproducibility.
|
|
93
|
-
scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
|
|
94
|
-
continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
|
|
95
|
-
"""
|
|
96
|
-
# Validation
|
|
97
|
-
if not isinstance(pandas_df, pandas.DataFrame):
|
|
98
|
-
raise TypeError("Input must be a pandas.DataFrame.")
|
|
99
|
-
if kind not in ["regression", "classification"]:
|
|
100
|
-
raise ValueError("`kind` must be 'regression' or 'classification'.")
|
|
101
|
-
|
|
102
|
-
# 1. Identify features and target
|
|
103
|
-
features = pandas_df.iloc[:, :-1]
|
|
104
|
-
target = pandas_df.iloc[:, -1]
|
|
105
|
-
|
|
106
|
-
self._feature_names = features.columns.tolist()
|
|
107
|
-
self._target_name = str(target.name)
|
|
108
|
-
|
|
109
|
-
#set id
|
|
68
|
+
def __init__(self):
|
|
69
|
+
self._train_ds: Optional[Dataset] = None
|
|
70
|
+
self._test_ds: Optional[Dataset] = None
|
|
71
|
+
self.scaler: Optional[PytorchScaler] = None
|
|
110
72
|
self._id: Optional[str] = None
|
|
111
|
-
|
|
112
|
-
self.
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
self._X_train_shape = X_train.shape
|
|
120
|
-
self._X_test_shape = X_test.shape
|
|
121
|
-
self._y_train_shape = y_train.shape
|
|
122
|
-
self._y_test_shape = y_test.shape
|
|
123
|
-
|
|
124
|
-
# 3. Handle Column to Index Conversion
|
|
73
|
+
self._feature_names: List[str] = []
|
|
74
|
+
self._X_train_shape = (0,0)
|
|
75
|
+
self._X_test_shape = (0,0)
|
|
76
|
+
self._y_train_shape = (0,)
|
|
77
|
+
self._y_test_shape = (0,)
|
|
78
|
+
|
|
79
|
+
def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
|
|
80
|
+
"""Internal helper to fit and apply a PytorchScaler."""
|
|
125
81
|
continuous_feature_indices: Optional[List[int]] = None
|
|
126
82
|
if continuous_feature_columns:
|
|
127
83
|
if all(isinstance(c, str) for c in continuous_feature_columns):
|
|
@@ -129,108 +85,201 @@ class DatasetMaker:
|
|
|
129
85
|
try:
|
|
130
86
|
continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
|
|
131
87
|
except KeyError as e:
|
|
132
|
-
raise ValueError(f"Feature column '{e.args[0]}' not found
|
|
88
|
+
raise ValueError(f"Feature column '{e.args[0]}' not found.")
|
|
133
89
|
elif all(isinstance(c, int) for c in continuous_feature_columns):
|
|
134
90
|
continuous_feature_indices = continuous_feature_columns # type: ignore
|
|
135
91
|
else:
|
|
136
92
|
raise TypeError("`continuous_feature_columns` must be a list of all strings or all integers.")
|
|
137
|
-
|
|
138
|
-
# 4. Handle Scaling
|
|
93
|
+
|
|
139
94
|
X_train_values = X_train.values
|
|
140
95
|
X_test_values = X_test.values
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
temp_label_dtype = torch.float32 if kind == "regression" else torch.int64
|
|
148
|
-
temp_train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=temp_label_dtype)
|
|
149
|
-
self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
|
|
150
|
-
|
|
151
|
-
# If a scaler exists (either passed in or just fitted), apply it
|
|
96
|
+
|
|
97
|
+
if self.scaler is None and continuous_feature_indices:
|
|
98
|
+
_LOGGER.info("Fitting a new PytorchScaler on training data.")
|
|
99
|
+
temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
|
|
100
|
+
self.scaler = PytorchScaler.fit(temp_train_ds, continuous_feature_indices)
|
|
101
|
+
|
|
152
102
|
if self.scaler and self.scaler.mean_ is not None:
|
|
153
103
|
_LOGGER.info("Applying scaler transformation to train and test feature sets.")
|
|
154
104
|
X_train_tensor = self.scaler.transform(torch.tensor(X_train_values, dtype=torch.float32))
|
|
155
105
|
X_test_tensor = self.scaler.transform(torch.tensor(X_test_values, dtype=torch.float32))
|
|
156
|
-
|
|
157
|
-
X_train_values = X_train_tensor.numpy()
|
|
158
|
-
X_test_values = X_test_tensor.numpy()
|
|
106
|
+
return X_train_tensor.numpy(), X_test_tensor.numpy()
|
|
159
107
|
|
|
160
|
-
|
|
161
|
-
label_dtype = torch.float32 if kind == "regression" else torch.int64
|
|
162
|
-
self._train_ds = _PytorchDataset(X_train_values, y_train.values, labels_dtype=label_dtype)
|
|
163
|
-
self._test_ds = _PytorchDataset(X_test_values, y_test.values, labels_dtype=label_dtype)
|
|
108
|
+
return X_train_values, X_test_values
|
|
164
109
|
|
|
165
110
|
@property
|
|
166
111
|
def train_dataset(self) -> Dataset:
|
|
167
|
-
"
|
|
112
|
+
if self._train_ds is None: raise RuntimeError("Dataset not yet created.")
|
|
168
113
|
return self._train_ds
|
|
169
114
|
|
|
170
115
|
@property
|
|
171
116
|
def test_dataset(self) -> Dataset:
|
|
172
|
-
"
|
|
117
|
+
if self._test_ds is None: raise RuntimeError("Dataset not yet created.")
|
|
173
118
|
return self._test_ds
|
|
174
119
|
|
|
175
120
|
@property
|
|
176
121
|
def feature_names(self) -> list[str]:
|
|
177
|
-
"""Returns the list of feature column names."""
|
|
178
122
|
return self._feature_names
|
|
179
123
|
|
|
180
|
-
@property
|
|
181
|
-
def target_name(self) -> str:
|
|
182
|
-
"""Returns the name of the target column."""
|
|
183
|
-
return self._target_name
|
|
184
|
-
|
|
185
124
|
@property
|
|
186
125
|
def id(self) -> Optional[str]:
|
|
187
|
-
"""Returns the object identifier if any."""
|
|
188
126
|
return self._id
|
|
189
|
-
|
|
127
|
+
|
|
190
128
|
@id.setter
|
|
191
129
|
def id(self, dataset_id: str):
|
|
192
|
-
|
|
193
|
-
if not isinstance(dataset_id, str):
|
|
194
|
-
raise ValueError(f"Dataset ID '{type(dataset_id)}' is not a string.")
|
|
130
|
+
if not isinstance(dataset_id, str): raise ValueError("ID must be a string.")
|
|
195
131
|
self._id = dataset_id
|
|
196
132
|
|
|
197
133
|
def dataframes_info(self) -> None:
|
|
198
|
-
"
|
|
199
|
-
print("
|
|
200
|
-
print(f"
|
|
201
|
-
print(
|
|
202
|
-
|
|
203
|
-
print(f" y_test shape: {self._y_test_shape}")
|
|
204
|
-
print("-------------------------------------------")
|
|
205
|
-
|
|
134
|
+
print("--- DataFrame Shapes After Split ---")
|
|
135
|
+
print(f" X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
|
|
136
|
+
print(f" X_test shape: {self._X_test_shape}, y_test shape: {self._y_test_shape}")
|
|
137
|
+
print("------------------------------------")
|
|
138
|
+
|
|
206
139
|
def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
|
|
207
140
|
"""Saves a list of feature names as a text file"""
|
|
208
141
|
save_list_strings(list_strings=self._feature_names,
|
|
209
142
|
directory=directory,
|
|
210
143
|
filename="feature_names",
|
|
211
|
-
verbose=verbose)
|
|
212
|
-
|
|
144
|
+
verbose=verbose)
|
|
145
|
+
|
|
213
146
|
def save_scaler(self, save_dir: Union[str, Path]):
|
|
214
147
|
"""
|
|
215
148
|
Saves the fitted PytorchScaler's state to a .pth file.
|
|
216
149
|
|
|
217
|
-
The filename is automatically generated based on the target name.
|
|
150
|
+
The filename is automatically generated based on the dataset id.
|
|
218
151
|
|
|
219
152
|
Args:
|
|
220
153
|
save_dir (str | Path): The directory where the scaler will be saved.
|
|
221
154
|
"""
|
|
222
|
-
if not self.scaler:
|
|
223
|
-
|
|
224
|
-
return
|
|
225
|
-
|
|
155
|
+
if not self.scaler: raise RuntimeError("No scaler was fitted or provided.")
|
|
156
|
+
if not self.id: raise ValueError("Must set the `id` before saving scaler.")
|
|
226
157
|
save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
sanitized_target = sanitize_filename(self.target_name)
|
|
230
|
-
filename = f"scaler_{sanitized_target}.pth"
|
|
231
|
-
|
|
158
|
+
sanitized_id = sanitize_filename(self.id)
|
|
159
|
+
filename = f"scaler_{sanitized_id}.pth"
|
|
232
160
|
filepath = save_path / filename
|
|
233
161
|
self.scaler.save(filepath)
|
|
162
|
+
_LOGGER.info(f"Scaler for dataset '{self.id}' saved to '{filepath.name}'.")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# Single target dataset
|
|
166
|
+
class DatasetMaker(_BaseDatasetMaker):
|
|
167
|
+
"""
|
|
168
|
+
Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
|
|
169
|
+
|
|
170
|
+
This class takes a DataFrame, automatically splits it into training and
|
|
171
|
+
testing sets, and converts them into PyTorch Datasets. It assumes the
|
|
172
|
+
target variable is the last column. It can also create, apply, and
|
|
173
|
+
save a PytorchScaler for standardizing continuous features.
|
|
174
|
+
|
|
175
|
+
Attributes:
|
|
176
|
+
`scaler` -> PytorchScaler | None
|
|
177
|
+
`train_dataset` -> PyTorch Dataset
|
|
178
|
+
`test_dataset` -> PyTorch Dataset
|
|
179
|
+
`feature_names` -> list[str]
|
|
180
|
+
`target_name` -> str
|
|
181
|
+
`id` -> str
|
|
182
|
+
|
|
183
|
+
The ID can be manually set to any string if needed, it is the target name by default.
|
|
184
|
+
"""
|
|
185
|
+
def __init__(self,
|
|
186
|
+
pandas_df: pandas.DataFrame,
|
|
187
|
+
kind: Literal["regression", "classification"],
|
|
188
|
+
test_size: float = 0.2,
|
|
189
|
+
random_state: int = 42,
|
|
190
|
+
scaler: Optional[PytorchScaler] = None,
|
|
191
|
+
continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
|
|
192
|
+
"""
|
|
193
|
+
Args:
|
|
194
|
+
pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
|
|
195
|
+
kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
|
|
196
|
+
test_size (float): The proportion of the dataset to allocate to the test split.
|
|
197
|
+
random_state (int): The seed for the random number generator for reproducibility.
|
|
198
|
+
scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
|
|
199
|
+
continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
|
|
200
|
+
"""
|
|
201
|
+
super().__init__()
|
|
202
|
+
self.scaler = scaler
|
|
203
|
+
|
|
204
|
+
# --- 1. Identify features and target (single-target logic) ---
|
|
205
|
+
features = pandas_df.iloc[:, :-1]
|
|
206
|
+
target = pandas_df.iloc[:, -1]
|
|
207
|
+
self._feature_names = features.columns.tolist()
|
|
208
|
+
self._target_name = str(target.name)
|
|
209
|
+
self._id = self._target_name
|
|
210
|
+
|
|
211
|
+
# --- 2. Split ---
|
|
212
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
213
|
+
features, target, test_size=test_size, random_state=random_state
|
|
214
|
+
)
|
|
215
|
+
self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
|
|
216
|
+
self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
|
|
217
|
+
|
|
218
|
+
label_dtype = torch.float32 if kind == "regression" else torch.int64
|
|
219
|
+
|
|
220
|
+
# --- 3. Scale ---
|
|
221
|
+
X_train_final, X_test_final = self._prepare_scaler(
|
|
222
|
+
X_train, y_train, X_test, label_dtype, continuous_feature_columns
|
|
223
|
+
)
|
|
224
|
+
|
|
225
|
+
# --- 4. Create Datasets ---
|
|
226
|
+
self._train_ds = _PytorchDataset(X_train_final, y_train.values, label_dtype)
|
|
227
|
+
self._test_ds = _PytorchDataset(X_test_final, y_test.values, label_dtype)
|
|
228
|
+
|
|
229
|
+
@property
|
|
230
|
+
def target_name(self) -> str:
|
|
231
|
+
return self._target_name
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# --- New Multi-Target Class ---
|
|
235
|
+
class DatasetMakerMulti(_BaseDatasetMaker):
|
|
236
|
+
"""
|
|
237
|
+
Dataset maker for pre-processed, numerical pandas DataFrames with multiple target columns.
|
|
238
|
+
|
|
239
|
+
This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
|
|
240
|
+
"""
|
|
241
|
+
def __init__(self,
|
|
242
|
+
pandas_df: pandas.DataFrame,
|
|
243
|
+
target_columns: List[str],
|
|
244
|
+
test_size: float = 0.2,
|
|
245
|
+
random_state: int = 42,
|
|
246
|
+
scaler: Optional[PytorchScaler] = None,
|
|
247
|
+
continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
|
|
248
|
+
"""
|
|
249
|
+
Args:
|
|
250
|
+
pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
|
|
251
|
+
target_columns (list[str]): List of target column names.
|
|
252
|
+
test_size (float): The proportion of the dataset to allocate to the test split.
|
|
253
|
+
random_state (int): The seed for the random number generator for reproducibility.
|
|
254
|
+
scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
|
|
255
|
+
continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
|
|
256
|
+
"""
|
|
257
|
+
super().__init__()
|
|
258
|
+
self.scaler = scaler
|
|
259
|
+
|
|
260
|
+
self._target_names = target_columns
|
|
261
|
+
self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
|
|
262
|
+
features = pandas_df[self._feature_names]
|
|
263
|
+
target = pandas_df[self._target_names]
|
|
264
|
+
|
|
265
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
266
|
+
features, target, test_size=test_size, random_state=random_state
|
|
267
|
+
)
|
|
268
|
+
self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
|
|
269
|
+
self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
|
|
270
|
+
|
|
271
|
+
label_dtype = torch.float32
|
|
272
|
+
|
|
273
|
+
X_train_final, X_test_final = self._prepare_scaler(
|
|
274
|
+
X_train, y_train, X_test, label_dtype, continuous_feature_columns
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
self._train_ds = _PytorchDataset(X_train_final, y_train, label_dtype)
|
|
278
|
+
self._test_ds = _PytorchDataset(X_test_final, y_test, label_dtype)
|
|
279
|
+
|
|
280
|
+
@property
|
|
281
|
+
def target_names(self) -> list[str]:
|
|
282
|
+
return self._target_names
|
|
234
283
|
|
|
235
284
|
|
|
236
285
|
# --- Private Base Class ---
|
ml_tools/ML_evaluation.py
CHANGED
|
@@ -249,8 +249,11 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
|
|
|
249
249
|
plt.close(fig_hist)
|
|
250
250
|
|
|
251
251
|
|
|
252
|
-
def shap_summary_plot(model,
|
|
253
|
-
|
|
252
|
+
def shap_summary_plot(model,
|
|
253
|
+
background_data: Union[torch.Tensor,np.ndarray],
|
|
254
|
+
instances_to_explain: Union[torch.Tensor,np.ndarray],
|
|
255
|
+
feature_names: Optional[list[str]],
|
|
256
|
+
save_dir: Union[str, Path]):
|
|
254
257
|
"""
|
|
255
258
|
Calculates SHAP values and saves summary plots and data.
|
|
256
259
|
|