dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/RECORD +14 -14
- ml_tools/ML_datasetmaster.py +144 -63
- ml_tools/ML_models.py +119 -55
- ml_tools/ML_optimization.py +49 -36
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_schema.py +19 -0
- ml_tools/data_exploration.py +75 -46
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +1 -2
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-13.1.0.dist-info}/RECORD
CHANGED

@@ -1,41 +1,41 @@
-dragon_ml_toolbox-13.
-dragon_ml_toolbox-13.
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
 ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
 ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
 ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
 ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
 ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
-ml_tools/ML_datasetmaster.py,sha256=
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
 ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
 ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
 ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
-ml_tools/ML_models.py,sha256=
-ml_tools/ML_optimization.py,sha256=
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
 ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
-ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
 ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
 ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
-ml_tools/PSO_optimization.py,sha256=
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
 ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
 ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
 ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
-ml_tools/data_exploration.py,sha256=
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
 ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
 ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
 ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
 ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
 ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
-ml_tools/serde.py,sha256=
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
 ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-13.
-dragon_ml_toolbox-13.
-dragon_ml_toolbox-13.
-dragon_ml_toolbox-13.
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
ml_tools/ML_datasetmaster.py
CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
            self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-
+            # Fallback for other types (though your type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
 
         self._feature_names = feature_names
         self._target_names = target_names
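For reference, a minimal standalone sketch (not part of the diff; the data is hypothetical) of the conversion rule these branches implement: torch.tensor accepts NumPy arrays directly, while pandas objects go through .to_numpy() first, which is what lets DataFrame labels flow through for multi-target datasets.

import pandas
import torch

features = pandas.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
labels = pandas.DataFrame({"t1": [0.0, 1.0], "t2": [1.0, 0.0]})  # multi-target labels

x = torch.tensor(features.to_numpy(), dtype=torch.float32)  # shape (2, 2)
y = torch.tensor(labels.to_numpy(), dtype=torch.float32)    # shape (2, 2)
assert x.shape == (2, 2) and y.shape == (2, 2)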
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self,
-
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
 
         X_train_values = X_train.values
         X_test_values = X_test.values
 
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
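A standalone sketch of the name-to-index mapping above, with hypothetical column names; list.index raises ValueError for a missing name, which is exactly the failure the try/except logs and re-raises.

train_cols_list = ["age", "color", "price"]   # hypothetical training columns
continuous_feature_names = ("age", "price")   # hypothetical schema field

continuous_feature_indices = [train_cols_list.index(name) for name in continuous_feature_names]
print(continuous_feature_indices)  # [0, 2]
# train_cols_list.index("weight") would raise ValueError -> logged and re-raised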
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits
-
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame, and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler
 
-        # --- 1. Identify features
-
-
-
-
-
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
 
-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype,
+            X_train, y_train, X_test, label_dtype, schema
         )
 
-        # ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train
-        self._test_ds = _PytorchDataset(X_test_final, y_test
-
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
 
-# ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame,
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+            For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+            This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler
 
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-
-
-
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype,
+            X_train, y_train, X_test, label_dtype, schema
         )
 
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
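Taken together, a usage sketch of the new schema-driven API. Data, column names, and the top-level import paths are hypothetical; the FeatureSchema field names follow the reconstruction logic shown in the ML_models.py diff below.

import pandas
from ml_tools._schema import FeatureSchema
from ml_tools.ML_datasetmaster import DatasetMaker

df = pandas.DataFrame({
    "age": [25.0, 32.0, 47.0, 51.0],
    "color": [0, 1, 2, 1],                    # label-encoded categorical
    "target": [0, 1, 1, 0],
})
schema = FeatureSchema(
    feature_names=("age", "color"),
    continuous_feature_names=("age",),
    categorical_feature_names=("color",),
    categorical_index_map={1: 3},             # column 1 holds 3 categories
    categorical_mappings={"color": {"red": 0, "green": 1, "blue": 2}},
)

# "target" is inferred as the single column not named in the schema.
maker = DatasetMaker(pandas_df=df, schema=schema, kind="classification")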
ml_tools/ML_models.py
CHANGED
@@ -8,6 +8,7 @@ from ._logger import _LOGGER
 from .path_manager import make_fullpath
 from ._script_info import _script_info
 from .keys import PytorchModelArchitectureKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -298,76 +299,59 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     """
     A Transformer-based model for tabular data tasks.
 
-    This model uses a Feature Tokenizer to convert all input features into a
+    This model uses a Feature Tokenizer to convert all input features into a
+    sequence of embeddings, prepends a [CLS] token, and processes the
     sequence with a standard Transformer Encoder.
     """
     def __init__(self, *,
-
+                 schema: FeatureSchema,
                  out_targets: int,
-                 categorical_index_map: Dict[int, int],
                  embedding_dim: int = 32,
                  num_heads: int = 8,
                  num_layers: int = 6,
                  dropout: float = 0.1):
         """
         Args:
-
-
-
-
-
-
-
-
-
-
-
-
-
-        The model requires a specific input format. All columns in the input DataFrame must be numerical, but they are treated differently based on the
-        provided index lists.
-
-        **Nominal Categorical Features** (e.g., 'City', 'Color'): Should **NOT** be one-hot encoded.
-        Instead, convert them to integer codes (label encoding). You must then provide a dictionary mapping their column indices to
-        their cardinality (the number of unique categories) via the `categorical_map` parameter.
-
-        **Ordinal & Binary Features** (e.g., 'Low/Medium/High', 'True/False'): Should be treated as **numerical**. Map them to numbers that
-        represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should **NOT** be included in the
-        `categorical_map` parameter.
-
-        **Standard Numerical and Continuous Features** (e.g., 'Age', 'Price'): It is highly recommended to scale them before training.
+            schema (FeatureSchema):
+                The definitive schema object created by `data_exploration.finalize_feature_schema()`.
+            out_targets (int):
+                Number of output targets (1 for regression).
+            embedding_dim (int):
+                The dimension for all feature embeddings. Must be divisible
+                by num_heads.
+            num_heads (int):
+                The number of heads in the multi-head attention mechanism.
+            num_layers (int):
+                The number of sub-encoder-layers in the transformer encoder.
+            dropout (float):
+                The dropout value.
         """
         super().__init__()
 
+        # --- Get info from schema ---
+        in_features = len(schema.feature_names)
+        categorical_index_map = schema.categorical_index_map
+
         # --- Validation ---
-        if categorical_index_map and max(categorical_index_map.keys()) >= in_features:
+        if categorical_index_map and (max(categorical_index_map.keys()) >= in_features):
             _LOGGER.error(f"A categorical index ({max(categorical_index_map.keys())}) is out of bounds for the provided input features ({in_features}).")
             raise ValueError()
 
-        # --- Derive numerical indices ---
-        all_indices = set(range(in_features))
-        categorical_indices_set = set(categorical_index_map.keys())
-        numerical_indices = sorted(list(all_indices - categorical_indices_set))
-
         # --- Save configuration ---
-        self.
+        self.schema = schema # <-- Save the whole schema
         self.out_targets = out_targets
-        self.numerical_indices = numerical_indices
-        self.categorical_map = categorical_index_map
         self.embedding_dim = embedding_dim
         self.num_heads = num_heads
         self.num_layers = num_layers
         self.dropout = dropout
 
-        # --- 1. Feature Tokenizer ---
+        # --- 1. Feature Tokenizer (now takes the schema) ---
         self.tokenizer = _FeatureTokenizer(
-
-            categorical_map=categorical_index_map,
+            schema=schema,
             embedding_dim=embedding_dim
         )
 
         # --- 2. CLS Token ---
-        # A learnable token that will be prepended to the sequence.
         self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))
 
         # --- 3. Transformer Encoder ---
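A hedged instantiation sketch for the schema-driven constructor, reusing the hypothetical `schema` from the DatasetMaker sketch above. The forward-pass call and output shape are assumptions based on the tokenizer/[CLS]/prediction-head design, not confirmed by the diff.

import torch
from ml_tools.ML_models import TabularTransformer

model = TabularTransformer(schema=schema, out_targets=1,
                           embedding_dim=32, num_heads=8)  # 32 % 8 == 0
x = torch.tensor([[25.0, 2.0], [47.0, 0.0]])  # rows of [age, color-code]
out = model(x)                                # expected shape: (2, 1)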
@@ -416,21 +400,87 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
 
     def get_architecture_config(self) -> Dict[str, Any]:
         """Returns the full configuration of the model."""
+        # Deconstruct schema into a JSON-friendly dict
+        # Tuples are saved as lists
+        schema_dict = {
+            'feature_names': self.schema.feature_names,
+            'continuous_feature_names': self.schema.continuous_feature_names,
+            'categorical_feature_names': self.schema.categorical_feature_names,
+            'categorical_index_map': self.schema.categorical_index_map,
+            'categorical_mappings': self.schema.categorical_mappings
+        }
+
         return {
-            '
+            'schema_dict': schema_dict,
             'out_targets': self.out_targets,
-            'categorical_map': self.categorical_map,
             'embedding_dim': self.embedding_dim,
             'num_heads': self.num_heads,
             'num_layers': self.num_layers,
             'dropout': self.dropout
         }
+
+    @classmethod
+    def load(cls: type, file_or_dir: Union[str, Path], verbose: bool = True) -> nn.Module:
+        """Loads a model architecture from a JSON file."""
+        user_path = make_fullpath(file_or_dir)
+
+        if user_path.is_dir():
+            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+            target_path = make_fullpath(user_path / json_filename, enforce="file")
+        elif user_path.is_file():
+            target_path = user_path
+        else:
+            _LOGGER.error(f"Invalid path: '{file_or_dir}'")
+            raise IOError()
+
+        with open(target_path, 'r') as f:
+            saved_data = json.load(f)
+
+        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
+        config = saved_data[PytorchModelArchitectureKeys.CONFIG]
+
+        if saved_class_name != cls.__name__:
+            _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
+            raise ValueError()
+
+        # --- RECONSTRUCTION LOGIC ---
+        if 'schema_dict' not in config:
+            _LOGGER.error("Invalid architecture file: missing 'schema_dict'. This file may be from an older version.")
+            raise ValueError("Missing 'schema_dict' in config.")
+
+        schema_data = config.pop('schema_dict')
+
+        # Re-hydrate the categorical_index_map
+        # JSON saves all dict keys as strings, so we must convert them back to int.
+        raw_index_map = schema_data['categorical_index_map']
+        if raw_index_map is not None:
+            rehydrated_index_map = {int(k): v for k, v in raw_index_map.items()}
+        else:
+            rehydrated_index_map = None
+
+        # Re-hydrate the FeatureSchema object
+        # JSON deserializes tuples as lists, so we must convert them back.
+        schema = FeatureSchema(
+            feature_names=tuple(schema_data['feature_names']),
+            continuous_feature_names=tuple(schema_data['continuous_feature_names']),
+            categorical_feature_names=tuple(schema_data['categorical_feature_names']),
+            categorical_index_map=rehydrated_index_map,
+            categorical_mappings=schema_data['categorical_mappings']
+        )
+
+        config['schema'] = schema
+        # --- End Reconstruction ---
+
+        model = cls(**config)
+        if verbose:
+            _LOGGER.info(f"Successfully loaded architecture for '{saved_class_name}'")
+        return model
 
     def __repr__(self) -> str:
         """Returns the developer-friendly string representation of the model."""
         # Build the architecture string part-by-part
         parts = [
-            f"Tokenizer(features={self.
+            f"Tokenizer(features={len(self.schema.feature_names)}, dim={self.embedding_dim})",
             "[CLS]",
             f"TransformerEncoder(layers={self.num_layers}, heads={self.num_heads})",
             f"PredictionHead(outputs={self.out_targets})"
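A standalone demonstration of the two JSON round-trip losses the reconstruction logic corrects: integer dict keys come back as strings, and tuples come back as lists (plain Python, no package imports needed).

import json

config = {"categorical_index_map": {1: 3}, "feature_names": ("age", "color")}
restored = json.loads(json.dumps(config))

print(restored["categorical_index_map"])  # {'1': 3}          <- keys became strings
print(restored["feature_names"])          # ['age', 'color']  <- tuple became a list

rehydrated = {int(k): v for k, v in restored["categorical_index_map"].items()}
print(rehydrated)                         # {1: 3}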
@@ -443,29 +493,41 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
 
 class _FeatureTokenizer(nn.Module):
     """
-    Transforms raw numerical and categorical features from any column order
+    Transforms raw numerical and categorical features from any column order
+    into a sequence of embeddings.
     """
     def __init__(self,
-
-                 categorical_map: Dict[int, int],
+                 schema: FeatureSchema,
                  embedding_dim: int):
         """
         Args:
-
-
-            embedding_dim (int):
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            embedding_dim (int):
+                The dimension for all feature embeddings.
         """
         super().__init__()
 
-        #
-
-
+        # --- Get info from schema ---
+        categorical_map = schema.categorical_index_map
+
+        if categorical_map:
+            # Unpack the dictionary into separate lists
+            self.categorical_indices = list(categorical_map.keys())
+            cardinalities = list(categorical_map.values())
+        else:
+            self.categorical_indices = []
+            cardinalities = []
+
+        # Derive numerical indices by finding what's not categorical
+        all_indices = set(range(len(schema.feature_names)))
+        categorical_indices_set = set(self.categorical_indices)
+        self.numerical_indices = sorted(list(all_indices - categorical_indices_set))
 
-        self.numerical_indices = numerical_indices
         self.embedding_dim = embedding_dim
 
         # A learnable embedding for each numerical feature
-        self.numerical_embeddings = nn.Parameter(torch.randn(len(numerical_indices), embedding_dim))
+        self.numerical_embeddings = nn.Parameter(torch.randn(len(self.numerical_indices), embedding_dim))
 
         # A standard embedding layer for each categorical feature
         self.categorical_embeddings = nn.ModuleList(
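The set-difference rule used above, as a minimal sketch with hypothetical sizes:

feature_names = ("age", "color", "price", "city")  # hypothetical schema entry
categorical_map = {1: 3, 3: 10}                    # column index -> cardinality

all_indices = set(range(len(feature_names)))
numerical_indices = sorted(all_indices - set(categorical_map.keys()))
print(numerical_indices)  # [0, 2]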
@@ -487,6 +549,8 @@ class _FeatureTokenizer(nn.Module):
         # Process categorical features
         categorical_tokens = []
         for i, embed_layer in enumerate(self.categorical_embeddings):
+            # x_categorical[:, i] selects the i-th categorical column
+            # (e.g., all values for the 'color' feature)
             token = embed_layer(x_categorical[:, i]).unsqueeze(1)
             categorical_tokens.append(token)
 
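And a minimal sketch of what the commented loop computes, with hypothetical batch size and cardinalities: each nn.Embedding maps one integer column to a (batch, 1, embedding_dim) token, and the tokens concatenate along the sequence dimension.

import torch
import torch.nn as nn

batch, embedding_dim = 4, 8
x_categorical = torch.randint(0, 3, (batch, 2))             # two label-encoded columns
embeddings = nn.ModuleList([nn.Embedding(3, embedding_dim),
                            nn.Embedding(10, embedding_dim)])

categorical_tokens = []
for i, embed_layer in enumerate(embeddings):
    token = embed_layer(x_categorical[:, i]).unsqueeze(1)   # (batch, 1, embedding_dim)
    categorical_tokens.append(token)
print(torch.cat(categorical_tokens, dim=1).shape)           # torch.Size([4, 2, 8])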