dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.0.0
+Version: 13.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
RECORD CHANGED
@@ -1,41 +1,41 @@
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
 ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
 ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
 ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
 ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
 ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
-ml_tools/ML_datasetmaster.py,sha256=kedCGneR3S2zui0_JFZN6TBL5e69XWkdpkE_QohyqSM,31433
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
 ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
 ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
 ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
-ml_tools/ML_models.py,sha256=G64NPhYZfYvHTIUwkIrMrNLgfDTKJwqdc8jwesPqB9E,28090
-ml_tools/ML_optimization.py,sha256=es3TlQbY7RYgJMZnznkjYGbUxFnAqzZxE_g3_qLK9Q8,22960
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
 ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
-ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
 ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
 ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
-ml_tools/PSO_optimization.py,sha256=fVHeemqilBS0zrGV25E5yKwDlGdd2ZKa18d8CZ6Q6Fk,22961
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
 ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
 ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
 ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
-ml_tools/data_exploration.py,sha256=haddQFsXAWzuf84NLItcZ4Q7vzN3YWjFoh7lPlWUczo,50679
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
 ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
 ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
 ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
 ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CRw,13521
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
 ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
-ml_tools/serde.py,sha256=ll2mVC0sO2jIEdG3K6xMcgEN13N4YSb8VjviGvw_ers,4949
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
 ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-13.0.0.dist-info/METADATA,sha256=trY1fFyTTXLS6TZdrJXxq4_YMPjEZhKCilzCg6qFxzw,6166
-dragon_ml_toolbox-13.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-13.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-13.0.0.dist-info/RECORD,,
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
ml_tools/ML_datasetmaster.py CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema


 __all__ = [
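The new `FeatureSchema` import is the pivot of this release. Inferring its shape from how the rest of the diff consumes it (and from the constructor call in `TabularTransformer.load()` further down), a schema instance bundles the feature names with the categorical metadata. A hedged sketch with hypothetical toy columns:

    from ml_tools._schema import FeatureSchema

    schema = FeatureSchema(
        feature_names=("age", "price", "color"),      # all features, in column order
        continuous_feature_names=("age", "price"),    # columns the PytorchScaler may standardize
        categorical_feature_names=("color",),
        categorical_index_map={2: 3},                 # column index -> cardinality
        categorical_mappings={"color": {"red": 0, "green": 1, "blue": 2}},
    )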
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):

         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else:  # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)

         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (though the type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)

         self._feature_names = feature_names
         self._target_names = target_names
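With `labels` broadened to accept a DataFrame, multi-target frames now pass straight through `to_numpy()`. A minimal sketch of the new path (toy data; `_PytorchDataset` is an internal class, used directly here only to illustrate):

    import pandas, torch
    from ml_tools.ML_datasetmaster import _PytorchDataset

    X = pandas.DataFrame({"age": [21.0, 34.0], "price": [9.5, 3.2]})
    y = pandas.DataFrame({"t1": [0.1, 0.7], "t2": [1.2, 0.3]})   # DataFrame labels, new in 13.1.0

    ds = _PytorchDataset(X, y, labels_dtype=torch.float32)
    print(ds.features.shape, ds.labels.shape)   # torch.Size([2, 2]) torch.Size([2, 2])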
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-        """Internal helper to fit and apply a PytorchScaler."""
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
+
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

         X_train_values = X_train.values
         X_test_values = X_test.values

+        # continuous_feature_indices is derived from the schema above
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
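The schema-driven lookup replaces the old names-or-integers argument with plain list arithmetic; a quick illustration with the hypothetical columns from above:

    train_cols_list = ["age", "price", "color"]    # X_train.columns.to_list()
    continuous = ("age", "price")                  # schema.continuous_feature_names
    indices = [train_cols_list.index(n) for n in continuous]
    print(indices)                                 # [0, 1] -> positions the PytorchScaler standardizes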
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits the data,
+    and converts it into PyTorch Datasets. It can also create and apply a PytorchScaler using the schema.

     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and a single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler

-        # --- 1. Identify features and target (single-target logic) ---
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_names = [str(target.name)]
-        self._id = self._target_names[0]
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]

-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

         label_dtype = torch.float32 if kind == "regression" else torch.int64

-        # --- 3. Scale ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
         )

-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+

-# --- New Multi-Target Class ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.

-    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Target dtype is torch.float32.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            target_columns (list[str]): List of target column names.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+            For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+            This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler

+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-        features = pandas_df[self._feature_names]
-        target = pandas_df[self._target_names]
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]

         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

-        label_dtype = torch.float32
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32

+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
         )

+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

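Taken together, the single-target maker now infers the target as whatever column the schema does not claim. A hedged usage sketch, reusing the hypothetical schema object from above:

    import pandas
    from ml_tools.ML_datasetmaster import DatasetMaker

    df = pandas.DataFrame({
        "age":    [21.0, 34.0, 47.0, 19.0],
        "price":  [9.5, 3.2, 7.7, 1.1],
        "color":  [0, 2, 1, 0],              # label-encoded categorical
        "target": [1.2, 3.4, 5.6, 0.9],
    })

    # Target inference: {age, price, color, target} - {age, price, color} == {"target"}
    maker = DatasetMaker(df, schema=schema, kind="regression", test_size=0.25)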
ml_tools/ML_models.py CHANGED
@@ -8,6 +8,7 @@ from ._logger import _LOGGER
 from .path_manager import make_fullpath
 from ._script_info import _script_info
 from .keys import PytorchModelArchitectureKeys
+from ._schema import FeatureSchema


 __all__ = [
@@ -298,76 +299,59 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     """
     A Transformer-based model for tabular data tasks.

-    This model uses a Feature Tokenizer to convert all input features into a sequence of embeddings, prepends a [CLS] token, and processes the
+    This model uses a Feature Tokenizer to convert all input features into a
+    sequence of embeddings, prepends a [CLS] token, and processes the
     sequence with a standard Transformer Encoder.
     """
     def __init__(self, *,
-                 in_features: int,
+                 schema: FeatureSchema,
                  out_targets: int,
-                 categorical_index_map: Dict[int, int],
                  embedding_dim: int = 32,
                  num_heads: int = 8,
                  num_layers: int = 6,
                  dropout: float = 0.1):
         """
         Args:
-            in_features (int): The total number of columns in the input data (features).
-            out_targets (int): Number of output targets (1 for regression).
-            categorical_index_map (Dict[int, int]): Maps categorical column index to its cardinality (number of unique categories).
-            embedding_dim (int): The dimension for all feature embeddings. Must be divisible by num_heads.
-            num_heads (int): The number of heads in the multi-head attention mechanism.
-            num_layers (int): The number of sub-encoder-layers in the transformer encoder.
-            dropout (float): The dropout value.
-
-        Note:
-            - All arguments are keyword-only to promote clarity.
-            - Column indices start at 0.
-
-        ### Data Preparation
-        The model requires a specific input format. All columns in the input DataFrame must be numerical, but they are treated differently based on the
-        provided index lists.
-
-        **Nominal Categorical Features** (e.g., 'City', 'Color'): Should **NOT** be one-hot encoded.
-        Instead, convert them to integer codes (label encoding). You must then provide a dictionary mapping their column indices to
-        their cardinality (the number of unique categories) via the `categorical_map` parameter.
-
-        **Ordinal & Binary Features** (e.g., 'Low/Medium/High', 'True/False'): Should be treated as **numerical**. Map them to numbers that
-        represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should **NOT** be included in the
-        `categorical_map` parameter.
-
-        **Standard Numerical and Continuous Features** (e.g., 'Age', 'Price'): It is highly recommended to scale them before training.
+            schema (FeatureSchema):
+                The definitive schema object created by `data_exploration.finalize_feature_schema()`.
+            out_targets (int):
+                Number of output targets (1 for regression).
+            embedding_dim (int):
+                The dimension for all feature embeddings. Must be divisible
+                by num_heads.
+            num_heads (int):
+                The number of heads in the multi-head attention mechanism.
+            num_layers (int):
+                The number of sub-encoder-layers in the transformer encoder.
+            dropout (float):
+                The dropout value.
         """
         super().__init__()

+        # --- Get info from schema ---
+        in_features = len(schema.feature_names)
+        categorical_index_map = schema.categorical_index_map
+
         # --- Validation ---
-        if categorical_index_map and max(categorical_index_map.keys()) >= in_features:
+        if categorical_index_map and (max(categorical_index_map.keys()) >= in_features):
             _LOGGER.error(f"A categorical index ({max(categorical_index_map.keys())}) is out of bounds for the provided input features ({in_features}).")
             raise ValueError()

-        # --- Derive numerical indices ---
-        all_indices = set(range(in_features))
-        categorical_indices_set = set(categorical_index_map.keys())
-        numerical_indices = sorted(list(all_indices - categorical_indices_set))
-
         # --- Save configuration ---
-        self.in_features = in_features
+        self.schema = schema  # <-- Save the whole schema
         self.out_targets = out_targets
-        self.numerical_indices = numerical_indices
-        self.categorical_map = categorical_index_map
         self.embedding_dim = embedding_dim
         self.num_heads = num_heads
         self.num_layers = num_layers
         self.dropout = dropout

-        # --- 1. Feature Tokenizer ---
+        # --- 1. Feature Tokenizer (now takes the schema) ---
         self.tokenizer = _FeatureTokenizer(
-            numerical_indices=numerical_indices,
-            categorical_map=categorical_index_map,
+            schema=schema,
             embedding_dim=embedding_dim
         )

         # --- 2. CLS Token ---
-        # A learnable token that will be prepended to the sequence.
         self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))

         # --- 3. Transformer Encoder ---
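A hedged construction sketch for the new keyword-only signature, reusing the hypothetical schema object from above (toy hyperparameters; embedding_dim must stay divisible by num_heads):

    model = TabularTransformer(
        schema=schema,       # supplies in_features and the categorical index map
        out_targets=1,       # regression
        embedding_dim=32,    # 32 / 8 heads = 4 dims per head
        num_heads=8,
    )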
@@ -416,21 +400,87 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

     def get_architecture_config(self) -> Dict[str, Any]:
         """Returns the full configuration of the model."""
+        # Deconstruct schema into a JSON-friendly dict
+        # Tuples are saved as lists
+        schema_dict = {
+            'feature_names': self.schema.feature_names,
+            'continuous_feature_names': self.schema.continuous_feature_names,
+            'categorical_feature_names': self.schema.categorical_feature_names,
+            'categorical_index_map': self.schema.categorical_index_map,
+            'categorical_mappings': self.schema.categorical_mappings
+        }
+
         return {
-            'in_features': self.in_features,
+            'schema_dict': schema_dict,
             'out_targets': self.out_targets,
-            'categorical_map': self.categorical_map,
             'embedding_dim': self.embedding_dim,
             'num_heads': self.num_heads,
             'num_layers': self.num_layers,
             'dropout': self.dropout
         }
+
+    @classmethod
+    def load(cls: type, file_or_dir: Union[str, Path], verbose: bool = True) -> nn.Module:
+        """Loads a model architecture from a JSON file."""
+        user_path = make_fullpath(file_or_dir)
+
+        if user_path.is_dir():
+            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+            target_path = make_fullpath(user_path / json_filename, enforce="file")
+        elif user_path.is_file():
+            target_path = user_path
+        else:
+            _LOGGER.error(f"Invalid path: '{file_or_dir}'")
+            raise IOError()
+
+        with open(target_path, 'r') as f:
+            saved_data = json.load(f)
+
+        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
+        config = saved_data[PytorchModelArchitectureKeys.CONFIG]
+
+        if saved_class_name != cls.__name__:
+            _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
+            raise ValueError()
+
+        # --- RECONSTRUCTION LOGIC ---
+        if 'schema_dict' not in config:
+            _LOGGER.error("Invalid architecture file: missing 'schema_dict'. This file may be from an older version.")
+            raise ValueError("Missing 'schema_dict' in config.")
+
+        schema_data = config.pop('schema_dict')
+
+        # Re-hydrate the categorical_index_map
+        # JSON saves all dict keys as strings, so we must convert them back to int.
+        raw_index_map = schema_data['categorical_index_map']
+        if raw_index_map is not None:
+            rehydrated_index_map = {int(k): v for k, v in raw_index_map.items()}
+        else:
+            rehydrated_index_map = None
+
+        # Re-hydrate the FeatureSchema object
+        # JSON deserializes tuples as lists, so we must convert them back.
+        schema = FeatureSchema(
+            feature_names=tuple(schema_data['feature_names']),
+            continuous_feature_names=tuple(schema_data['continuous_feature_names']),
+            categorical_feature_names=tuple(schema_data['categorical_feature_names']),
+            categorical_index_map=rehydrated_index_map,
+            categorical_mappings=schema_data['categorical_mappings']
+        )
+
+        config['schema'] = schema
+        # --- End Reconstruction ---
+
+        model = cls(**config)
+        if verbose:
+            _LOGGER.info(f"Successfully loaded architecture for '{saved_class_name}'")
+        return model

     def __repr__(self) -> str:
         """Returns the developer-friendly string representation of the model."""
         # Build the architecture string part-by-part
         parts = [
-            f"Tokenizer(features={self.in_features}, dim={self.embedding_dim})",
+            f"Tokenizer(features={len(self.schema.feature_names)}, dim={self.embedding_dim})",
             "[CLS]",
             f"TransformerEncoder(layers={self.num_layers}, heads={self.num_heads})",
             f"PredictionHead(outputs={self.out_targets})"
@@ -443,29 +493,41 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

 class _FeatureTokenizer(nn.Module):
     """
-    Transforms raw numerical and categorical features from any column order into a sequence of embeddings.
+    Transforms raw numerical and categorical features from any column order
+    into a sequence of embeddings.
     """
     def __init__(self,
-                 numerical_indices: List[int],
-                 categorical_map: Dict[int, int],
+                 schema: FeatureSchema,
                  embedding_dim: int):
         """
         Args:
-            numerical_indices (List[int]): A list of column indices for the numerical features.
-            categorical_map (Dict[int, int]): A dictionary mapping each categorical column index to its cardinality (number of unique categories).
-            embedding_dim (int): The dimension for all feature embeddings.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            embedding_dim (int):
+                The dimension for all feature embeddings.
         """
         super().__init__()

-        # Unpack the dictionary into separate lists for indices and cardinalities
-        self.categorical_indices = list(categorical_map.keys())
-        cardinalities = list(categorical_map.values())
+        # --- Get info from schema ---
+        categorical_map = schema.categorical_index_map
+
+        if categorical_map:
+            # Unpack the dictionary into separate lists
+            self.categorical_indices = list(categorical_map.keys())
+            cardinalities = list(categorical_map.values())
+        else:
+            self.categorical_indices = []
+            cardinalities = []
+
+        # Derive numerical indices by finding what's not categorical
+        all_indices = set(range(len(schema.feature_names)))
+        categorical_indices_set = set(self.categorical_indices)
+        self.numerical_indices = sorted(list(all_indices - categorical_indices_set))

-        self.numerical_indices = numerical_indices
         self.embedding_dim = embedding_dim

         # A learnable embedding for each numerical feature
-        self.numerical_embeddings = nn.Parameter(torch.randn(len(numerical_indices), embedding_dim))
+        self.numerical_embeddings = nn.Parameter(torch.randn(len(self.numerical_indices), embedding_dim))

         # A standard embedding layer for each categorical feature
         self.categorical_embeddings = nn.ModuleList(
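The tokenizer now performs the same set arithmetic that previously lived in `TabularTransformer.__init__`; with the hypothetical schema from above:

    feature_names = ("age", "price", "color")
    categorical_map = {2: 3}                   # "color" at index 2, cardinality 3

    numerical = sorted(set(range(len(feature_names))) - set(categorical_map.keys()))
    print(numerical)                           # [0, 1] -> "age" and "price" get numerical embeddings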
@@ -487,6 +549,8 @@ class _FeatureTokenizer(nn.Module):
         # Process categorical features
         categorical_tokens = []
         for i, embed_layer in enumerate(self.categorical_embeddings):
+            # x_categorical[:, i] selects the i-th categorical column
+            # (e.g., all values for the 'color' feature)
             token = embed_layer(x_categorical[:, i]).unsqueeze(1)
             categorical_tokens.append(token)
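To make the new comment concrete, each loop iteration yields one (batch, 1, embedding_dim) token per categorical feature; a minimal standalone sketch with toy sizes and plain torch:

    import torch
    import torch.nn as nn

    embed_layer = nn.Embedding(num_embeddings=3, embedding_dim=32)   # cardinality 3
    x_categorical = torch.randint(0, 3, (4, 1))                      # (batch, n_categorical)

    token = embed_layer(x_categorical[:, 0]).unsqueeze(1)
    print(token.shape)                                               # torch.Size([4, 1, 32])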