dragon-ml-toolbox 13.0.0__py3-none-any.whl → 13.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.0.0
+Version: 13.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
RECORD CHANGED
@@ -1,41 +1,41 @@
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
-dragon_ml_toolbox-13.0.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-13.1.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=iy2r_R7wjzsCbz_Q_jMsp_jfZ6oP8XW9QhwzRBH0mGY,1904
 ml_tools/ETL_cleaning.py,sha256=2VBRllV8F-ZiPylPp8Az2gwn5ztgazN0BH5OKnRUhV0,20402
 ml_tools/ETL_engineering.py,sha256=KfYqgsxupAx6e_TxwO1LZXeu5mFkIhVXJrNjP3CzIZc,54927
 ml_tools/GUI_tools.py,sha256=Va6ig-dHULPVRwQYYtH3fvY5XPIoqRcJpRW8oXC55Hw,45413
 ml_tools/MICE_imputation.py,sha256=X273Qlgoqqg7KTmoKd75YDyAPB0UIbTzGP3xsCmRh3E,11717
 ml_tools/ML_callbacks.py,sha256=elD2Yr030sv_6gX_m9GVd6HTyrbmt34nFS8lrgS4HtM,15808
-ml_tools/ML_datasetmaster.py,sha256=kedCGneR3S2zui0_JFZN6TBL5e69XWkdpkE_QohyqSM,31433
+ml_tools/ML_datasetmaster.py,sha256=7QJnOM6GWFklKt2fiukITM3DK49i3ThK8wazb5szwpE,34396
 ml_tools/ML_evaluation.py,sha256=3u5dOhS77gn3kAshKr2GwSa5xZBF0YM77ZkFevqNPvA,18528
 ml_tools/ML_evaluation_multi.py,sha256=L6Ub_uObXsI7ToVCF6DtmAFekHRcga5wWMOnRYRR-BY,16121
 ml_tools/ML_inference.py,sha256=yq2gdN6s_OUYC5ZLQrIJC5BA5H33q8UKODXwb-_0M2c,23549
-ml_tools/ML_models.py,sha256=G64NPhYZfYvHTIUwkIrMrNLgfDTKJwqdc8jwesPqB9E,28090
-ml_tools/ML_optimization.py,sha256=es3TlQbY7RYgJMZnznkjYGbUxFnAqzZxE_g3_qLK9Q8,22960
+ml_tools/ML_models.py,sha256=4Kb23pSusPMRH8h-R9ztK6JoH1lMuckxq7ihorll-H8,29965
+ml_tools/ML_optimization.py,sha256=P0zkhKAwTpkorIBtR0AOIDcyexo5ngmvFUzo3DfNO-E,22692
 ml_tools/ML_scaler.py,sha256=tw6onj9o8_kk3FQYb930HUzvv1zsFZe2YZJdF3LtHkU,7538
-ml_tools/ML_simple_optimization.py,sha256=W2mce1XFCuiOHTOjOsCNbETISHn5MwYlYsTIXH5hMMo,18177
 ml_tools/ML_trainer.py,sha256=9BP6JFClqGfe7GL-FGG3n5e-no9ssjEOLol7P6baGrI,29019
 ml_tools/ML_utilities.py,sha256=EnKpPTnJ2qjZmz7kvows4Uu5CfSA7ByRmI1v2-KarKw,9337
-ml_tools/PSO_optimization.py,sha256=fVHeemqilBS0zrGV25E5yKwDlGdd2ZKa18d8CZ6Q6Fk,22961
+ml_tools/PSO_optimization.py,sha256=T-HWHMRJUnPvPwixdU5jif3_rnnI36TzcL8u3oSCwuA,22960
 ml_tools/RNN_forecast.py,sha256=Qa2KoZfdAvSjZ4yE78N4BFXtr3tTr0Gx7tQJZPotsh0,1967
 ml_tools/SQL.py,sha256=vXLPGfVVg8bfkbBE3HVfyEclVbdJy0TBhuQONtMwSCQ,11234
 ml_tools/VIF_factor.py,sha256=at5IVqPvicja2-DNSTSIIy3SkzDWCmLzo3qTG_qr5n8,10422
 ml_tools/__init__.py,sha256=q0y9faQ6e17XCQ7eUiCZ1FJ4Bg5EQqLjZ9f_l5REUUY,41
 ml_tools/_logger.py,sha256=dlp5cGbzooK9YSNSZYB4yjZrOaQUGW8PTrM411AOvL8,4717
+ml_tools/_schema.py,sha256=MYYAO8CYygIvwv9TkGBAxzZpG7xQ2IV8_yB5zzFin0c,710
 ml_tools/_script_info.py,sha256=21r83LV3RubsNZ_RTEUON6RbDf7Mh4_udweNcvdF_Fk,212
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/custom_logger.py,sha256=7tSAgRL7e-Ekm7rS1FLDocaPLCnaoKc7VSrtfwCtCEg,10067
-ml_tools/data_exploration.py,sha256=haddQFsXAWzuf84NLItcZ4Q7vzN3YWjFoh7lPlWUczo,50679
+ml_tools/data_exploration.py,sha256=aVcxjoXVqrmFBpwBSbLvrG8quzJfr92On48Sy3K58Vs,51900
 ml_tools/ensemble_evaluation.py,sha256=FGHSe8LBI8_w8LjNeJWOcYQ1UK_mc6fVah8gmSvNVGg,26853
 ml_tools/ensemble_inference.py,sha256=0yLmLNj45RVVoSCLH1ZYJG9IoAhTkWUqEZmLOQTFGTY,9348
 ml_tools/ensemble_learning.py,sha256=vsIED7nlheYI4w2SBzP6SC1AnNeMfn-2A1Gqw5EfxsM,21964
 ml_tools/handle_excel.py,sha256=pfdAPb9ywegFkM9T54bRssDOsX-K7rSeV0RaMz7lEAo,14006
 ml_tools/keys.py,sha256=eJ4St5fl8uHstEGO1XVdP8G-ddwjOxV9zqG0D6W8pCI,2124
 ml_tools/math_utilities.py,sha256=PxoOrnuj6Ntp7_TJqyDWi0JX03WpAO5iaFNK2Oeq5I4,8800
-ml_tools/optimization_tools.py,sha256=P074YCuZzkqkONnAsM-Zb9DTX_i8cRkkJLpwAWz6CRw,13521
+ml_tools/optimization_tools.py,sha256=TYFQ2nSnp7xxs-VyoZISWgnGJghFbsWasHjruegyJRs,12763
 ml_tools/path_manager.py,sha256=CyDU16pOKmC82jPubqJPT6EBt-u-3rGVbxyPIZCvDDY,18432
-ml_tools/serde.py,sha256=ll2mVC0sO2jIEdG3K6xMcgEN13N4YSb8VjviGvw_ers,4949
+ml_tools/serde.py,sha256=Wjf8N1thSfJ4r6Vm_pWxP2UTPcP2f3s2FiGz0z6kqKI,4925
 ml_tools/utilities.py,sha256=OcAyV1tEcYAfOWlGjRgopsjDLxU3DcI5EynzvWV4q3A,15754
-dragon_ml_toolbox-13.0.0.dist-info/METADATA,sha256=trY1fFyTTXLS6TZdrJXxq4_YMPjEZhKCilzCg6qFxzw,6166
-dragon_ml_toolbox-13.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dragon_ml_toolbox-13.0.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
-dragon_ml_toolbox-13.0.0.dist-info/RECORD,,
+dragon_ml_toolbox-13.1.0.dist-info/METADATA,sha256=8n0bhl_rSVdg6MDh51r7tl5JflbqIOdqZx5gjaBWk0o,6166
+dragon_ml_toolbox-13.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-13.1.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-13.1.0.dist-info/RECORD,,
ml_tools/ML_datasetmaster.py CHANGED
@@ -17,6 +17,7 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema


 __all__ = [
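The new `FeatureSchema` import is the pivot of this release. Inferring its shape from how the rest of the diff consumes it (and from the constructor call in `TabularTransformer.load()` further down), a schema instance bundles the feature names with the categorical metadata. A hedged sketch with hypothetical toy columns:

    from ml_tools._schema import FeatureSchema

    schema = FeatureSchema(
        feature_names=("age", "price", "color"),      # all features, in column order
        continuous_feature_names=("age", "price"),    # columns the PytorchScaler may standardize
        categorical_feature_names=("color",),
        categorical_index_map={2: 3},                 # column index -> cardinality
        categorical_mappings={"color": {"red": 0, "green": 1, "blue": 2}},
    )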
@@ -35,7 +36,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +49,16 @@ class _PytorchDataset(Dataset):

         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.values, dtype=features_dtype)
+        else:  # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)

         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-            self.labels = torch.tensor(labels.values, dtype=labels_dtype)
+            # Fallback for other types (though the type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)

         self._feature_names = feature_names
         self._target_names = target_names
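With `labels` broadened to accept a DataFrame, multi-target frames now pass straight through `to_numpy()`. A minimal sketch of the new path (toy data; `_PytorchDataset` is an internal class, used directly here only to illustrate):

    import pandas, torch
    from ml_tools.ML_datasetmaster import _PytorchDataset

    X = pandas.DataFrame({"age": [21.0, 34.0], "price": [9.5, 3.2]})
    y = pandas.DataFrame({"t1": [0.1, 0.7], "t2": [1.2, 0.3]})   # DataFrame labels, new in 13.1.0

    ds = _PytorchDataset(X, y, labels_dtype=torch.float32)
    print(ds.features.shape, ds.labels.shape)   # torch.Size([2, 2]) torch.Size([2, 2])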
@@ -98,27 +102,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self, X_train: pandas.DataFrame, y_train: Union[pandas.Series, pandas.DataFrame], X_test: pandas.DataFrame, label_dtype: torch.dtype, continuous_feature_columns: Optional[Union[List[int], List[str]]]):
-        """Internal helper to fit and apply a PytorchScaler."""
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
+
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

         X_train_values = X_train.values
         X_test_values = X_test.values

+        # continuous_feature_indices is derived from the schema above
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
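The schema-driven lookup replaces the old names-or-integers argument with plain list arithmetic; a quick illustration with the hypothetical columns from above:

    train_cols_list = ["age", "price", "color"]    # X_train.columns.to_list()
    continuous = ("age", "price")                  # schema.continuous_feature_names
    indices = [train_cols_list.index(n) for n in continuous]
    print(indices)                                 # [0, 1] -> positions the PytorchScaler standardizes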
@@ -225,10 +236,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.

-    This class takes a DataFrame, automatically splits it into training and
-    testing sets, and converts them into PyTorch Datasets. It assumes the
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits the data,
+    and converts it into PyTorch Datasets. It can also create and apply a PytorchScaler using the schema.

     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,92 +251,164 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            kind (Literal["regression", "classification"]): The type of ML task. This determines the data type of the labels.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and a single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind (Literal["regression", "classification"]):
+                The type of ML task. This determines the data type of the labels.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance; if None, a new scaler will be created.
         """
         super().__init__()
         self.scaler = scaler

-        # --- 1. Identify features and target (single-target logic) ---
-        features = pandas_df.iloc[:, :-1]
-        target = pandas_df.iloc[:, -1]
-        self._feature_names = features.columns.tolist()
-        self._target_names = [str(target.name)]
-        self._id = self._target_names[0]
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]

-        # --- 2. Split ---
         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

         label_dtype = torch.float32 if kind == "regression" else torch.int64

-        # --- 3. Scale ---
+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
         )

-        # --- 4. Create Datasets ---
-        self._train_ds = _PytorchDataset(X_train_final, y_train.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-        self._test_ds = _PytorchDataset(X_test_final, y_test.values, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
-
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+

-# --- New Multi-Target Class ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with a multiple target columns.
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.

-    This class takes a DataFrame, automatically splits it into training and testing sets, and converts them into PyTorch Datasets.
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Target dtype is torch.float32.
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
                  test_size: float = 0.2,
                  random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 scaler: Optional[PytorchScaler] = None):
         """
         Args:
-            pandas_df (pandas.DataFrame): The pre-processed input DataFrame with numerical data.
-            target_columns (list[str]): List of target column names.
-            test_size (float): The proportion of the dataset to allocate to the test split.
-            random_state (int): The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None): A pre-fitted PytorchScaler instance.
-            continuous_feature_columns (List[int] | List[str] | None): Column indices or names of continuous features to scale. If provided creates a new PytorchScaler.
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+            scaler (PytorchScaler | None):
+                A pre-fitted PytorchScaler instance.
+
+        ## Note:
+            For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+            This loss function requires the labels to be torch.float32, which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
         self.scaler = scaler

+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-        self._feature_names = [col for col in pandas_df.columns if col not in target_columns]
-        features = pandas_df[self._feature_names]
-        target = pandas_df[self._target_names]
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]

         X_train, X_test, y_train, y_test = train_test_split(
-            features, target, test_size=test_size, random_state=random_state
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape

-        label_dtype = torch.float32
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32

+        # --- 4. Scale (using the schema) ---
         X_train_final, X_test_final = self._prepare_scaler(
-            X_train, y_train, X_test, label_dtype, continuous_feature_columns
+            X_train, y_train, X_test, label_dtype, schema
         )

+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)

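Taken together, the single-target maker now infers the target as whatever column the schema does not claim. A hedged usage sketch, reusing the hypothetical schema object from above:

    import pandas
    from ml_tools.ML_datasetmaster import DatasetMaker

    df = pandas.DataFrame({
        "age":    [21.0, 34.0, 47.0, 19.0],
        "price":  [9.5, 3.2, 7.7, 1.1],
        "color":  [0, 2, 1, 0],              # label-encoded categorical
        "target": [1.2, 3.4, 5.6, 0.9],
    })

    # Target inference: {age, price, color, target} - {age, price, color} == {"target"}
    maker = DatasetMaker(df, schema=schema, kind="regression", test_size=0.25)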
ml_tools/ML_models.py CHANGED
@@ -8,6 +8,7 @@ from ._logger import _LOGGER
 from .path_manager import make_fullpath
 from ._script_info import _script_info
 from .keys import PytorchModelArchitectureKeys
+from ._schema import FeatureSchema


 __all__ = [
@@ -298,76 +299,59 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):
     """
     A Transformer-based model for tabular data tasks.

-    This model uses a Feature Tokenizer to convert all input features into a sequence of embeddings, prepends a [CLS] token, and processes the
+    This model uses a Feature Tokenizer to convert all input features into a
+    sequence of embeddings, prepends a [CLS] token, and processes the
     sequence with a standard Transformer Encoder.
     """
     def __init__(self, *,
-                 in_features: int,
+                 schema: FeatureSchema,
                  out_targets: int,
-                 categorical_index_map: Dict[int, int],
                  embedding_dim: int = 32,
                  num_heads: int = 8,
                  num_layers: int = 6,
                  dropout: float = 0.1):
         """
         Args:
-            in_features (int): The total number of columns in the input data (features).
-            out_targets (int): Number of output targets (1 for regression).
-            categorical_index_map (Dict[int, int]): Maps categorical column index to its cardinality (number of unique categories).
-            embedding_dim (int): The dimension for all feature embeddings. Must be divisible by num_heads.
-            num_heads (int): The number of heads in the multi-head attention mechanism.
-            num_layers (int): The number of sub-encoder-layers in the transformer encoder.
-            dropout (float): The dropout value.
-
-        Note:
-            - All arguments are keyword-only to promote clarity.
-            - Column indices start at 0.
-
-        ### Data Preparation
-        The model requires a specific input format. All columns in the input DataFrame must be numerical, but they are treated differently based on the
-        provided index lists.
-
-        **Nominal Categorical Features** (e.g., 'City', 'Color'): Should **NOT** be one-hot encoded.
-        Instead, convert them to integer codes (label encoding). You must then provide a dictionary mapping their column indices to
-        their cardinality (the number of unique categories) via the `categorical_map` parameter.
-
-        **Ordinal & Binary Features** (e.g., 'Low/Medium/High', 'True/False'): Should be treated as **numerical**. Map them to numbers that
-        represent their state (e.g., `{'Low': 0, 'Medium': 1}` or `{False: 0, True: 1}`). Their column indices should **NOT** be included in the
-        `categorical_map` parameter.
-
-        **Standard Numerical and Continuous Features** (e.g., 'Age', 'Price'): It is highly recommended to scale them before training.
+            schema (FeatureSchema):
+                The definitive schema object created by `data_exploration.finalize_feature_schema()`.
+            out_targets (int):
+                Number of output targets (1 for regression).
+            embedding_dim (int):
+                The dimension for all feature embeddings. Must be divisible
+                by num_heads.
+            num_heads (int):
+                The number of heads in the multi-head attention mechanism.
+            num_layers (int):
+                The number of sub-encoder-layers in the transformer encoder.
+            dropout (float):
+                The dropout value.
         """
         super().__init__()

+        # --- Get info from schema ---
+        in_features = len(schema.feature_names)
+        categorical_index_map = schema.categorical_index_map
+
         # --- Validation ---
-        if categorical_index_map and max(categorical_index_map.keys()) >= in_features:
+        if categorical_index_map and (max(categorical_index_map.keys()) >= in_features):
             _LOGGER.error(f"A categorical index ({max(categorical_index_map.keys())}) is out of bounds for the provided input features ({in_features}).")
             raise ValueError()

-        # --- Derive numerical indices ---
-        all_indices = set(range(in_features))
-        categorical_indices_set = set(categorical_index_map.keys())
-        numerical_indices = sorted(list(all_indices - categorical_indices_set))
-
         # --- Save configuration ---
-        self.in_features = in_features
+        self.schema = schema  # <-- Save the whole schema
         self.out_targets = out_targets
-        self.numerical_indices = numerical_indices
-        self.categorical_map = categorical_index_map
         self.embedding_dim = embedding_dim
         self.num_heads = num_heads
         self.num_layers = num_layers
         self.dropout = dropout

-        # --- 1. Feature Tokenizer ---
+        # --- 1. Feature Tokenizer (now takes the schema) ---
         self.tokenizer = _FeatureTokenizer(
-            numerical_indices=numerical_indices,
-            categorical_map=categorical_index_map,
+            schema=schema,
             embedding_dim=embedding_dim
         )

         # --- 2. CLS Token ---
-        # A learnable token that will be prepended to the sequence.
         self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_dim))

         # --- 3. Transformer Encoder ---
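A hedged construction sketch for the new keyword-only signature, reusing the hypothetical schema object from above (toy hyperparameters; embedding_dim must stay divisible by num_heads):

    model = TabularTransformer(
        schema=schema,       # supplies in_features and the categorical index map
        out_targets=1,       # regression
        embedding_dim=32,    # 32 / 8 heads = 4 dims per head
        num_heads=8,
    )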
@@ -416,21 +400,87 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

     def get_architecture_config(self) -> Dict[str, Any]:
         """Returns the full configuration of the model."""
+        # Deconstruct schema into a JSON-friendly dict
+        # Tuples are saved as lists
+        schema_dict = {
+            'feature_names': self.schema.feature_names,
+            'continuous_feature_names': self.schema.continuous_feature_names,
+            'categorical_feature_names': self.schema.categorical_feature_names,
+            'categorical_index_map': self.schema.categorical_index_map,
+            'categorical_mappings': self.schema.categorical_mappings
+        }
+
         return {
-            'in_features': self.in_features,
+            'schema_dict': schema_dict,
             'out_targets': self.out_targets,
-            'categorical_map': self.categorical_map,
             'embedding_dim': self.embedding_dim,
             'num_heads': self.num_heads,
             'num_layers': self.num_layers,
             'dropout': self.dropout
         }
+
+    @classmethod
+    def load(cls: type, file_or_dir: Union[str, Path], verbose: bool = True) -> nn.Module:
+        """Loads a model architecture from a JSON file."""
+        user_path = make_fullpath(file_or_dir)
+
+        if user_path.is_dir():
+            json_filename = PytorchModelArchitectureKeys.SAVENAME + ".json"
+            target_path = make_fullpath(user_path / json_filename, enforce="file")
+        elif user_path.is_file():
+            target_path = user_path
+        else:
+            _LOGGER.error(f"Invalid path: '{file_or_dir}'")
+            raise IOError()
+
+        with open(target_path, 'r') as f:
+            saved_data = json.load(f)
+
+        saved_class_name = saved_data[PytorchModelArchitectureKeys.MODEL]
+        config = saved_data[PytorchModelArchitectureKeys.CONFIG]
+
+        if saved_class_name != cls.__name__:
+            _LOGGER.error(f"Model class mismatch. File specifies '{saved_class_name}', but '{cls.__name__}' was expected.")
+            raise ValueError()
+
+        # --- RECONSTRUCTION LOGIC ---
+        if 'schema_dict' not in config:
+            _LOGGER.error("Invalid architecture file: missing 'schema_dict'. This file may be from an older version.")
+            raise ValueError("Missing 'schema_dict' in config.")
+
+        schema_data = config.pop('schema_dict')
+
+        # Re-hydrate the categorical_index_map
+        # JSON saves all dict keys as strings, so we must convert them back to int.
+        raw_index_map = schema_data['categorical_index_map']
+        if raw_index_map is not None:
+            rehydrated_index_map = {int(k): v for k, v in raw_index_map.items()}
+        else:
+            rehydrated_index_map = None
+
+        # Re-hydrate the FeatureSchema object
+        # JSON deserializes tuples as lists, so we must convert them back.
+        schema = FeatureSchema(
+            feature_names=tuple(schema_data['feature_names']),
+            continuous_feature_names=tuple(schema_data['continuous_feature_names']),
+            categorical_feature_names=tuple(schema_data['categorical_feature_names']),
+            categorical_index_map=rehydrated_index_map,
+            categorical_mappings=schema_data['categorical_mappings']
+        )
+
+        config['schema'] = schema
+        # --- End Reconstruction ---
+
+        model = cls(**config)
+        if verbose:
+            _LOGGER.info(f"Successfully loaded architecture for '{saved_class_name}'")
+        return model

     def __repr__(self) -> str:
         """Returns the developer-friendly string representation of the model."""
         # Build the architecture string part-by-part
         parts = [
-            f"Tokenizer(features={self.in_features}, dim={self.embedding_dim})",
+            f"Tokenizer(features={len(self.schema.feature_names)}, dim={self.embedding_dim})",
             "[CLS]",
             f"TransformerEncoder(layers={self.num_layers}, heads={self.num_heads})",
             f"PredictionHead(outputs={self.out_targets})"
@@ -443,29 +493,41 @@ class TabularTransformer(nn.Module, _ArchitectureHandlerMixin):

 class _FeatureTokenizer(nn.Module):
     """
-    Transforms raw numerical and categorical features from any column order into a sequence of embeddings.
+    Transforms raw numerical and categorical features from any column order
+    into a sequence of embeddings.
     """
     def __init__(self,
-                 numerical_indices: List[int],
-                 categorical_map: Dict[int, int],
+                 schema: FeatureSchema,
                  embedding_dim: int):
         """
         Args:
-            numerical_indices (List[int]): A list of column indices for the numerical features.
-            categorical_map (Dict[int, int]): A dictionary mapping each categorical column index to its cardinality (number of unique categories).
-            embedding_dim (int): The dimension for all feature embeddings.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            embedding_dim (int):
+                The dimension for all feature embeddings.
         """
         super().__init__()

-        # Unpack the dictionary into separate lists for indices and cardinalities
-        self.categorical_indices = list(categorical_map.keys())
-        cardinalities = list(categorical_map.values())
+        # --- Get info from schema ---
+        categorical_map = schema.categorical_index_map
+
+        if categorical_map:
+            # Unpack the dictionary into separate lists
+            self.categorical_indices = list(categorical_map.keys())
+            cardinalities = list(categorical_map.values())
+        else:
+            self.categorical_indices = []
+            cardinalities = []
+
+        # Derive numerical indices by finding what's not categorical
+        all_indices = set(range(len(schema.feature_names)))
+        categorical_indices_set = set(self.categorical_indices)
+        self.numerical_indices = sorted(list(all_indices - categorical_indices_set))

-        self.numerical_indices = numerical_indices
         self.embedding_dim = embedding_dim

         # A learnable embedding for each numerical feature
-        self.numerical_embeddings = nn.Parameter(torch.randn(len(numerical_indices), embedding_dim))
+        self.numerical_embeddings = nn.Parameter(torch.randn(len(self.numerical_indices), embedding_dim))

         # A standard embedding layer for each categorical feature
         self.categorical_embeddings = nn.ModuleList(
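The tokenizer now performs the same set arithmetic that previously lived in `TabularTransformer.__init__`; with the hypothetical schema from above:

    feature_names = ("age", "price", "color")
    categorical_map = {2: 3}                   # "color" at index 2, cardinality 3

    numerical = sorted(set(range(len(feature_names))) - set(categorical_map.keys()))
    print(numerical)                           # [0, 1] -> "age" and "price" get numerical embeddings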
@@ -487,6 +549,8 @@ class _FeatureTokenizer(nn.Module):
         # Process categorical features
         categorical_tokens = []
         for i, embed_layer in enumerate(self.categorical_embeddings):
+            # x_categorical[:, i] selects the i-th categorical column
+            # (e.g., all values for the 'color' feature)
             token = embed_layer(x_categorical[:, i]).unsqueeze(1)
             categorical_tokens.append(token)
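To make the new comment concrete, each loop iteration yields one (batch, 1, embedding_dim) token per categorical feature; a minimal standalone sketch with toy sizes and plain torch:

    import torch
    import torch.nn as nn

    embed_layer = nn.Embedding(num_embeddings=3, embedding_dim=32)   # cardinality 3
    x_categorical = torch.randint(0, 3, (4, 1))                      # (batch, n_categorical)

    token = embed_layer(x_categorical[:, 0]).unsqueeze(1)
    print(token.shape)                                               # torch.Size([4, 1, 32])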