dragon-ml-toolbox 13.0.0__py3-none-any.whl → 14.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/METADATA +12 -2
- dragon_ml_toolbox-14.7.0.dist-info/RECORD +49 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_configuration.py +108 -0
- ml_tools/ML_datasetmaster.py +241 -260
- ml_tools/ML_evaluation.py +229 -76
- ml_tools/ML_evaluation_multi.py +45 -16
- ml_tools/ML_inference.py +0 -1
- ml_tools/ML_models.py +135 -55
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +498 -29
- ml_tools/ML_utilities.py +351 -4
- ml_tools/ML_vision_datasetmaster.py +1492 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +641 -0
- ml_tools/ML_vision_transformers.py +203 -0
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_schema.py +96 -0
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +576 -138
- ml_tools/ensemble_evaluation.py +53 -10
- ml_tools/keys.py +43 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +78 -17
- ml_tools/utilities.py +192 -3
- dragon_ml_toolbox-13.0.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_datasetmaster.py
CHANGED
```diff
@@ -1,13 +1,10 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
 from typing import Literal, Union, Tuple, List, Optional
 from abc import ABC, abstractmethod
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path
 
@@ -17,14 +14,13 @@ from ._script_info import _script_info
 from .custom_logger import save_list_strings
 from .ML_scaler import PytorchScaler
 from .keys import DatasetKeys
+from ._schema import FeatureSchema
 
 
 __all__ = [
     "DatasetMaker",
     "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "SequenceMaker"
 ]
 
 
@@ -35,7 +31,7 @@ class _PytorchDataset(Dataset):
     Converts numpy/pandas data into tensors for model consumption.
     """
     def __init__(self, features: Union[numpy.ndarray, pandas.DataFrame],
-                 labels: Union[numpy.ndarray, pandas.Series],
+                 labels: Union[numpy.ndarray, pandas.Series, pandas.DataFrame],
                  labels_dtype: torch.dtype,
                  features_dtype: torch.dtype = torch.float32,
                  feature_names: Optional[List[str]] = None,
@@ -48,13 +44,16 @@ class _PytorchDataset(Dataset):
 
         if isinstance(features, numpy.ndarray):
             self.features = torch.tensor(features, dtype=features_dtype)
-        else:
-            self.features = torch.tensor(features.
+        else: # It's a pandas.DataFrame
+            self.features = torch.tensor(features.to_numpy(), dtype=features_dtype)
 
         if isinstance(labels, numpy.ndarray):
             self.labels = torch.tensor(labels, dtype=labels_dtype)
+        elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
+            self.labels = torch.tensor(labels.to_numpy(), dtype=labels_dtype)
         else:
-
+            # Fallback for other types (though your type hints don't cover this)
+            self.labels = torch.tensor(labels, dtype=labels_dtype)
 
         self._feature_names = feature_names
         self._target_names = target_names
```
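Widening the `labels` hint to accept a `pandas.DataFrame` is what lets `DatasetMakerMulti` (further down) hand a multi-column target frame straight to `_PytorchDataset`. A minimal sketch of the same conversion logic outside the package; the `to_label_tensor` helper is illustrative, not part of dragon-ml-toolbox:

```python
import numpy
import pandas
import torch

def to_label_tensor(labels, labels_dtype: torch.dtype) -> torch.Tensor:
    # Mirrors the updated _PytorchDataset branches: ndarrays pass straight
    # to torch.tensor, Series/DataFrames go through .to_numpy() first.
    if isinstance(labels, numpy.ndarray):
        return torch.tensor(labels, dtype=labels_dtype)
    elif isinstance(labels, (pandas.Series, pandas.DataFrame)):
        return torch.tensor(labels.to_numpy(), dtype=labels_dtype)
    else:
        return torch.tensor(labels, dtype=labels_dtype)

# A two-column target DataFrame (the multi-target case) now converts cleanly:
y = pandas.DataFrame({"t1": [0.0, 1.0], "t2": [1.0, 0.0]})
print(to_label_tensor(y, torch.float32).shape)  # torch.Size([2, 2])
```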
```diff
@@ -98,27 +97,34 @@ class _BaseDatasetMaker(ABC):
         self._X_test_shape = (0,0)
         self._y_train_shape = (0,)
         self._y_test_shape = (0,)
-
-    def _prepare_scaler(self,
-
+
+    def _prepare_scaler(self,
+                        X_train: pandas.DataFrame,
+                        y_train: Union[pandas.Series, pandas.DataFrame],
+                        X_test: pandas.DataFrame,
+                        label_dtype: torch.dtype,
+                        schema: FeatureSchema):
+        """Internal helper to fit and apply a PytorchScaler using a FeatureSchema."""
         continuous_feature_indices: Optional[List[int]] = None
-        if continuous_feature_columns:
-            if all(isinstance(c, str) for c in continuous_feature_columns):
-                name_to_idx = {name: i for i, name in enumerate(self._feature_names)}
-                try:
-                    continuous_feature_indices = [name_to_idx[name] for name in continuous_feature_columns] # type: ignore
-                except KeyError as e:
-                    _LOGGER.error(f"Feature column '{e.args[0]}' not found.")
-                    raise ValueError()
-            elif all(isinstance(c, int) for c in continuous_feature_columns):
-                continuous_feature_indices = continuous_feature_columns # type: ignore
-            else:
-                _LOGGER.error("'continuous_feature_columns' must be a list of all strings or all integers.")
-                raise TypeError()
-
-        X_train_values = X_train.values
-        X_test_values = X_test.values
 
+        # Get continuous feature indices *from the schema*
+        if schema.continuous_feature_names:
+            _LOGGER.info("Getting continuous feature indices from schema.")
+            try:
+                # Convert columns to a standard list for .index()
+                train_cols_list = X_train.columns.to_list()
+                # Map names from schema to column indices in the training DataFrame
+                continuous_feature_indices = [train_cols_list.index(name) for name in schema.continuous_feature_names]
+            except ValueError as e:
+                _LOGGER.error(f"Feature name from schema not found in training data columns:\n{e}")
+                raise ValueError()
+        else:
+            _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")
+
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()
+
+        # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
             _LOGGER.info("Fitting a new PytorchScaler on training data.")
             temp_train_ds = _PytorchDataset(X_train_values, y_train, label_dtype) # type: ignore
```
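The schema-driven lookup above leans on `list.index()` raising `ValueError` for a missing name, which the new `except` branch turns into a logged error. A self-contained sketch of that mapping, with made-up column names standing in for a real `FeatureSchema`:

```python
import pandas

# Columns of the training split; "age" and "height" play the role of
# schema.continuous_feature_names (illustrative names, not from the package).
X_train = pandas.DataFrame(columns=["age", "color_red", "height"])
continuous_feature_names = ["age", "height"]

train_cols_list = X_train.columns.to_list()
try:
    continuous_feature_indices = [train_cols_list.index(name)
                                  for name in continuous_feature_names]
except ValueError as e:
    raise ValueError(f"Feature name from schema not found: {e}")

print(continuous_feature_indices)  # [0, 2]
```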
```diff
@@ -225,10 +231,8 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     Dataset maker for pre-processed, numerical pandas DataFrames with a single target column.
 
-    This class takes a DataFrame, automatically splits
-
-    target variable is the last column. It can also create, apply, and
-    save a PytorchScaler for standardizing continuous features.
+    This class takes a DataFrame and a FeatureSchema, automatically splits and converts them into PyTorch Datasets.
+    It can also create and apply a PytorchScaler using the schema.
 
     Attributes:
         `scaler` -> PytorchScaler | None
@@ -242,95 +246,234 @@ class DatasetMaker(_BaseDatasetMaker):
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
+                 schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame containing all columns (features and single target).
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            kind ("regression" | "classification"):
+                The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
         """
         super().__init__()
-        self.scaler = scaler
 
-
-
-
-
-
-
-
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Identify features (from schema) ---
+        self._feature_names = list(schema.feature_names)
+
+        # --- 2. Infer target (by set difference) ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+
+        target_cols_set = all_cols_set - feature_cols_set
+
+        if len(target_cols_set) == 0:
+            _LOGGER.error("No target column found. The schema's features match the DataFrame's columns exactly.")
+            raise ValueError("No target column found in DataFrame.")
+        if len(target_cols_set) > 1:
+            _LOGGER.error(f"Ambiguous target. Found {len(target_cols_set)} columns not in the schema: {list(target_cols_set)}. DatasetMaker (single-target) requires exactly one.")
+            raise ValueError("Ambiguous target: More than one non-feature column found.")
+
+        target_name = list(target_cols_set)[0]
+        self._target_names = [target_name]
+        self._id = target_name
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_series = pandas_df[target_name]
+
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_series,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
         label_dtype = torch.float32 if kind == "regression" else torch.int64
 
-        # ---
-
-
-
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
+
+        # --- 5. Create Datasets ---
+        self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+        self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
+
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f"  Target: {self.target_names[0]}\n"
+        s += f"  Features: {self.number_of_features}\n"
+        s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"
 
-
-
-        self._test_ds
+        if self._train_ds:
+            s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+        return s
 
 
-# ---
+# --- Multi-Target Class ---
 class DatasetMakerMulti(_BaseDatasetMaker):
     """
-    Dataset maker for pre-processed, numerical pandas DataFrames with
+    Dataset maker for pre-processed, numerical pandas DataFrames with
+    multiple target columns.
 
-    This class takes a DataFrame,
+    This class takes a *full* DataFrame, a *FeatureSchema*, and a list of
+    *target_columns*. It validates that the schema's features and the
+    target columns are mutually exclusive and together account for all
+    columns in the DataFrame.
+
+    Targets dtype is torch.float32
     """
     def __init__(self,
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
+                 schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None,
-                 continuous_feature_columns: Optional[Union[List[int], List[str]]] = None):
+                 random_state: int = 42):
         """
         Args:
-            pandas_df (pandas.DataFrame):
-
-
-
-
-
+            pandas_df (pandas.DataFrame):
+                The pre-processed input DataFrame with *all* columns
+                (features and targets).
+            target_columns (list[str]):
+                List of target column names.
+            schema (FeatureSchema):
+                The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
+            test_size (float):
+                The proportion of the dataset to allocate to the test split.
+            random_state (int):
+                The seed for the random number generator for reproducibility.
+
+        ## Note:
+        For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
+        This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-
-
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()
+
+        # --- 1. Get features and targets from schema/args ---
+        self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
-
-
-
+
+        # --- 2. Validation ---
+        all_cols_set = set(pandas_df.columns)
+        feature_cols_set = set(self._feature_names)
+        target_cols_set = set(self._target_names)
+
+        overlap = feature_cols_set.intersection(target_cols_set)
+        if overlap:
+            _LOGGER.error(f"Features and targets are not mutually exclusive. Overlap: {list(overlap)}")
+            raise ValueError("Features and targets overlap.")
+
+        schema_plus_targets = feature_cols_set.union(target_cols_set)
+        missing_cols = all_cols_set - schema_plus_targets
+        if missing_cols:
+            _LOGGER.warning(f"Columns in DataFrame but not in schema or targets: {list(missing_cols)}")
+
+        extra_cols = schema_plus_targets - all_cols_set
+        if extra_cols:
+            _LOGGER.error(f"Columns in schema/targets but not in DataFrame: {list(extra_cols)}")
+            raise ValueError("Schema/target definition mismatch with DataFrame.")
+
+        # --- 3. Split Data ---
+        features_df = pandas_df[self._feature_names]
+        target_df = pandas_df[self._target_names]
 
         X_train, X_test, y_train, y_test = train_test_split(
-
+            features_df,
+            target_df,
+            test_size=test_size,
+            random_state=random_state
         )
         self._X_train_shape, self._X_test_shape = X_train.shape, X_test.shape
         self._y_train_shape, self._y_test_shape = y_train.shape, y_test.shape
 
-
+        # Multi-target for regression or multi-binary
+        label_dtype = torch.float32
 
-
-
-
+        # --- 4. Scale (using the schema) ---
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()
 
+        # --- 5. Create Datasets ---
+        # _PytorchDataset now correctly handles y_train (a DataFrame)
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
         self._test_ds = _PytorchDataset(X_test_final, y_test, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
 
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__} (ID: '{self.id}')>\n"
+        s += f"  Targets: {self.number_of_targets}\n"
+        s += f"  Features: {self.number_of_features}\n"
+        s += f"  Scaler: {'Fitted' if self.scaler else 'None'}\n"
+
+        if self._train_ds:
+            s += f"  Train Samples: {len(self._train_ds)}\n" # type: ignore
+        if self._test_ds:
+            s += f"  Test Samples: {len(self._test_ds)}\n" # type: ignore
+
+        return s
+
 
 # --- Private Base Class ---
 class _BaseMaker(ABC):
```
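Putting the two constructors side by side: the old `scaler=` / `continuous_feature_columns=` pair is gone, replaced by a `FeatureSchema` plus an explicit scaling strategy, and the single-target maker now infers its target by set difference. A hedged usage sketch; the loader helpers are hypothetical placeholders, and the exact way `data_exploration` produces a `FeatureSchema` is not shown in this diff:

```python
import pandas
from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti
from ml_tools._schema import FeatureSchema

df: pandas.DataFrame = load_preprocessed_frame()   # hypothetical helper
schema: FeatureSchema = build_feature_schema(df)   # hypothetical helper

# Single target: the one column absent from schema.feature_names is the target.
maker = DatasetMaker(
    pandas_df=df,
    schema=schema,
    kind="classification",  # labels become torch.int64 ("regression" -> float32)
    scaler="fit",           # fit a new PytorchScaler on the schema's continuous features
    test_size=0.2,
    random_state=42,
)
print(maker)  # new __repr__: ID, target, feature count, scaler state, sample counts

# Multiple targets: labels are always float32, matching nn.BCEWithLogitsLoss
# for multi-binary tasks as well as multi-output regression.
multi_maker = DatasetMakerMulti(
    pandas_df=df_multi,            # hypothetical multi-target frame
    target_columns=["t1", "t2"],   # illustrative names
    schema=schema,
    scaler="none",                 # e.g., when feeding a TabularTransformer
)
```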
```diff
@@ -351,149 +494,6 @@ class _BaseMaker(ABC):
         pass
 
 
-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                        img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
 # --- SequenceMaker ---
 class SequenceMaker(_BaseMaker):
     """
```
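This removal reads as a relocation rather than a deletion: the file list at the top of this diff adds ml_tools/ML_vision_datasetmaster.py (+1492 lines) along with the other new ML_vision_* modules, so the vision dataset tooling appears to have been split out of ML_datasetmaster.py into dedicated files; the pad-to-square helper removed in the next hunk (formerly ResizeAspectFill) presumably moved with it.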
```diff
@@ -680,41 +680,22 @@ class SequenceMaker(_BaseMaker):
             _LOGGER.error("Windows have not been generated. Call .generate_windows() first.")
             raise RuntimeError()
         return self._train_dataset, self._test_dataset
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
+
+    def __repr__(self) -> str:
+        s = f"<{self.__class__.__name__}>:\n"
+        s += f"  Sequence Length (Window): {self.sequence_length}\n"
+        s += f"  Total Data Points: {len(self.sequence)}\n"
+        s += "  --- Status ---\n"
+        s += f"  Split: {self._is_split}\n"
+        s += f"  Normalized: {self._is_normalized}\n"
+        s += f"  Windows Generated: {self._are_windows_generated}\n"
+
+        if self._are_windows_generated:
+            train_len = len(self._train_dataset) if self._train_dataset else 0 # type: ignore
+            test_len = len(self._test_dataset) if self._test_dataset else 0 # type: ignore
+            s += f"  Datasets (Train/Test): {train_len} / {test_len} windows\n"
+
+        return s
 
 
 def info():
```