dragon-ml-toolbox 13.2.0__tar.gz → 14.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-14.1.0}/PKG-INFO +2 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO +2 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +7 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/requires.txt +1 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/MICE_imputation.py +207 -5
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_datasetmaster.py +63 -205
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation.py +23 -15
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_evaluation_multi.py +5 -6
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_inference.py +0 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_models.py +23 -7
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_trainer.py +463 -20
- dragon_ml_toolbox-14.1.0/ml_tools/ML_utilities.py +528 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_datasetmaster.py +1315 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_evaluation.py +260 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_inference.py +428 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_models.py +627 -0
- dragon_ml_toolbox-14.1.0/ml_tools/ML_vision_transformers.py +58 -0
- dragon_ml_toolbox-14.1.0/ml_tools/_ML_pytorch_tabular.py +543 -0
- dragon_ml_toolbox-14.1.0/ml_tools/_ML_vision_recipe.py +88 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_schema.py +26 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/custom_logger.py +37 -14
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/data_exploration.py +502 -93
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/keys.py +38 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/math_utilities.py +1 -1
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/serde.py +23 -3
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/utilities.py +192 -3
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/pyproject.toml +12 -2
- dragon_ml_toolbox-13.2.0/ml_tools/ML_utilities.py +0 -230
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/README.md +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/RNN_forecast.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_logger.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/_script_info.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/handle_excel.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/setup.cfg +0 -0
{dragon_ml_toolbox-13.2.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-14.1.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.2.0
+Version: 14.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
 Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
+Requires-Dist: torchmetrics; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0/dragon_ml_toolbox.egg-info}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 13.2.0
+Version: 14.1.0
 Summary: A collection of tools for data science and machine learning projects.
 Author-email: "Karl L. Loza Vidaurre" <luigiloza@gmail.com>
 License-Expression: MIT
@@ -34,6 +34,7 @@ Requires-Dist: Pillow; extra == "ml"
 Requires-Dist: evotorch; extra == "ml"
 Requires-Dist: pyarrow; extra == "ml"
 Requires-Dist: colorlog; extra == "ml"
+Requires-Dist: torchmetrics; extra == "ml"
 Provides-Extra: mice
 Requires-Dist: numpy<2.0; extra == "mice"
 Requires-Dist: pandas; extra == "mice"
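In practical terms, both PKG-INFO files record the version bump to 14.1.0 and add `torchmetrics` to the `ml` extra, so an environment installed with something like `pip install "dragon-ml-toolbox[ml]"` (assuming a standard PyPI install) will now pull in torchmetrics alongside the existing ML dependencies.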
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
@@ -21,10 +21,17 @@ ml_tools/ML_optimization.py
 ml_tools/ML_scaler.py
 ml_tools/ML_trainer.py
 ml_tools/ML_utilities.py
+ml_tools/ML_vision_datasetmaster.py
+ml_tools/ML_vision_evaluation.py
+ml_tools/ML_vision_inference.py
+ml_tools/ML_vision_models.py
+ml_tools/ML_vision_transformers.py
 ml_tools/PSO_optimization.py
 ml_tools/RNN_forecast.py
 ml_tools/SQL.py
 ml_tools/VIF_factor.py
+ml_tools/_ML_pytorch_tabular.py
+ml_tools/_ML_vision_recipe.py
 ml_tools/__init__.py
 ml_tools/_logger.py
 ml_tools/_schema.py
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/MICE_imputation.py
RENAMED
@@ -7,19 +7,20 @@ from plotnine import ggplot, labs, theme, element_blank # type: ignore
 from typing import Optional, Union

 from .utilities import load_dataframe, merge_dataframes, save_dataframe_filename
-from .math_utilities import threshold_binary_values
+from .math_utilities import threshold_binary_values, discretize_categorical_values
 from .path_manager import sanitize_filename, make_fullpath, list_csv_paths
 from ._logger import _LOGGER
 from ._script_info import _script_info
+from ._schema import FeatureSchema


 __all__ = [
+    "MiceImputer",
     "apply_mice",
     "save_imputed_datasets",
-    "get_na_column_names",
     "get_convergence_diagnostic",
     "get_imputed_distributions",
-    "run_mice_pipeline"
+    "run_mice_pipeline",
 ]


@@ -79,7 +80,7 @@ def save_imputed_datasets(save_dir: Union[str, Path], imputed_datasets: list, df


 #Get names of features that had missing values before imputation
-def get_na_column_names(df: pd.DataFrame):
+def _get_na_column_names(df: pd.DataFrame):
     return [col for col in df.columns if df[col].isna().any()]


@@ -264,7 +265,7 @@ def run_mice_pipeline(df_path_or_dir: Union[str,Path], target_columns: list[str]

     save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)

-    imputed_column_names = get_na_column_names(df=df)
+    imputed_column_names = _get_na_column_names(df=df)

     get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)

@@ -278,5 +279,206 @@ def _skip_targets(df: pd.DataFrame, target_cols: list[str]):
     return df_feats, df_targets


+# modern implementation
+class MiceImputer:
+    """
+    A modern MICE imputation pipeline that uses a FeatureSchema
+    to correctly discretize categorical features after imputation.
+    """
+    def __init__(self,
+                 schema: FeatureSchema,
+                 iterations: int=20,
+                 resulting_datasets: int = 1,
+                 random_state: int = 101):
+
+        self.schema = schema
+        self.random_state = random_state
+        self.iterations = iterations
+        self.resulting_datasets = resulting_datasets
+
+        # --- Store schema info ---
+
+        # 1. Categorical info
+        if not self.schema.categorical_index_map:
+            _LOGGER.warning("FeatureSchema has no 'categorical_index_map'. No discretization will be applied.")
+            self.cat_info = {}
+        else:
+            self.cat_info = self.schema.categorical_index_map
+
+        # 2. Ordered feature names (critical for index mapping)
+        self.ordered_features = list(self.schema.feature_names)
+
+        # 3. Names of categorical features
+        self.categorical_features = list(self.schema.categorical_feature_names)
+
+        _LOGGER.info(f"MiceImputer initialized. Found {len(self.cat_info)} categorical features to discretize.")
+
+    def _post_process(self, imputed_df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Applies schema-based discretization to a completed dataframe.
+
+        This method works around the behavior of `discretize_categorical_values`
+        (which returns a full int32 array) by:
+        1. Calling it on the full, ordered feature array.
+        2. Extracting *only* the valid discretized categorical columns.
+        3. Updating the original float dataframe with these integer values.
+        """
+        # If no categorical features are defined, return the df as-is.
+        if not self.cat_info:
+            return imputed_df
+
+        try:
+            # 1. Ensure DataFrame columns match the schema order
+            # This is critical for the index-based categorical_info
+            df_ordered: pd.DataFrame = imputed_df[self.ordered_features] # type: ignore
+
+            # 2. Convert to NumPy array
+            array_ordered = df_ordered.to_numpy()
+
+            # 3. Apply discretization utility (which returns a full int32 array)
+            # This array has *correct* categorical values but *truncated* continuous values.
+            discretized_array_int32 = discretize_categorical_values(
+                array_ordered,
+                self.cat_info,
+                start_at_zero=True # Assuming 0-based indexing
+            )
+
+            # 4. Create a new DF from the int32 array, keeping the categorical columns.
+            df_discretized_cats = pd.DataFrame(
+                discretized_array_int32,
+                columns=self.ordered_features,
+                index=df_ordered.index # <-- Critical: align index
+            )[self.categorical_features] # <-- Select only cat features
+
+            # 5. "Rejoin": Start with a fresh copy of the *original* imputed DF (which has correct continuous floats).
+            final_df = df_ordered.copy()
+
+            # 6. Use .update() to "paste" the integer categorical values
+            # over the old float categorical values. Continuous floats are unaffected.
+            final_df.update(df_discretized_cats)
+
+            return final_df
+
+        except Exception as e:
+            _LOGGER.error(f"Failed during post-processing discretization:\n\tInput DF shape: {imputed_df.shape}\n\tSchema features: {len(self.ordered_features)}\n\tCategorical info keys: {list(self.cat_info.keys())}\n{e}")
+            raise
+
+    def _run_mice(self,
+                  df: pd.DataFrame,
+                  df_name: str) -> tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+        """
+        Runs the MICE kernel and applies schema-based post-processing.
+
+        Parameters:
+            df (pd.DataFrame): The input dataframe *with NaNs*. Should only contain feature columns.
+            df_name (str): The base name for the dataset.
+
+        Returns:
+            tuple[mf.ImputationKernel, list[pd.DataFrame], list[str]]:
+            - The trained MICE kernel
+            - A list of imputed and processed DataFrames
+            - A list of names for the new DataFrames
+        """
+        # Ensure input df only contains features from the schema and is in the correct order.
+        try:
+            df_feats = df[self.ordered_features]
+        except KeyError as e:
+            _LOGGER.error(f"Input DataFrame is missing required schema columns: {e}")
+            raise
+
+        # 1. Initialize kernel
+        kernel = mf.ImputationKernel(
+            data=df_feats,
+            num_datasets=self.resulting_datasets,
+            random_state=self.random_state
+        )
+
+        _LOGGER.info("➡️ Schema-based MICE imputation running...")
+
+        # 2. Perform MICE
+        kernel.mice(self.iterations)
+
+        # 3. Retrieve, process, and collect datasets
+        imputed_datasets = []
+        for i in range(self.resulting_datasets):
+            # complete_data returns a pd.DataFrame
+            completed_df = kernel.complete_data(dataset=i)
+
+            # Apply our new discretization and ordering
+            processed_df = self._post_process(completed_df)
+            imputed_datasets.append(processed_df)
+
+        if not imputed_datasets:
+            _LOGGER.error("No imputed datasets were generated.")
+            raise ValueError()
+
+        # 4. Generate names
+        if self.resulting_datasets == 1:
+            imputed_dataset_names = [f"{df_name}_MICE"]
+        else:
+            imputed_dataset_names = [f"{df_name}_MICE_{i+1}" for i in range(self.resulting_datasets)]
+
+        # 5. Validate indexes
+        for imputed_df, subname in zip(imputed_datasets, imputed_dataset_names):
+            assert imputed_df.shape[0] == df.shape[0], f"❌ Row count mismatch in dataset {subname}"
+            assert all(imputed_df.index == df.index), f"❌ Index mismatch in dataset {subname}"
+
+        _LOGGER.info("Schema-based MICE imputation complete.")
+
+        return kernel, imputed_datasets, imputed_dataset_names
+
+    def run_pipeline(self,
+                     df_path_or_dir: Union[str,Path],
+                     save_datasets_dir: Union[str,Path],
+                     save_metrics_dir: Union[str,Path],
+                     ):
+        """
+        Runs the complete MICE imputation pipeline.
+
+        This method automates the entire workflow:
+        1. Loads data from a CSV file path or a directory with CSV files.
+        2. Separates features and targets based on the `FeatureSchema`.
+        3. Runs the MICE algorithm on the feature set.
+        4. Applies schema-based post-processing to discretize categorical features.
+        5. Saves the final, processed, and imputed dataset(s) (re-joined with targets) to `save_datasets_dir`.
+        6. Generates and saves convergence and distribution plots for all imputed columns to `save_metrics_dir`.
+
+        Parameters
+        ----------
+        df_path_or_dir :[str,Path]
+            Path to a single CSV file or a directory containing multiple CSV files to impute.
+        save_datasets_dir : [str,Path]
+            Directory where the final imputed and processed dataset(s) will be saved as CSVs.
+        save_metrics_dir : [str,Path]
+            Directory where convergence and distribution plots will be saved.
+        """
+        # Check paths
+        save_datasets_path = make_fullpath(save_datasets_dir, make=True)
+        save_metrics_path = make_fullpath(save_metrics_dir, make=True)
+
+        input_path = make_fullpath(df_path_or_dir)
+        if input_path.is_file():
+            all_file_paths = [input_path]
+        else:
+            all_file_paths = list(list_csv_paths(input_path).values())
+
+        for df_path in all_file_paths:
+
+            df, df_name = load_dataframe(df_path=df_path, kind="pandas")
+
+            df_features: pd.DataFrame = df[self.schema.feature_names] # type: ignore
+            df_targets = df.drop(columns=self.schema.feature_names)
+
+            imputed_column_names = _get_na_column_names(df=df_features)
+
+            kernel, imputed_datasets, imputed_dataset_names = self._run_mice(df=df_features, df_name=df_name)
+
+            save_imputed_datasets(save_dir=save_datasets_path, imputed_datasets=imputed_datasets, df_targets=df_targets, imputed_dataset_names=imputed_dataset_names)
+
+            get_convergence_diagnostic(kernel=kernel, imputed_dataset_names=imputed_dataset_names, column_names=imputed_column_names, root_dir=save_metrics_path)
+
+            get_imputed_distributions(kernel=kernel, df_name=df_name, root_dir=save_metrics_path, column_names=imputed_column_names)
+
+
 def info():
     _script_info(__all__)
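A minimal usage sketch of the new class, based only on the constructor and run_pipeline signatures visible in the hunk above; the import paths follow the ml_tools package layout shown in SOURCES.txt, and the directory arguments are placeholders. Building the FeatureSchema itself is out of scope here and is assumed to happen elsewhere (e.g. via the data_exploration helpers).

from pathlib import Path

from ml_tools.MICE_imputation import MiceImputer
from ml_tools._schema import FeatureSchema


def impute_folder(schema: FeatureSchema, data_dir: Path, out_dir: Path) -> None:
    """Impute every CSV under data_dir with the schema-aware MICE pipeline."""
    imputer = MiceImputer(
        schema=schema,           # feature order + categorical index map
        iterations=20,           # MICE iterations (constructor default)
        resulting_datasets=1,    # number of imputed datasets per input file
        random_state=101,
    )
    # Writes imputed CSVs and convergence/distribution plots, as described
    # in run_pipeline's docstring.
    imputer.run_pipeline(
        df_path_or_dir=data_dir,
        save_datasets_dir=out_dir / "datasets",
        save_metrics_dir=out_dir / "metrics",
    )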
{dragon_ml_toolbox-13.2.0 → dragon_ml_toolbox-14.1.0}/ml_tools/ML_datasetmaster.py
RENAMED
@@ -1,13 +1,10 @@
 import torch
-from torch.utils.data import Dataset, Subset
+from torch.utils.data import Dataset
 import pandas
 import numpy
 from sklearn.model_selection import train_test_split
 from typing import Literal, Union, Tuple, List, Optional
 from abc import ABC, abstractmethod
-from PIL import Image, ImageOps
-from torchvision.datasets import ImageFolder
-from torchvision import transforms
 import matplotlib.pyplot as plt
 from pathlib import Path

@@ -23,9 +20,7 @@ from ._schema import FeatureSchema
 __all__ = [
     "DatasetMaker",
     "DatasetMakerMulti",
-    "VisionDatasetMaker",
-    "SequenceMaker",
-    "ResizeAspectFill",
+    "SequenceMaker"
 ]


@@ -126,8 +121,8 @@ class _BaseDatasetMaker(ABC):
         else:
             _LOGGER.info("No continuous features listed in schema. Scaler will not be fitted.")

-        X_train_values = X_train.values
-        X_test_values = X_test.values
+        X_train_values = X_train.to_numpy()
+        X_test_values = X_test.to_numpy()

         # continuous_feature_indices is derived
         if self.scaler is None and continuous_feature_indices:
@@ -253,26 +248,42 @@ class DatasetMaker(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  schema: FeatureSchema,
                  kind: Literal["regression", "classification"],
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
                 The pre-processed input DataFrame containing all columns. (features and single target).
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
-            kind (
+            kind ("regression" | "classification"):
                 The type of ML task. This determines the data type of the labels.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number of generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance, if None a new scaler will be created.
+
         """
         super().__init__()
-
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None # To be created
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()

         # --- 1. Identify features (from schema) ---
         self._feature_names = list(schema.feature_names)
@@ -310,9 +321,14 @@ class DatasetMaker(_BaseDatasetMaker):
         label_dtype = torch.float32 if kind == "regression" else torch.int64

         # --- 4. Scale (using the schema) ---
-
-
-
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         self._train_ds = _PytorchDataset(X_train_final, y_train, labels_dtype=label_dtype, feature_names=self._feature_names, target_names=self._target_names)
@@ -336,9 +352,9 @@ class DatasetMakerMulti(_BaseDatasetMaker):
                  pandas_df: pandas.DataFrame,
                  target_columns: List[str],
                  schema: FeatureSchema,
+                 scaler: Union[Literal["fit"], Literal["none"], PytorchScaler],
                  test_size: float = 0.2,
-                 random_state: int = 42,
-                 scaler: Optional[PytorchScaler] = None):
+                 random_state: int = 42):
         """
         Args:
             pandas_df (pandas.DataFrame):
@@ -348,20 +364,35 @@
                 List of target column names.
             schema (FeatureSchema):
                 The definitive schema object from data_exploration.
+            scaler ("fit" | "none" | PytorchScaler):
+                Strategy for data scaling:
+                - "fit": Fit a new PytorchScaler on continuous features.
+                - "none": Do not scale data (e.g., for TabularTransformer).
+                - PytorchScaler instance: Use a pre-fitted scaler to transform data.
             test_size (float):
                 The proportion of the dataset to allocate to the test split.
             random_state (int):
                 The seed for the random number generator for reproducibility.
-            scaler (PytorchScaler | None):
-                A pre-fitted PytorchScaler instance.

         ## Note:
         For multi-binary classification, the most common PyTorch loss function is nn.BCEWithLogitsLoss.
         This loss function requires the labels to be torch.float32 which is the same type required for regression (multi-regression) tasks.
         """
         super().__init__()
-
-
+
+
+        _apply_scaling: bool = False
+        if scaler == "fit":
+            self.scaler = None
+            _apply_scaling = True
+        elif scaler == "none":
+            self.scaler = None
+        elif isinstance(scaler, PytorchScaler):
+            self.scaler = scaler # Use the provided one
+            _apply_scaling = True
+        else:
+            _LOGGER.error(f"Invalid 'scaler' argument. Must be 'fit', 'none', or a PytorchScaler instance.")
+            raise ValueError()

         # --- 1. Get features and targets from schema/args ---
         self._feature_names = list(schema.feature_names)
         self._target_names = target_columns
@@ -403,9 +434,14 @@ class DatasetMakerMulti(_BaseDatasetMaker):
         label_dtype = torch.float32

         # --- 4. Scale (using the schema) ---
-
-
-
+        if _apply_scaling:
+            X_train_final, X_test_final = self._prepare_scaler(
+                X_train, y_train, X_test, label_dtype, schema
+            )
+        else:
+            _LOGGER.info("Features have not been scaled as specified.")
+            X_train_final = X_train.to_numpy()
+            X_test_final = X_test.to_numpy()

         # --- 5. Create Datasets ---
         # _PytorchDataset now correctly handles y_train (a DataFrame)
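A short sketch of the new required scaler argument, based on the constructor signatures and docstrings in the hunks above; the DataFrame contents, target column names, and the helper function are placeholders, and the import paths follow the ml_tools layout shown in SOURCES.txt.

import pandas as pd

from ml_tools.ML_datasetmaster import DatasetMaker, DatasetMakerMulti
from ml_tools._schema import FeatureSchema


def build_datasets(df: pd.DataFrame, schema: FeatureSchema):
    # Single-target task: fit a fresh PytorchScaler on the continuous features.
    single = DatasetMaker(
        pandas_df=df,
        schema=schema,
        kind="regression",
        scaler="fit",
        test_size=0.2,
        random_state=42,
    )
    # Multi-target task: skip scaling, e.g. when feeding a TabularTransformer.
    multi = DatasetMakerMulti(
        pandas_df=df,
        target_columns=["target_a", "target_b"],  # hypothetical column names
        schema=schema,
        scaler="none",
    )
    return single, multi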
@@ -432,149 +468,6 @@ class _BaseMaker(ABC):
         pass


-# --- VisionDatasetMaker ---
-class VisionDatasetMaker(_BaseMaker):
-    """
-    Creates processed PyTorch datasets for computer vision tasks from an
-    image folder directory.
-
-    Uses online augmentations per epoch (image augmentation without creating new files).
-    """
-    def __init__(self, full_dataset: ImageFolder):
-        super().__init__()
-        self.full_dataset = full_dataset
-        self.labels = [s[1] for s in self.full_dataset.samples]
-        self.class_map = full_dataset.class_to_idx
-
-        self._is_split = False
-        self._are_transforms_configured = False
-
-    @classmethod
-    def from_folder(cls, root_dir: str) -> 'VisionDatasetMaker':
-        """Creates a maker instance from a root directory of images."""
-        initial_transform = transforms.Compose([transforms.ToTensor()])
-        full_dataset = ImageFolder(root=root_dir, transform=initial_transform)
-        _LOGGER.info(f"Found {len(full_dataset)} images in {len(full_dataset.classes)} classes.")
-        return cls(full_dataset)
-
-    @staticmethod
-    def inspect_folder(path: Union[str, Path]):
-        """
-        Logs a report of the types, sizes, and channels of image files
-        found in the directory and its subdirectories.
-        """
-        path_obj = make_fullpath(path)
-
-        non_image_files = set()
-        img_types = set()
-        img_sizes = set()
-        img_channels = set()
-        img_counter = 0
-
-        _LOGGER.info(f"Inspecting folder: {path_obj}...")
-        # Use rglob to recursively find all files
-        for filepath in path_obj.rglob('*'):
-            if filepath.is_file():
-                try:
-                    # Using PIL to open is a more reliable check
-                    with Image.open(filepath) as img:
-                        img_types.add(img.format)
-                        img_sizes.add(img.size)
-                        img_channels.update(img.getbands())
-                        img_counter += 1
-                except (IOError, SyntaxError):
-                    non_image_files.add(filepath.name)
-
-        if non_image_files:
-            _LOGGER.warning(f"Non-image or corrupted files found and ignored: {non_image_files}")
-
-        report = (
-            f"\n--- Inspection Report for '{path_obj.name}' ---\n"
-            f"Total images found: {img_counter}\n"
-            f"Image formats: {img_types or 'None'}\n"
-            f"Image sizes (WxH): {img_sizes or 'None'}\n"
-            f"Image channels (bands): {img_channels or 'None'}\n"
-            f"--------------------------------------"
-        )
-        print(report)
-
-    def split_data(self, val_size: float = 0.2, test_size: float = 0.0,
-                   stratify: bool = True, random_state: Optional[int] = None) -> 'VisionDatasetMaker':
-        """Splits the dataset into training, validation, and optional test sets."""
-        if self._is_split:
-            _LOGGER.warning("Data has already been split.")
-            return self
-
-        if val_size + test_size >= 1.0:
-            _LOGGER.error("The sum of val_size and test_size must be less than 1.")
-            raise ValueError()
-
-        indices = list(range(len(self.full_dataset)))
-        labels_for_split = self.labels if stratify else None
-
-        train_indices, val_test_indices = train_test_split(
-            indices, test_size=(val_size + test_size), random_state=random_state, stratify=labels_for_split
-        )
-
-        if test_size > 0:
-            val_test_labels = [self.labels[i] for i in val_test_indices]
-            stratify_val_test = val_test_labels if stratify else None
-            val_indices, test_indices = train_test_split(
-                val_test_indices, test_size=(test_size / (val_size + test_size)),
-                random_state=random_state, stratify=stratify_val_test
-            )
-            self._test_dataset = Subset(self.full_dataset, test_indices)
-            _LOGGER.info(f"Test set created with {len(self._test_dataset)} images.")
-        else:
-            val_indices = val_test_indices
-
-        self._train_dataset = Subset(self.full_dataset, train_indices)
-        self._val_dataset = Subset(self.full_dataset, val_indices)
-        self._is_split = True
-
-        _LOGGER.info(f"Data split into: \n- Training: {len(self._train_dataset)} images \n- Validation: {len(self._val_dataset)} images")
-        return self
-
-    def configure_transforms(self, resize_size: int = 256, crop_size: int = 224,
-                             mean: List[float] = [0.485, 0.456, 0.406],
-                             std: List[float] = [0.229, 0.224, 0.225],
-                             extra_train_transforms: Optional[List] = None) -> 'VisionDatasetMaker':
-        """Configures and applies the image transformations (augmentations)."""
-        if not self._is_split:
-            _LOGGER.error("Transforms must be configured AFTER splitting data. Call .split_data() first.")
-            raise RuntimeError()
-
-        base_train_transforms = [transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip()]
-        if extra_train_transforms:
-            base_train_transforms.extend(extra_train_transforms)
-
-        final_transforms = [transforms.ToTensor(), transforms.Normalize(mean=mean, std=std)]
-
-        val_transform = transforms.Compose([transforms.Resize(resize_size), transforms.CenterCrop(crop_size), *final_transforms])
-        train_transform = transforms.Compose([*base_train_transforms, *final_transforms])
-
-        self._train_dataset.dataset.transform = train_transform # type: ignore
-        self._val_dataset.dataset.transform = val_transform # type: ignore
-        if self._test_dataset:
-            self._test_dataset.dataset.transform = val_transform # type: ignore
-
-        self._are_transforms_configured = True
-        _LOGGER.info("Image transforms configured and applied.")
-        return self
-
-    def get_datasets(self) -> Tuple[Dataset, ...]:
-        """Returns the final train, validation, and optional test datasets."""
-        if not self._is_split:
-            _LOGGER.error("Data has not been split. Call .split_data() first.")
-            raise RuntimeError()
-        if not self._are_transforms_configured:
-            _LOGGER.warning("Transforms have not been configured. Using default ToTensor only.")
-
-        if self._test_dataset:
-            return self._train_dataset, self._val_dataset, self._test_dataset
-        return self._train_dataset, self._val_dataset
-
-
 # --- SequenceMaker ---
 class SequenceMaker(_BaseMaker):
     """
@@ -763,40 +656,5 @@ class SequenceMaker(_BaseMaker):
         return self._train_dataset, self._test_dataset


-# --- Custom Vision Transform Class ---
-class ResizeAspectFill:
-    """
-    Custom transformation to make an image square by padding it to match the
-    longest side, preserving the aspect ratio. The image is finally centered.
-
-    Args:
-        pad_color (Union[str, int]): Color to use for the padding.
-            Defaults to "black".
-    """
-    def __init__(self, pad_color: Union[str, int] = "black") -> None:
-        self.pad_color = pad_color
-
-    def __call__(self, image: Image.Image) -> Image.Image:
-        if not isinstance(image, Image.Image):
-            _LOGGER.error(f"Expected PIL.Image.Image, got {type(image).__name__}")
-            raise TypeError()
-
-        w, h = image.size
-        if w == h:
-            return image
-
-        # Determine padding to center the image
-        if w > h:
-            top_padding = (w - h) // 2
-            bottom_padding = w - h - top_padding
-            padding = (0, top_padding, 0, bottom_padding)
-        else: # h > w
-            left_padding = (h - w) // 2
-            right_padding = h - w - left_padding
-            padding = (left_padding, 0, right_padding, 0)
-
-        return ImageOps.expand(image, padding, fill=self.pad_color)
-
-
 def info():
     _script_info(__all__)