dragon-ml-toolbox 13.0.0__py3-none-any.whl → 14.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/METADATA +12 -2
- dragon_ml_toolbox-14.7.0.dist-info/RECORD +49 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
- ml_tools/MICE_imputation.py +207 -5
- ml_tools/ML_configuration.py +108 -0
- ml_tools/ML_datasetmaster.py +241 -260
- ml_tools/ML_evaluation.py +229 -76
- ml_tools/ML_evaluation_multi.py +45 -16
- ml_tools/ML_inference.py +0 -1
- ml_tools/ML_models.py +135 -55
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +49 -36
- ml_tools/ML_trainer.py +498 -29
- ml_tools/ML_utilities.py +351 -4
- ml_tools/ML_vision_datasetmaster.py +1492 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +641 -0
- ml_tools/ML_vision_transformers.py +203 -0
- ml_tools/PSO_optimization.py +5 -1
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_schema.py +96 -0
- ml_tools/custom_logger.py +37 -14
- ml_tools/data_exploration.py +576 -138
- ml_tools/ensemble_evaluation.py +53 -10
- ml_tools/keys.py +43 -1
- ml_tools/math_utilities.py +1 -1
- ml_tools/optimization_tools.py +65 -86
- ml_tools/serde.py +78 -17
- ml_tools/utilities.py +192 -3
- dragon_ml_toolbox-13.0.0.dist-info/RECORD +0 -41
- ml_tools/ML_simple_optimization.py +0 -413
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-13.0.0.dist-info → dragon_ml_toolbox-14.7.0.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_evaluation.py
CHANGED
```diff
@@ -112,7 +112,7 @@ def evaluate_model_classification(
     report_df = pd.DataFrame(report_dict).iloc[:-1, :].T
     plt.figure(figsize=figsize)
     sns.heatmap(report_df, annot=True, cmap=heatmap_cmap, fmt='.2f',
-                annot_kws={"size": base_fontsize - 4})
+                annot_kws={"size": base_fontsize - 4}, vmin=0.0, vmax=1.0)
     plt.title(f"{model_name} - {target_name}", fontsize=base_fontsize)
     plt.xticks(fontsize=base_fontsize - 2)
     plt.yticks(fontsize=base_fontsize - 2)
@@ -133,6 +133,7 @@ def evaluate_model_classification(
         normalize="true",
         ax=ax
     )
+    disp.im_.set_clim(vmin=0.0, vmax=1.0)
 
     ax.set_title(f"{model_name} - {target_name}", fontsize=base_fontsize)
     ax.tick_params(axis='both', labelsize=base_fontsize)
```
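Both changes above pin the colour scale of the metric plots to the [0, 1] range, so heatmaps and normalized confusion matrices stay visually comparable across models. A minimal sketch of the same idea with toy data (not the toolbox function itself):

```python
# Minimal sketch (toy data, not the toolbox function): pinning the colour scale
# to [0, 1] makes metric heatmaps and normalized confusion matrices comparable
# across models, since a score of 0.5 always maps to the same colour.
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])   # toy labels
y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1])   # toy predictions

# Heatmap of illustrative per-class scores with a fixed [0, 1] colour range
scores = np.array([[0.67, 0.67, 0.67], [0.80, 0.80, 0.80]])
sns.heatmap(scores, annot=True, fmt='.2f', vmin=0.0, vmax=1.0)
plt.show()

# Normalized confusion matrix: clamp the image colour limits the same way
disp = ConfusionMatrixDisplay.from_predictions(y_true, y_pred, normalize="true")
disp.im_.set_clim(vmin=0.0, vmax=1.0)
plt.show()
```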
```diff
@@ -327,7 +328,8 @@ def plot_calibration_curve(
     target_name: str,
     figure_size: tuple = (10, 10),
     base_fontsize: int = 24,
-    n_bins: int = 15
+    n_bins: int = 15,
+    line_color: str = 'darkorange'
 ) -> plt.Figure: # type: ignore
     """
     Plots the calibration curve (reliability diagram) for a classifier.
@@ -348,22 +350,63 @@ def plot_calibration_curve(
     """
     fig, ax = plt.subplots(figsize=figure_size)
 
-
-
-
-
-
-
+    # --- Step 1: Get probabilities from the estimator ---
+    # We do this manually so we can pass them to from_predictions
+    try:
+        y_prob = model.predict_proba(x_test)
+        # Use probabilities for the positive class (assuming binary)
+        y_score = y_prob[:, 1]
+    except Exception as e:
+        _LOGGER.error(f"Could not get probabilities from model: {e}")
+        plt.close(fig)
+        return fig  # Return empty figure
+
+    # --- Step 2: Get binned data *without* plotting ---
+    with plt.ioff():
+        fig_temp, ax_temp = plt.subplots()
+        cal_display_temp = CalibrationDisplay.from_predictions(
+            y_test,
+            y_score,
+            n_bins=n_bins,
+            ax=ax_temp,
+            name="temp"
+        )
+        line_x, line_y = cal_display_temp.line_.get_data()  # type: ignore
+        plt.close(fig_temp)
+
+    # --- Step 3: Build the plot from scratch on ax ---
+
+    # 3a. Plot the ideal diagonal line
+    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+
+    # 3b. Use regplot for the regression line and its CI
+    sns.regplot(
+        x=line_x,
+        y=line_y,
+        ax=ax,
+        scatter=False,  # No scatter dots
+        label=f"Calibration Curve ({n_bins} bins)",
+        line_kws={
+            'color': line_color,
+            'linestyle': '--',
+            'linewidth': 2
+        }
     )
 
+    # --- Step 4: Apply original formatting ---
     ax.set_title(f"{model_name} - Reliability Curve for {target_name}", fontsize=base_fontsize)
     ax.tick_params(axis='both', labelsize=base_fontsize - 2)
     ax.set_xlabel("Mean Predicted Probability", fontsize=base_fontsize)
     ax.set_ylabel("Fraction of Positives", fontsize=base_fontsize)
-
+
+    # Set limits
+    ax.set_ylim(0.0, 1.0)
+    ax.set_xlim(0.0, 1.0)
+
+    ax.legend(fontsize=base_fontsize - 4, loc='lower right')
     fig.tight_layout()
 
-    # Save figure
+    # --- Step 5: Save figure (using original logic) ---
     save_path = make_fullpath(save_dir, make=True)
     sanitized_target_name = sanitize_filename(target_name)
     full_save_path = save_path / f"Calibration_Plot_{sanitized_target_name}.svg"
```
ml_tools/keys.py
CHANGED
```diff
@@ -36,6 +36,7 @@ class PyTorchInferenceKeys:
     # For classification tasks
     LABELS = "labels"
     PROBABILITIES = "probabilities"
+    LABEL_NAMES = "label_names"
 
 
 class PytorchModelArchitectureKeys:
@@ -55,10 +56,13 @@ class PytorchArtifactPathKeys:
 
 
 class DatasetKeys:
-    """Keys for saving dataset artifacts"""
+    """Keys for saving dataset artifacts. Also used by FeatureSchema"""
    FEATURE_NAMES = "feature_names"
    TARGET_NAMES = "target_names"
    SCALER_PREFIX = "scaler_"
+    # Feature Schema
+    CONTINUOUS_NAMES = "continuous_feature_names"
+    CATEGORICAL_NAMES = "categorical_feature_names"
 
 
 class SHAPKeys:
@@ -77,6 +81,44 @@ class PyTorchCheckpointKeys:
     BEST_SCORE = "best_score"
 
 
+class UtilityKeys:
+    """Keys used for utility modules"""
+    MODEL_PARAMS_FILE = "model_parameters"
+    TOTAL_PARAMS = "Total Parameters"
+    TRAINABLE_PARAMS = "Trainable Parameters"
+    PTH_FILE = "pth report "
+    MODEL_ARCHITECTURE_FILE = "model_architecture_summary"
+
+
+class VisionKeys:
+    """For vision ML metrics"""
+    SEGMENTATION_REPORT = "segmentation_report"
+    SEGMENTATION_HEATMAP = "segmentation_metrics_heatmap"
+    SEGMENTATION_CONFUSION_MATRIX = "segmentation_confusion_matrix"
+    # Object detection
+    OBJECT_DETECTION_REPORT = "object_detection_report"
+
+
+class VisionTransformRecipeKeys:
+    """Defines the key names for the transform recipe JSON file."""
+    TASK = "task"
+    PIPELINE = "pipeline"
+    NAME = "name"
+    KWARGS = "kwargs"
+    PRE_TRANSFORMS = "pre_transforms"
+
+    RESIZE_SIZE = "resize_size"
+    CROP_SIZE = "crop_size"
+    MEAN = "mean"
+    STD = "std"
+
+
+class ObjectDetectionKeys:
+    """Used by the object detection dataset"""
+    BOXES = "boxes"
+    LABELS = "labels"
+
+
 class _OneHotOtherPlaceholder:
     """Used internally by GUI_tools."""
     OTHER_GUI = "OTHER"
```
ml_tools/math_utilities.py
CHANGED
```diff
@@ -219,7 +219,7 @@ def discretize_categorical_values(
         _LOGGER.error(f"'categorical_info' is not a dictionary, or is empty.")
         raise ValueError()
 
-    _, total_features =
+    _, total_features = working_array.shape
     for col_idx, cardinality in categorical_info.items():
         if not isinstance(col_idx, int):
             _LOGGER.error(f"Column index key {col_idx} is not an integer.")
```
ml_tools/optimization_tools.py
CHANGED
```diff
@@ -9,6 +9,7 @@ from .utilities import yield_dataframes_from_dir
 from ._logger import _LOGGER
 from ._script_info import _script_info
 from .SQL import DatabaseManager
+from ._schema import FeatureSchema
 
 
 __all__ = [
@@ -19,35 +20,25 @@ __all__ = [
 
 
 def create_optimization_bounds(
-
+    schema: FeatureSchema,
     continuous_bounds_map: Dict[str, Tuple[float, float]],
-    categorical_map: Dict[int, int],
-    target_column: Optional[str] = None,
     start_at_zero: bool = True
 ) -> Tuple[List[float], List[float]]:
     """
-    Generates the lower and upper bounds lists for the optimizer from a
+    Generates the lower and upper bounds lists for the optimizer from a FeatureSchema.
 
     This helper function automates the creation of unbiased bounds for
     categorical features and combines them with user-defined bounds for
-    continuous features
-
-    It reads *only* the header of the provided CSV to determine the full
-    list of feature columns and their order, excluding the specified target.
-    This is memory-efficient as the full dataset is not loaded.
+    continuous features, using the schema as the single source of truth
+    for feature order and type.
 
     Args:
-
-
-
+        schema (FeatureSchema):
+            The definitive schema object created by
+            `data_exploration.finalize_feature_schema()`.
         continuous_bounds_map (Dict[str, Tuple[float, float]]):
             A dictionary mapping the *name* of each **continuous** feature
             to its (min_bound, max_bound) tuple.
-        categorical_map (Dict[int, int]):
-            The map from the *index* of each **categorical** feature to its cardinality.
-            (e.g., {2: 4} for a feature at index 2 with 4 categories).
-        target_column (Optional[str], optional):
-            The name of the target column to exclude. If None (default), the *last column* in the CSV is assumed to be the target.
         start_at_zero (bool):
            - If True, assumes categorical encoding is [0, 1, ..., k-1].
              Bounds will be set as [-0.5, k - 0.5].
@@ -59,98 +50,86 @@ def create_optimization_bounds(
         A tuple containing two lists: (lower_bounds, upper_bounds).
 
     Raises:
-        ValueError: If a feature is
-
-
+        ValueError: If a feature is missing from `continuous_bounds_map`
+                    or if a feature name in the map is not a
+                    continuous feature according to the schema.
     """
-    # 1.
-
-
-        df_header = pd.read_csv(full_csv_path, nrows=0, encoding="utf-8")
-    except Exception as e:
-        _LOGGER.error(f"Failed to read header from CSV: {e}")
-        raise
-
-    all_column_names = df_header.columns.to_list()
-    feature_names: List[str] = []
-
-    if target_column is None:
-        feature_names = all_column_names[:-1]
-        excluded_target = all_column_names[-1]
-        _LOGGER.info(f"No target_column provided. Assuming last column '{excluded_target}' is the target.")
-    else:
-        if target_column not in all_column_names:
-            _LOGGER.error(f"Target column '{target_column}' not found in CSV header.")
-            raise ValueError()
-        feature_names = [name for name in all_column_names if name != target_column]
-        _LOGGER.info(f"Excluding target column '{target_column}'.")
-
-    # 2. Initialize bound lists
+    # 1. Get feature names and map from schema
+    feature_names = schema.feature_names
+    categorical_index_map = schema.categorical_index_map
     total_features = len(feature_names)
+
     if total_features <= 0:
-        _LOGGER.error("
+        _LOGGER.error("Schema contains no features.")
         raise ValueError()
+
+    _LOGGER.info(f"Generating bounds for {total_features} total features...")
 
+    # 2. Initialize bound lists
     lower_bounds: List[Optional[float]] = [None] * total_features
     upper_bounds: List[Optional[float]] = [None] * total_features
-
-    _LOGGER.info(f"Generating bounds for {total_features} total features...")
 
     # 3. Populate categorical bounds (Index-based)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        upper_bounds[index] = high
+    if categorical_index_map:
+        for index, cardinality in categorical_index_map.items():
+            if not (0 <= index < total_features):
+                _LOGGER.error(f"Categorical index {index} is out of range for the {total_features} features.")
+                raise ValueError()
+
+            if start_at_zero:
+                # Rule for [0, k-1]: bounds are [-0.5, k - 0.5]
+                low = -0.5
+                high = float(cardinality) - 0.5
+            else:
+                # Rule for [1, k]: bounds are [0.5, k + 0.5]
+                low = 0.5
+                high = float(cardinality) + 0.5
+
+            lower_bounds[index] = low
+            upper_bounds[index] = high
 
-
+        _LOGGER.info(f"Automatically set bounds for {len(categorical_index_map)} categorical features.")
+    else:
+        _LOGGER.info("No categorical features found in schema.")
 
     # 4. Populate continuous bounds (Name-based)
+    # Use schema.continuous_feature_names for robust checking
+    continuous_names_set = set(schema.continuous_feature_names)
+
+    if continuous_names_set != set(continuous_bounds_map.keys()):
+        missing_in_map = continuous_names_set - set(continuous_bounds_map.keys())
+        if missing_in_map:
+            _LOGGER.error(f"The following continuous features are missing from 'continuous_bounds_map': {list(missing_in_map)}")
+
+        extra_in_map = set(continuous_bounds_map.keys()) - continuous_names_set
+        if extra_in_map:
+            _LOGGER.error(f"The following features in 'continuous_bounds_map' are not defined as continuous in the schema: {list(extra_in_map)}")
+
+        raise ValueError("Mismatch between 'continuous_bounds_map' and schema's continuous features.")
+
     count_continuous = 0
     for name, (low, high) in continuous_bounds_map.items():
-
-
-
-
-            _LOGGER.warning(f"Feature name '{name}' from 'continuous_bounds_map' not found in the CSV's feature columns.")
-            continue
-
+        # Map name to its index in the *feature-only* list
+        # This is guaranteed to be correct by the schema
+        index = feature_names.index(name)
+
         if lower_bounds[index] is not None:
-            # This
-            _LOGGER.error(f"Feature '{name}' (at index {index}) is defined
+            # This should be impossible if schema is correct, but good to check
+            _LOGGER.error(f"Schema conflict: Feature '{name}' (at index {index}) is defined as both continuous and categorical.")
             raise ValueError()
-
+
         lower_bounds[index] = float(low)
         upper_bounds[index] = float(high)
         count_continuous += 1
 
     _LOGGER.info(f"Manually set bounds for {count_continuous} continuous features.")
 
-    # 5. Validation
-
-
-        if lower_bounds[i] is None:
-            missing_indices.append(i)
-
-    if missing_indices:
+    # 5. Final Validation (all Nones should be filled)
+    if None in lower_bounds:
+        missing_indices = [i for i, b in enumerate(lower_bounds) if b is None]
         missing_names = [feature_names[i] for i in missing_indices]
-        _LOGGER.error(f"
-        raise
-
-        # _LOGGER.info("All bounds successfully created.")
+        _LOGGER.error(f"Failed to create all bounds. This indicates an internal logic error. Missing: {missing_names}")
+        raise RuntimeError("Internal error: Not all bounds were populated.")
 
     # Cast to float lists, as 'None' sentinels are gone
     return (
```
ml_tools/serde.py
CHANGED
```diff
@@ -6,15 +6,22 @@ from pathlib import Path
 from .path_manager import make_fullpath, sanitize_filename
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from ._schema import FeatureSchema
 
 
 __all__ = [
     "serialize_object_filename",
     "serialize_object",
     "deserialize_object",
+    "serialize_schema",
+    "deserialize_schema"
 ]
 
 
+# Base types that have a generic `type()` log.
+_SIMPLE_TYPES = (list, dict, tuple, set, str, int, float, bool)
+
+
 def serialize_object_filename(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
     """
     Serializes a Python object using joblib; suitable for Python built-ins, numpy, and pandas.
@@ -24,22 +31,25 @@ def serialize_object_filename(obj: Any, save_dir: Union[str,Path], filename: str, verbose: bool=True, raise_on_error: bool=False) -> None:
         save_dir (str | Path) : Directory path where the serialized object will be saved.
         filename (str) : Name for the output file, extension will be appended if needed.
     """
+    if obj is None:
+        _LOGGER.warning(f"Attempted to serialize a None object. Skipping save for '{filename}'.")
+        return
+
     try:
-        save_path = make_fullpath(save_dir, make=True)
+        save_path = make_fullpath(save_dir, make=True, enforce="directory")
         sanitized_name = sanitize_filename(filename)
-        if not sanitized_name.endswith('.joblib'):
-            sanitized_name = sanitized_name + ".joblib"
         full_path = save_path / sanitized_name
-
-
-        _LOGGER.error(f"Failed to serialize object of type '{type(obj)}'.")
+    except (IOError, OSError, TypeError) as e:
+        _LOGGER.error(f"Failed to construct save path from dir='{save_dir}' and filename='{filename}'. Error: {e}")
         if raise_on_error:
             raise e
         return None
-
-
-
-
+
+    # call serialize_object with the fully constructed path.
+    serialize_object(obj=obj,
+                     file_path=full_path,
+                     verbose=verbose,
+                     raise_on_error=raise_on_error)
 
 
 def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_error: bool = False) -> None:
@@ -54,10 +64,13 @@ def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_error: bool = False) -> None:
         '.joblib' extension will be appended if missing.
         raise_on_error (bool) : If True, raises exceptions on failure.
     """
+    if obj is None:
+        _LOGGER.warning(f"Attempted to serialize a None object. Skipping save for '{file_path}'.")
+        return
+
     try:
         # Ensure the extension is correct
-
-        file_path = file_path.with_suffix(file_path.suffix + '.joblib')
+        file_path = file_path.with_suffix('.joblib')
 
         # Ensure the parent directory exists
         _save_dir = make_fullpath(file_path.parent, make=True, enforce="directory")
@@ -72,7 +85,11 @@ def serialize_object(obj: Any, file_path: Path, verbose: bool = True, raise_on_error: bool = False) -> None:
             return None
         else:
             if verbose:
-
+                if type(obj) in _SIMPLE_TYPES:
+                    _LOGGER.info(f"Object of type '{type(obj)}' saved to '{file_path}'")
+                else:
+                    _LOGGER.info(f"Object '{obj}' saved to '{file_path}'")
+
             return None
 
 
@@ -116,16 +133,60 @@ def deserialize_object(
     # Can't do an isinstance check on 'Any', skip it.
     if type_to_check is not Any and not isinstance(obj, type_to_check):
         error_msg = (
-            f"Type mismatch: Expected an instance of '{expected_type}', "
-            f"but found '{type(obj)}' in '{true_filepath}'."
+            f"Type mismatch: Expected an instance of '{expected_type}', but found '{type(obj)}' in '{true_filepath}'."
         )
         _LOGGER.error(error_msg)
         raise TypeError()
 
     if verbose:
-
+        # log special objects
+        if type(obj) in _SIMPLE_TYPES:
+            _LOGGER.info(f"Loaded object of type '{type(obj)}' from '{true_filepath}'.")
+        else:
+            _LOGGER.info(f"Loaded object '{obj}' from '{true_filepath}'.")
 
-    return obj
+    return obj  # type: ignore
+
+
+def serialize_schema(schema: FeatureSchema, file_path: Path):
+    """
+    Serializes a FeatureSchema object to a .joblib file.
+
+    This is a high-level wrapper around `serialize_object` that
+    specifically handles `FeatureSchema` instances and ensures
+    errors are raised on failure.
+
+    Args:
+        schema (FeatureSchema): The schema object to serialize.
+        file_path (Path): The full file path to save the schema to.
+    """
+    serialize_object(obj=schema,
+                     file_path=file_path,
+                     verbose=True,
+                     raise_on_error=True)
+
+
+def deserialize_schema(file_path: Path):
+    """
+    Deserializes a FeatureSchema object from a .joblib file.
+
+    This is a high-level wrapper around `deserialize_object` that
+    validates the loaded object is an instance of `FeatureSchema`.
+
+    Args:
+        file_path (Path): The full file path of the serialized schema.
+
+    Returns:
+        FeatureSchema: The deserialized schema object.
+
+    Raises:
+        TypeError: If the deserialized object is not an instance of `FeatureSchema`.
+    """
+    schema = deserialize_object(filepath=file_path,
+                                expected_type=FeatureSchema,
+                                verbose=True)
+    return schema
+
 
 def info():
     _script_info(__all__)
```