dragon-ml-toolbox 10.2.0__py3-none-any.whl → 14.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dragon-ml-toolbox might be problematic.
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/METADATA +38 -63
- dragon_ml_toolbox-14.2.0.dist-info/RECORD +48 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE +1 -1
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +11 -0
- ml_tools/ETL_cleaning.py +72 -34
- ml_tools/ETL_engineering.py +506 -70
- ml_tools/GUI_tools.py +2 -1
- ml_tools/MICE_imputation.py +212 -7
- ml_tools/ML_callbacks.py +73 -40
- ml_tools/ML_datasetmaster.py +267 -284
- ml_tools/ML_evaluation.py +119 -58
- ml_tools/ML_evaluation_multi.py +107 -32
- ml_tools/ML_inference.py +15 -5
- ml_tools/ML_models.py +234 -170
- ml_tools/ML_models_advanced.py +323 -0
- ml_tools/ML_optimization.py +321 -97
- ml_tools/ML_scaler.py +10 -5
- ml_tools/ML_trainer.py +585 -40
- ml_tools/ML_utilities.py +528 -0
- ml_tools/ML_vision_datasetmaster.py +1315 -0
- ml_tools/ML_vision_evaluation.py +260 -0
- ml_tools/ML_vision_inference.py +428 -0
- ml_tools/ML_vision_models.py +627 -0
- ml_tools/ML_vision_transformers.py +58 -0
- ml_tools/PSO_optimization.py +10 -7
- ml_tools/RNN_forecast.py +2 -0
- ml_tools/SQL.py +22 -9
- ml_tools/VIF_factor.py +4 -3
- ml_tools/_ML_vision_recipe.py +88 -0
- ml_tools/__init__.py +1 -0
- ml_tools/_logger.py +0 -2
- ml_tools/_schema.py +96 -0
- ml_tools/constants.py +79 -0
- ml_tools/custom_logger.py +164 -16
- ml_tools/data_exploration.py +1092 -109
- ml_tools/ensemble_evaluation.py +48 -1
- ml_tools/ensemble_inference.py +6 -7
- ml_tools/ensemble_learning.py +4 -3
- ml_tools/handle_excel.py +1 -0
- ml_tools/keys.py +80 -0
- ml_tools/math_utilities.py +259 -0
- ml_tools/optimization_tools.py +198 -24
- ml_tools/path_manager.py +144 -45
- ml_tools/serde.py +192 -0
- ml_tools/utilities.py +287 -227
- dragon_ml_toolbox-10.2.0.dist-info/RECORD +0 -36
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-10.2.0.dist-info → dragon_ml_toolbox-14.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ensemble_evaluation.py
CHANGED

@@ -25,6 +25,7 @@ from typing import Union, Optional, Literal
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
+from .keys import SHAPKeys
 
 
 __all__ = [
@@ -472,7 +473,7 @@ def get_shap_values(
         save_dir: Directory to save visualizations.
     """
     sanitized_target_name = sanitize_filename(target_name)
-    global_save_path = make_fullpath(save_dir, make=True)
+    global_save_path = make_fullpath(save_dir, make=True, enforce="directory")
 
     def _apply_plot_style():
         styles = ['seaborn', 'seaborn-v0_8-darkgrid', 'seaborn-v0_8', 'default']
@@ -539,6 +540,15 @@ def get_shap_values(
                     plot_type=plot_type,
                     title=f"{model_name} - {target_name} (Class {class_name})"
                 )
+
+            # Save the summary data for the current class
+            summary_save_path = global_save_path / f"SHAP_{sanitized_target_name}_{class_name}.csv"
+            _save_summary_csv(
+                shap_values_for_summary=class_shap,
+                feature_names=feature_names,
+                save_path=summary_save_path
+            )
+
     else:
         values = shap_values[1] if isinstance(shap_values, list) else shap_values
         for plot_type in ["bar", "dot"]:
@@ -549,6 +559,15 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data for the positive class
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
 
     def _plot_for_regression(shap_values):
         for plot_type in ["bar", "dot"]:
@@ -559,6 +578,34 @@ def get_shap_values(
                 plot_type=plot_type,
                 title=f"{model_name} - {target_name}"
             )
+
+        # Save the summary data to a CSV file
+        shap_summary_filename = SHAPKeys.SAVENAME + ".csv"
+        summary_save_path = global_save_path / shap_summary_filename
+        _save_summary_csv(
+            shap_values_for_summary=shap_values,
+            feature_names=feature_names,
+            save_path=summary_save_path
+        )
+
+    def _save_summary_csv(shap_values_for_summary: np.ndarray, feature_names: list[str], save_path: Path):
+        """Calculates and saves the SHAP summary data to a CSV file."""
+        mean_abs_shap = np.abs(shap_values_for_summary).mean(axis=0)
+
+        # Create default feature names if none are provided
+        current_feature_names = feature_names
+        if current_feature_names is None:
+            current_feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
+
+        summary_df = pd.DataFrame({
+            SHAPKeys.FEATURE_COLUMN: feature_names,
+            SHAPKeys.SHAP_VALUE_COLUMN: mean_abs_shap
+        }).sort_values(SHAPKeys.SHAP_VALUE_COLUMN, ascending=False)
+
+        summary_df.to_csv(save_path, index=False)
+        # print(f"📝 SHAP summary data saved as '{save_path.name}'")
+
+
     #START_O
 
     explainer = shap.TreeExplainer(model)
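The new `_save_summary_csv` helper writes a ranked mean-|SHAP| table next to the plots, using the names defined in `SHAPKeys`. A minimal sketch of consuming that output (the `results/` directory is hypothetical; the filename comes from `SHAPKeys.SAVENAME + ".csv"`, i.e. `shap_summary.csv`):

import pandas as pd

# Hypothetical output directory chosen by the caller of get_shap_values().
summary = pd.read_csv("results/shap_summary.csv")

# Columns follow SHAPKeys.FEATURE_COLUMN ("feature") and
# SHAPKeys.SHAP_VALUE_COLUMN ("mean_abs_shap_value"), already sorted descending.
print(summary.head(10).to_string(index=False))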
ml_tools/ensemble_inference.py
CHANGED

@@ -1,18 +1,17 @@
-from ._script_info import _script_info
-from ._logger import _LOGGER
-from .path_manager import make_fullpath, list_files_by_extension
-from .keys import EnsembleKeys
-
 from typing import Union, Literal, Dict, Any, Optional, List
 from pathlib import Path
 import json
-
 import joblib
 import numpy as np
 # Inference models
 import xgboost
 import lightgbm
 
+from ._script_info import _script_info
+from ._logger import _LOGGER
+from .path_manager import make_fullpath, list_files_by_extension
+from .keys import EnsembleKeys
+
 
 __all__ = [
     "InferenceHandler",
@@ -219,7 +218,7 @@ def model_report(
     return report_data
 
 
-# Local implementation to avoid calling utilities
+# Local implementation to avoid calling utilities dependencies
 def _deserialize_object(filepath: Union[str,Path], verbose: bool=True, raise_on_error: bool=True) -> Optional[Any]:
     """
     Loads a serialized object from a .joblib file.
ml_tools/ensemble_learning.py
CHANGED

@@ -13,7 +13,8 @@ import lightgbm as lgb
 from sklearn.model_selection import train_test_split
 from sklearn.base import clone
 
-from .utilities import yield_dataframes_from_dir,
+from .utilities import yield_dataframes_from_dir, train_dataset_yielder
+from .serde import serialize_object_filename
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from .keys import EnsembleKeys
@@ -410,7 +411,7 @@ def _save_model(trained_model, model_name: str, target_name:str, feature_names:
                EnsembleKeys.FEATURES: feature_names,
                EnsembleKeys.TARGET: target_name}
 
-
+    serialize_object_filename(obj=to_save, save_dir=save_directory, filename=filename, verbose=False, raise_on_error=True)
 
 
 # TRAIN EVALUATE PIPELINE
@@ -481,7 +482,7 @@ def train_test_pipeline(model, model_name: str, dataset_id: str, task: Literal["
 
 ###### 4. Execution ######
 def run_ensemble_pipeline(datasets_dir: Union[str,Path], save_dir: Union[str,Path], target_columns: list[str], model_object: Union[RegressionTreeModels, ClassificationTreeModels],
-                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=
+                          handle_classification_imbalance: HandleImbalanceStrategy=None, save_model: bool=True,
                           test_size: float=0.2, debug:bool=False, generate_learning_curves: bool = False):
     #Check models
     if isinstance(model_object, RegressionTreeModels):
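`_save_model` now persists its bundle through `serialize_object_filename` from the new `serde` module, and the matching loader in `ensemble_inference.py` (`_deserialize_object`) is joblib-based. A rough sketch of inspecting such a bundle by hand, with a hypothetical file path and assuming the installed package layout:

import joblib
from ml_tools.keys import EnsembleKeys

# Hypothetical path; _save_model decides the real directory and filename.
bundle = joblib.load("saved_models/xgboost_my_target.joblib")

# The saved dict carries the feature and target names under EnsembleKeys entries.
feature_names = bundle[EnsembleKeys.FEATURES]
target_name = bundle[EnsembleKeys.TARGET]
print(target_name, len(feature_names))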
ml_tools/handle_excel.py
CHANGED

@@ -2,6 +2,7 @@ from pathlib import Path
 from openpyxl import load_workbook, Workbook
 import pandas as pd
 from typing import List, Optional, Union
+
 from .path_manager import sanitize_filename, make_fullpath
 from ._script_info import _script_info
 from ._logger import _LOGGER
ml_tools/keys.py
CHANGED

@@ -36,6 +36,86 @@ class PyTorchInferenceKeys:
     # For classification tasks
     LABELS = "labels"
     PROBABILITIES = "probabilities"
+    LABEL_NAMES = "label_names"
+
+
+class PytorchModelArchitectureKeys:
+    """Keys for saving and loading model architecture."""
+    MODEL = 'model_class'
+    CONFIG = "config"
+    SAVENAME = "architecture"
+
+
+class PytorchArtifactPathKeys:
+    """Keys for model artifact paths."""
+    FEATURES_PATH = "feature_names_path"
+    TARGETS_PATH = "target_names_path"
+    ARCHITECTURE_PATH = "model_architecture_path"
+    WEIGHTS_PATH = "model_weights_path"
+    SCALER_PATH = "scaler_path"
+
+
+class DatasetKeys:
+    """Keys for saving dataset artifacts. Also used by FeatureSchema"""
+    FEATURE_NAMES = "feature_names"
+    TARGET_NAMES = "target_names"
+    SCALER_PREFIX = "scaler_"
+    # Feature Schema
+    CONTINUOUS_NAMES = "continuous_feature_names"
+    CATEGORICAL_NAMES = "categorical_feature_names"
+
+
+class SHAPKeys:
+    """Keys for SHAP functions"""
+    FEATURE_COLUMN = "feature"
+    SHAP_VALUE_COLUMN = "mean_abs_shap_value"
+    SAVENAME = "shap_summary"
+
+
+class PyTorchCheckpointKeys:
+    """Keys for saving/loading a training checkpoint dictionary."""
+    MODEL_STATE = "model_state_dict"
+    OPTIMIZER_STATE = "optimizer_state_dict"
+    SCHEDULER_STATE = "scheduler_state_dict"
+    EPOCH = "epoch"
+    BEST_SCORE = "best_score"
+
+
+class UtilityKeys:
+    """Keys used for utility modules"""
+    MODEL_PARAMS_FILE = "model_parameters"
+    TOTAL_PARAMS = "Total Parameters"
+    TRAINABLE_PARAMS = "Trainable Parameters"
+    PTH_FILE = "pth report "
+    MODEL_ARCHITECTURE_FILE = "model_architecture_summary"
+
+
+class VisionKeys:
+    """For vision ML metrics"""
+    SEGMENTATION_REPORT = "segmentation_report"
+    SEGMENTATION_HEATMAP = "segmentation_metrics_heatmap"
+    SEGMENTATION_CONFUSION_MATRIX = "segmentation_confusion_matrix"
+    # Object detection
+    OBJECT_DETECTION_REPORT = "object_detection_report"
+
+
+class VisionTransformRecipeKeys:
+    """Defines the key names for the transform recipe JSON file."""
+    TASK = "task"
+    PIPELINE = "pipeline"
+    NAME = "name"
+    KWARGS = "_kwargs"
+    PRE_TRANSFORMS = "pre_transforms"
+    RESIZE_SIZE = "resize_size"
+    CROP_SIZE = "crop_size"
+    MEAN = "mean"
+    STD = "std"
+
+
+class ObjectDetectionKeys:
+    """Used by the object detection dataset"""
+    BOXES = "boxes"
+    LABELS = "labels"
 
 
 class _OneHotOtherPlaceholder:
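Among the new key classes, `PyTorchCheckpointKeys` standardizes the layout of training checkpoints. A minimal sketch of a checkpoint assembled with those keys (illustrative only; the toolbox's trainer and callbacks own the actual save/load logic):

import torch
from torch import nn
from ml_tools.keys import PyTorchCheckpointKeys  # assumes the installed package layout

# Toy model and optimizer purely for illustration.
model = nn.Linear(4, 1)
optimizer = torch.optim.Adam(model.parameters())

checkpoint = {
    PyTorchCheckpointKeys.MODEL_STATE: model.state_dict(),
    PyTorchCheckpointKeys.OPTIMIZER_STATE: optimizer.state_dict(),
    PyTorchCheckpointKeys.EPOCH: 0,
    PyTorchCheckpointKeys.BEST_SCORE: float("inf"),
}
torch.save(checkpoint, "checkpoint.pth")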
ml_tools/math_utilities.py
ADDED (new file, 259 lines)

import pandas as pd
import numpy as np
import math
from typing import Union, Sequence, Optional

from ._script_info import _script_info
from ._logger import _LOGGER


__all__ = [
    "normalize_mixed_list",
    "threshold_binary_values",
    "threshold_binary_values_batch",
    "discretize_categorical_values",
]


def normalize_mixed_list(data: list, threshold: int = 2) -> list[float]:
    """
    Normalize a mixed list of numeric values and strings casted to floats so that the sum of the values equals 1.0,
    applying heuristic adjustments to correct for potential data entry scale mismatches.

    Parameters:
        data (list):
            A list of values that may include strings, floats, integers, or None.
            None values are treated as 0.0.

        threshold (int, optional):
            The number of log10 orders of magnitude below the median scale
            at which a value is considered suspect and is scaled upward accordingly.
            Default is 2.

    Returns:
        List[float]: A list of normalized float values summing to 1.0.

    Notes:
        - Zeros and None values remain zero.
        - Input strings are automatically cast to floats if possible.

    Example:
        >>> normalize_mixed_list([1, "0.01", 4, None])
        [0.2, 0.2, 0.6, 0.0]
    """
    # Step 1: Convert all values to float, treat None as 0.0
    float_list = [float(x) if x is not None else 0.0 for x in data]

    # Raise for negative values
    if any(x < 0 for x in float_list):
        _LOGGER.error("Negative values are not allowed in the input list.")
        raise ValueError()

    # Step 2: Compute log10 of non-zero values
    nonzero = [x for x in float_list if x > 0]
    if not nonzero:
        return [0.0 for _ in float_list]

    log_scales = [math.log10(x) for x in nonzero]
    log_median = np.median(log_scales)

    # Step 3: Adjust values that are much smaller than median
    adjusted = []
    for x in float_list:
        if x == 0.0:
            adjusted.append(0.0)
        else:
            log_x = math.log10(x)
            if log_median - log_x > threshold:
                scale_diff = round(log_median - log_x)
                adjusted.append(x * (10 ** scale_diff))
            else:
                adjusted.append(x)

    # Step 4: Normalize to sum to 1.0
    total = sum(adjusted)
    if total == 0:
        return [0.0 for _ in adjusted]

    return [x / total for x in adjusted]


def threshold_binary_values(
    input_array: Union[Sequence[float], np.ndarray, pd.Series],
    binary_values: Optional[int] = None
) -> Union[np.ndarray, pd.Series, list[float], tuple[float]]:
    """
    Thresholds binary features in a 1D input. The number of binary features are counted starting from the end.

    Binary elements are converted to 0 or 1 using a 0.5 threshold.

    Parameters:
        input_array: 1D sequence, NumPy array, or pandas Series.
        binary_values (Optional[int]) :
            - If `None`, all values are treated as binary.
            - If `int`, only this many last `binary_values` are thresholded.

    Returns:
        Any:
            Same type as input
    """
    original_type = type(input_array)

    if isinstance(input_array, (pd.Series, np.ndarray)):
        array = np.asarray(input_array)
    elif isinstance(input_array, (list, tuple)):
        array = np.array(input_array)
    else:
        _LOGGER.error("Unsupported input type")
        raise TypeError()

    array = array.flatten()
    total = array.shape[0]

    bin_count = total if binary_values is None else binary_values
    if not (0 <= bin_count <= total):
        _LOGGER.error("'binary_values' must be between 0 and the total number of elements")
        raise ValueError()

    if bin_count == 0:
        result = array
    else:
        cont_part = array[:-bin_count] if bin_count < total else np.array([])
        bin_part = (array[-bin_count:] > 0.5).astype(int)
        result = np.concatenate([cont_part, bin_part])

    if original_type is pd.Series:
        return pd.Series(result, index=input_array.index if hasattr(input_array, 'index') else None) # type: ignore
    elif original_type is list:
        return result.tolist()
    elif original_type is tuple:
        return tuple(result)
    else:
        return result


def threshold_binary_values_batch(
    input_array: np.ndarray,
    binary_values: int
) -> np.ndarray:
    """
    Threshold the last `binary_values` columns of a 2D NumPy array to binary {0,1} using 0.5 cutoff.

    Parameters
    ----------
    input_array : np.ndarray
        2D array with shape (batch_size, n_features).
    binary_values : int
        Number of binary features located at the END of each row.

    Returns
    -------
    np.ndarray
        Thresholded array, same shape as input.
    """
    if input_array.ndim != 2:
        _LOGGER.error(f"Expected 2D array, got {input_array.ndim}D array.")
        raise AssertionError()

    batch_size, total_features = input_array.shape

    if not (0 <= binary_values <= total_features):
        _LOGGER.error("'binary_values' out of valid range.")
        raise AssertionError()

    if binary_values == 0:
        return input_array.copy()

    cont_part = input_array[:, :-binary_values] if binary_values < total_features else np.empty((batch_size, 0))
    bin_part = input_array[:, -binary_values:] > 0.5
    bin_part = bin_part.astype(np.int32)

    return np.hstack([cont_part, bin_part])


def discretize_categorical_values(
    input_array: np.ndarray,
    categorical_info: dict[int, int],
    start_at_zero: bool = True
) -> np.ndarray:
    """
    Rounds specified columns of a 2D NumPy array to the nearest integer and
    clamps the result to a valid categorical range.

    If a 1D array is provided, it is treated as a single batch.

    Parameters
    ----------
    input_array : np.ndarray
        1D array (n_features,) or 2D array with shape (batch_size, n_features) containing continuous values.
    categorical_info : dict[int, int]
        A dictionary mapping column indices to their cardinality (number of categories).
        Example: {3: 4} means column 3 will be clamped to its 4 valid categories.
    start_at_zero : bool
        If True, categories range from 0 to k-1.
        If False, categories range from 1 to k.

    Returns
    -------
    np.ndarray
        A new array with the specified columns converted to integer categories.
        Shape matches the input array's original shape.
    """
    # --- Input Validation ---
    if not isinstance(input_array, np.ndarray):
        _LOGGER.error(f"Expected np.ndarray, got {type(input_array)}.")
        raise ValueError()

    if input_array.ndim == 1:
        # Reshape 1D array (n_features,) to 2D (1, n_features)
        working_array = input_array.reshape(1, -1)
        original_was_1d = True
    elif input_array.ndim == 2:
        working_array = input_array
        original_was_1d = False
    else:
        _LOGGER.error(f"Expected 1D or 2D array, got {input_array.ndim}D array.")
        raise ValueError()

    if not isinstance(categorical_info, dict) or not categorical_info:
        _LOGGER.error(f"'categorical_info' is not a dictionary, or is empty.")
        raise ValueError()

    _, total_features = working_array.shape
    for col_idx, cardinality in categorical_info.items():
        if not isinstance(col_idx, int):
            _LOGGER.error(f"Column index key {col_idx} is not an integer.")
            raise TypeError()
        if not (0 <= col_idx < total_features):
            _LOGGER.error(f"Column index {col_idx} is out of bounds for an array with {total_features} features.")
            raise ValueError()
        if not isinstance(cardinality, int) or cardinality < 2:
            _LOGGER.error(f"Cardinality for column {col_idx} must be an integer >= 2, but got {cardinality}.")
            raise ValueError()

    # --- Core Logic ---
    output_array = working_array.copy()

    for col_idx, cardinality in categorical_info.items():
        # 1. Round the column values using "round half up"
        rounded_col = np.floor(output_array[:, col_idx] + 0.5)

        # 2. Determine clamping bounds
        min_bound = 0 if start_at_zero else 1
        max_bound = cardinality - 1 if start_at_zero else cardinality

        # 3. Clamp the values and update the output array
        output_array[:, col_idx] = np.clip(rounded_col, min_bound, max_bound)

    final_output = output_array.astype(np.int32)

    # --- Output Shape Handling ---
    if original_was_1d:
        # Squeeze the batch dimension to return a 1D array
        return final_output.squeeze(axis=0)
    else:
        return final_output


def info():
    _script_info(__all__)