dragon-ml-toolbox 13.3.0__py3-none-any.whl → 14.7.0__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
ml_tools/ML_evaluation.py CHANGED
@@ -21,10 +21,10 @@ from pathlib import Path
21
21
  from typing import Union, Optional, List, Literal
22
22
  import warnings
23
23
 
24
- from .path_manager import make_fullpath
24
+ from .path_manager import make_fullpath, sanitize_filename
25
25
  from ._logger import _LOGGER
26
26
  from ._script_info import _script_info
27
- from .keys import SHAPKeys
27
+ from .keys import SHAPKeys, PyTorchLogKeys
28
28
 
29
29
 
30
30
  __all__ = [
@@ -35,6 +35,8 @@ __all__ = [
35
35
  "plot_attention_importance"
36
36
  ]
37
37
 
38
+ DPI_value = 250
39
+
38
40
 
39
41
  def plot_losses(history: dict, save_dir: Union[str, Path]):
40
42
  """
@@ -44,14 +46,14 @@ def plot_losses(history: dict, save_dir: Union[str, Path]):
44
46
  history (dict): A dictionary containing 'train_loss' and 'val_loss'.
45
47
  save_dir (str | Path): Directory to save the plot image.
46
48
  """
47
- train_loss = history.get('train_loss', [])
48
- val_loss = history.get('val_loss', [])
49
+ train_loss = history.get(PyTorchLogKeys.TRAIN_LOSS, [])
50
+ val_loss = history.get(PyTorchLogKeys.VAL_LOSS, [])
49
51
 
50
52
  if not train_loss and not val_loss:
51
- print("Warning: Loss history is empty or incomplete. Cannot plot.")
53
+ _LOGGER.warning("Loss history is empty or incomplete. Cannot plot.")
52
54
  return
53
55
 
54
- fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
56
+ fig, ax = plt.subplots(figsize=(10, 5), dpi=DPI_value)
55
57
 
56
58
  # Plot training loss only if data for it exists
57
59
  if train_loss:
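A quick usage sketch of the reworked plot_losses (not taken from the package): it assumes PyTorchLogKeys.TRAIN_LOSS and PyTorchLogKeys.VAL_LOSS resolve to the same strings the trainer uses when it builds the history dict (their concrete values live in ml_tools/keys.py and are not shown in this diff), and that the module is importable as ml_tools.ML_evaluation.

from ml_tools.keys import PyTorchLogKeys
from ml_tools.ML_evaluation import plot_losses

# Per-epoch losses keyed by the shared log-key constants (values here are made up).
history = {
    PyTorchLogKeys.TRAIN_LOSS: [0.92, 0.71, 0.55, 0.48],
    PyTorchLogKeys.VAL_LOSS:   [0.95, 0.78, 0.66, 0.61],
}
plot_losses(history, save_dir="outputs/metrics")  # figures are now rendered at dpi=DPI_value (250)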
@@ -78,8 +80,15 @@ def plot_losses(history: dict, save_dir: Union[str, Path]):
78
80
  plt.close(fig)
79
81
 
80
82
 
81
- def classification_metrics(save_dir: Union[str, Path], y_true: np.ndarray, y_pred: np.ndarray, y_prob: Optional[np.ndarray] = None,
82
- cmap: str = "Blues"):
83
+ def classification_metrics(save_dir: Union[str, Path],
84
+ y_true: np.ndarray,
85
+ y_pred: np.ndarray,
86
+ y_prob: Optional[np.ndarray] = None,
87
+ cmap: str = "Blues",
88
+ class_map: Optional[dict[str,int]]=None,
89
+ ROC_PR_line: str='darkorange',
90
+ calibration_bins: int=15,
91
+ font_size: int=16):
83
92
  """
84
93
  Saves classification metrics and plots.
85
94
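A hypothetical call exercising the new keyword arguments (the class names and probabilities below are invented; only the parameter names come from the signature above):

import numpy as np
from ml_tools.ML_evaluation import classification_metrics

y_true = np.array([0, 1, 2, 2, 0, 2])
y_prob = np.array([[0.80, 0.15, 0.05],
                   [0.10, 0.70, 0.20],
                   [0.05, 0.25, 0.70],
                   [0.30, 0.45, 0.25],
                   [0.60, 0.30, 0.10],
                   [0.20, 0.20, 0.60]])   # shape (n_samples, n_classes)
y_pred = y_prob.argmax(axis=1)

classification_metrics(
    save_dir="outputs/eval",
    y_true=y_true,
    y_pred=y_pred,
    y_prob=y_prob,
    class_map={"setosa": 0, "versicolor": 1, "virginica": 2},  # orders and labels the confusion matrix and per-class plots
    ROC_PR_line="crimson",        # colour of the ROC / PR / calibration curves
    calibration_bins=10,
    font_size=14,
)

With three classes, the function now emits one ROC, PR, and calibration figure per class (One-vs-Rest), suffixed with the sanitized class name.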
 
@@ -89,12 +98,31 @@ def classification_metrics(save_dir: Union[str, Path], y_true: np.ndarray, y_pre
89
98
  y_prob (np.ndarray, optional): Predicted probabilities for ROC curve.
90
99
  cmap (str): Colormap for the confusion matrix.
91
100
  save_dir (str | Path): Directory to save plots.
101
+ class_map (dict[str, int], None): A map of {class_name: index} used to order and label the confusion matrix.
92
102
  """
93
- print("--- Classification Report ---")
103
+ original_rc_params = plt.rcParams.copy()
104
+ plt.rcParams.update({'font.size': font_size})
105
+
106
+ # print("--- Classification Report ---")
107
+
108
+ # --- Parse class_map ---
109
+ map_labels = None
110
+ map_display_labels = None
111
+ if class_map:
112
+ # Sort the map by its values (the indices) to ensure correct order
113
+ try:
114
+ sorted_items = sorted(class_map.items(), key=lambda item: item[1])
115
+ map_labels = [item[1] for item in sorted_items]
116
+ map_display_labels = [item[0] for item in sorted_items]
117
+ except Exception as e:
118
+ _LOGGER.warning(f"Could not parse 'class_map': {e}")
119
+ map_labels = None
120
+ map_display_labels = None
121
+
94
122
  # Generate report as both text and dictionary
95
- report_text: str = classification_report(y_true, y_pred) # type: ignore
96
- report_dict: dict = classification_report(y_true, y_pred, output_dict=True) # type: ignore
97
- print(report_text)
123
+ report_text: str = classification_report(y_true, y_pred, labels=map_labels, target_names=map_display_labels) # type: ignore
124
+ report_dict: dict = classification_report(y_true, y_pred, output_dict=True, labels=map_labels, target_names=map_display_labels) # type: ignore
125
+ # print(report_text)
98
126
 
99
127
  save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
100
128
  # Save text report
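The parsing above only relies on sorting the mapping by its index values; a standalone illustration with made-up labels:

class_map = {"cat": 1, "dog": 0, "bird": 2}
sorted_items = sorted(class_map.items(), key=lambda item: item[1])
map_labels = [item[1] for item in sorted_items]          # [0, 1, 2] -> passed as labels=
map_display_labels = [item[0] for item in sorted_items]  # ['dog', 'cat', 'bird'] -> passed as target_names=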
@@ -104,8 +132,15 @@ def classification_metrics(save_dir: Union[str, Path], y_true: np.ndarray, y_pre
104
132
 
105
133
  # --- Save Classification Report Heatmap ---
106
134
  try:
107
- plt.figure(figsize=(8, 6), dpi=100)
108
- sns.heatmap(pd.DataFrame(report_dict).iloc[:-1, :].T, annot=True, cmap='viridis', fmt='.2f')
135
+ plt.figure(figsize=(8, 6), dpi=DPI_value)
136
+ sns.set_theme(font_scale=1.2) # Scale seaborn font
137
+ sns.heatmap(pd.DataFrame(report_dict).iloc[:-1, :].T,
138
+ annot=True,
139
+ cmap=cmap,
140
+ fmt='.2f',
141
+ vmin=0.0,
142
+ vmax=1.0)
143
+ sns.set_theme(font_scale=1.0) # Reset seaborn scale
109
144
  plt.title("Classification Report")
110
145
  plt.tight_layout()
111
146
  heatmap_path = save_dir_path / "classification_report_heatmap.svg"
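Pinning the heatmap to vmin=0.0 / vmax=1.0 keeps the colour scale comparable across runs, since every remaining cell of the report is a score in [0, 1] (the support row is dropped by iloc[:-1]). A standalone sketch of the same pattern using only scikit-learn, pandas, and seaborn:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

report = classification_report([0, 1, 1, 0, 1], [0, 1, 0, 0, 1], output_dict=True)
sns.heatmap(pd.DataFrame(report).iloc[:-1, :].T, annot=True, fmt=".2f", cmap="Blues", vmin=0.0, vmax=1.0)
plt.title("Classification Report")
plt.tight_layout()
plt.savefig("classification_report_heatmap.svg")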
@@ -114,69 +149,179 @@ def classification_metrics(save_dir: Union[str, Path], y_true: np.ndarray, y_pre
114
149
  plt.close()
115
150
  except Exception as e:
116
151
  _LOGGER.error(f"Could not generate classification report heatmap: {e}")
117
-
152
+
153
+ # --- labels for Confusion Matrix ---
154
+ plot_labels = map_labels
155
+ plot_display_labels = map_display_labels
156
+
118
157
  # Save Confusion Matrix
119
- fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
120
- ConfusionMatrixDisplay.from_predictions(y_true, y_pred, cmap=cmap, ax=ax_cm)
158
+ fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=DPI_value)
159
+ disp_ = ConfusionMatrixDisplay.from_predictions(y_true,
160
+ y_pred,
161
+ cmap=cmap,
162
+ ax=ax_cm,
163
+ normalize='true',
164
+ labels=plot_labels,
165
+ display_labels=plot_display_labels)
166
+
167
+ disp_.im_.set_clim(vmin=0.0, vmax=1.0)
168
+
169
+ # Turn off gridlines
170
+ ax_cm.grid(False)
171
+
172
+ # Manually update font size of cell texts
173
+ for text in ax_cm.texts:
174
+ text.set_fontsize(font_size)
175
+
176
+ fig_cm.tight_layout()
177
+
121
178
  ax_cm.set_title("Confusion Matrix")
122
179
  cm_path = save_dir_path / "confusion_matrix.svg"
123
180
  plt.savefig(cm_path)
124
181
  _LOGGER.info(f"❇️ Confusion matrix saved as '{cm_path.name}'")
125
182
  plt.close(fig_cm)
126
183
 
127
- # Plotting logic for ROC and PR Curves
128
- if y_prob is not None and y_prob.ndim > 1 and y_prob.shape[1] >= 2:
129
- # Use probabilities of the positive class
130
- y_score = y_prob[:, 1]
184
+
185
+ # Plotting logic for ROC, PR, and Calibration Curves
186
+ if y_prob is not None and y_prob.ndim == 2:
187
+ num_classes = y_prob.shape[1]
131
188
 
132
- # --- Save ROC Curve ---
133
- fpr, tpr, _ = roc_curve(y_true, y_score)
134
- auc = roc_auc_score(y_true, y_score)
135
- fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
136
- ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
137
- ax_roc.plot([0, 1], [0, 1], 'k--')
138
- ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
139
- ax_roc.set_xlabel('False Positive Rate')
140
- ax_roc.set_ylabel('True Positive Rate')
141
- ax_roc.legend(loc='lower right')
142
- ax_roc.grid(True)
143
- roc_path = save_dir_path / "roc_curve.svg"
144
- plt.savefig(roc_path)
145
- _LOGGER.info(f"📈 ROC curve saved as '{roc_path.name}'")
146
- plt.close(fig_roc)
147
-
148
- # --- Save Precision-Recall Curve ---
149
- precision, recall, _ = precision_recall_curve(y_true, y_score)
150
- ap_score = average_precision_score(y_true, y_score)
151
- fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=100)
152
- ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}')
153
- ax_pr.set_title('Precision-Recall Curve')
154
- ax_pr.set_xlabel('Recall')
155
- ax_pr.set_ylabel('Precision')
156
- ax_pr.legend(loc='lower left')
157
- ax_pr.grid(True)
158
- pr_path = save_dir_path / "pr_curve.svg"
159
- plt.savefig(pr_path)
160
- _LOGGER.info(f"📈 PR curve saved as '{pr_path.name}'")
161
- plt.close(fig_pr)
189
+ # --- Determine which classes to loop over ---
190
+ class_indices_to_plot = []
191
+ plot_titles = []
192
+ save_suffixes = []
193
+
194
+ if num_classes == 2:
195
+ # Binary case: Only plot for the positive class (index 1)
196
+ class_indices_to_plot = [1]
197
+ plot_titles = [""] # No extra title
198
+ save_suffixes = [""] # No extra suffix
199
+ _LOGGER.info("Generating binary classification plots (ROC, PR, Calibration).")
162
200
 
163
- # --- Save Calibration Plot ---
164
- if y_prob.ndim > 1 and y_prob.shape[1] >= 2:
165
- y_score = y_prob[:, 1] # Use probabilities of the positive class
201
+ elif num_classes > 2:
202
+ _LOGGER.info(f"Generating One-vs-Rest plots for {num_classes} classes.")
203
+ # Multiclass case: Plot for every class (One-vs-Rest)
204
+ class_indices_to_plot = list(range(num_classes))
205
+
206
+ # --- Use class_map names if available ---
207
+ use_generic_names = True
208
+ if map_display_labels and len(map_display_labels) == num_classes:
209
+ try:
210
+ # Ensure labels are safe for filenames
211
+ safe_names = [sanitize_filename(name) for name in map_display_labels]
212
+ plot_titles = [f" ({name} vs. Rest)" for name in map_display_labels]
213
+ save_suffixes = [f"_{safe_names[i]}" for i in class_indices_to_plot]
214
+ use_generic_names = False
215
+ except Exception as e:
216
+ _LOGGER.warning(f"Failed to use 'class_map' for plot titles: {e}. Reverting to generic names.")
217
+ use_generic_names = True
218
+
219
+ if use_generic_names:
220
+ plot_titles = [f" (Class {i} vs. Rest)" for i in class_indices_to_plot]
221
+ save_suffixes = [f"_class_{i}" for i in class_indices_to_plot]
222
+
223
+ else:
224
+ # Should not happen, but good to check
225
+ _LOGGER.warning(f"Probability array has invalid shape {y_prob.shape}. Skipping ROC/PR/Calibration plots.")
226
+
227
+ # --- Loop and generate plots ---
228
+ for i, class_index in enumerate(class_indices_to_plot):
229
+ plot_title = plot_titles[i]
230
+ save_suffix = save_suffixes[i]
231
+
232
+ # Get scores for the current class
233
+ y_score = y_prob[:, class_index]
234
+
235
+ # Binarize y_true for the current class
236
+ y_true_binary = (y_true == class_index).astype(int)
237
+
238
+ # --- Save ROC Curve ---
239
+ fpr, tpr, _ = roc_curve(y_true_binary, y_score)
240
+
241
+ # Calculate AUC.
242
+ # Note: For multiclass, roc_auc_score(y_true, y_prob, multi_class='ovr') could average, but plotting individual curves is more informative.
243
+ # Here we calculate the specific AUC for the binarized problem.
244
+ auc = roc_auc_score(y_true_binary, y_score)
166
245
 
167
- fig_cal, ax_cal = plt.subplots(figsize=(8, 8), dpi=100)
168
- CalibrationDisplay.from_predictions(y_true, y_score, n_bins=15, ax=ax_cal)
246
+ fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=DPI_value)
247
+ ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}', color=ROC_PR_line)
248
+ ax_roc.plot([0, 1], [0, 1], 'k--')
249
+ ax_roc.set_title(f'Receiver Operating Characteristic{plot_title}')
250
+ ax_roc.set_xlabel('False Positive Rate')
251
+ ax_roc.set_ylabel('True Positive Rate')
252
+ ax_roc.legend(loc='lower right')
253
+ ax_roc.grid(True)
254
+ roc_path = save_dir_path / f"roc_curve{save_suffix}.svg"
255
+ plt.savefig(roc_path)
256
+ plt.close(fig_roc)
257
+
258
+ # --- Save Precision-Recall Curve ---
259
+ precision, recall, _ = precision_recall_curve(y_true_binary, y_score)
260
+ ap_score = average_precision_score(y_true_binary, y_score)
261
+ fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=DPI_value)
262
+ ax_pr.plot(recall, precision, label=f'Avg Precision = {ap_score:.2f}', color=ROC_PR_line)
263
+ ax_pr.set_title(f'Precision-Recall Curve{plot_title}')
264
+ ax_pr.set_xlabel('Recall')
265
+ ax_pr.set_ylabel('Precision')
266
+ ax_pr.legend(loc='lower left')
267
+ ax_pr.grid(True)
268
+ pr_path = save_dir_path / f"pr_curve{save_suffix}.svg"
269
+ plt.savefig(pr_path)
270
+ plt.close(fig_pr)
169
271
 
170
- ax_cal.set_title('Reliability Curve')
272
+ # --- Save Calibration Plot ---
273
+ fig_cal, ax_cal = plt.subplots(figsize=(8, 8), dpi=DPI_value)
274
+
275
+ # --- Step 1: Get binned data *without* plotting ---
276
+ with plt.ioff(): # Suppress showing the temporary plot
277
+ fig_temp, ax_temp = plt.subplots()
278
+ cal_display_temp = CalibrationDisplay.from_predictions(
279
+ y_true_binary, # Use binarized labels
280
+ y_score,
281
+ n_bins=calibration_bins,
282
+ ax=ax_temp,
283
+ name="temp" # Add a name to suppress potential warnings
284
+ )
285
+ # Get the x, y coordinates of the binned data
286
+ line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
287
+ plt.close(fig_temp) # Close the temporary plot
288
+
289
+ # --- Step 2: Build the plot from scratch ---
290
+ ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
291
+
292
+ sns.regplot(
293
+ x=line_x,
294
+ y=line_y,
295
+ ax=ax_cal,
296
+ scatter=False,
297
+ label=f"Calibration Curve ({calibration_bins} bins)",
298
+ line_kws={
299
+ 'color': ROC_PR_line,
300
+ 'linestyle': '--',
301
+ 'linewidth': 2,
302
+ }
303
+ )
304
+
305
+ ax_cal.set_title(f'Reliability Curve{plot_title}')
171
306
  ax_cal.set_xlabel('Mean Predicted Probability')
172
307
  ax_cal.set_ylabel('Fraction of Positives')
308
+
309
+ # --- Step 3: Set final limits *after* plotting ---
310
+ ax_cal.set_ylim(0.0, 1.0)
311
+ ax_cal.set_xlim(0.0, 1.0)
312
+
313
+ ax_cal.legend(loc='lower right')
173
314
  ax_cal.grid(True)
174
315
  plt.tight_layout()
175
316
 
176
- cal_path = save_dir_path / "calibration_plot.svg"
317
+ cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
177
318
  plt.savefig(cal_path)
178
- _LOGGER.info(f"📈 Calibration plot saved as '{cal_path.name}'")
179
319
  plt.close(fig_cal)
320
+
321
+ _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")
322
+
323
+ # restore RC params
324
+ plt.rcParams.update(original_rc_params)
180
325
 
181
326
 
182
327
  def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[str, Path]):
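The loop above reduces the multiclass problem to one binary problem per class; a standalone sketch of that One-vs-Rest reduction with scikit-learn (illustrative data only, class index 2 of a 3-class problem):

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

y_true = np.array([0, 2, 1, 2, 0, 1])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.2, 0.7],
                   [0.2, 0.6, 0.2],
                   [0.2, 0.3, 0.5],
                   [0.8, 0.1, 0.1],
                   [0.3, 0.5, 0.2]])

class_index = 2
y_score = y_prob[:, class_index]                      # probability of "class 2"
y_true_binary = (y_true == class_index).astype(int)   # 1 = class 2, 0 = everything else

print(roc_auc_score(y_true_binary, y_score))            # per-class AUC, as plotted above
print(average_precision_score(y_true_binary, y_score))  # per-class AP, as plotted above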
@@ -211,7 +356,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
211
356
 
212
357
  # Save residual plot
213
358
  residuals = y_true - y_pred
214
- fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
359
+ fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=DPI_value)
215
360
  ax_res.scatter(y_pred, residuals, alpha=0.6)
216
361
  ax_res.axhline(0, color='red', linestyle='--')
217
362
  ax_res.set_xlabel("Predicted Values")
@@ -225,7 +370,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
225
370
  plt.close(fig_res)
226
371
 
227
372
  # Save true vs predicted plot
228
- fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
373
+ fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=DPI_value)
229
374
  ax_tvp.scatter(y_true, y_pred, alpha=0.6)
230
375
  ax_tvp.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'k--', lw=2)
231
376
  ax_tvp.set_xlabel('True Values')
@@ -239,7 +384,7 @@ def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, save_dir: Union[s
239
384
  plt.close(fig_tvp)
240
385
 
241
386
  # Save Histogram of Residuals
242
- fig_hist, ax_hist = plt.subplots(figsize=(8, 6), dpi=100)
387
+ fig_hist, ax_hist = plt.subplots(figsize=(8, 6), dpi=DPI_value)
243
388
  sns.histplot(residuals, kde=True, ax=ax_hist)
244
389
  ax_hist.set_xlabel("Residual Value")
245
390
  ax_hist.set_ylabel("Frequency")
@@ -258,7 +403,7 @@ def shap_summary_plot(model,
258
403
  feature_names: Optional[list[str]],
259
404
  save_dir: Union[str, Path],
260
405
  device: torch.device = torch.device('cpu'),
261
- explainer_type: Literal['deep', 'kernel'] = 'deep'):
406
+ explainer_type: Literal['deep', 'kernel'] = 'kernel'):
262
407
  """
263
408
  Calculates SHAP values and saves summary plots and data.
264
409
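The default explainer changes from 'deep' to 'kernel' here. A hypothetical call that keeps the previous DeepExplainer behaviour; the toy model and random tensors are placeholders, and the keyword names background_data / instances_to_explain are taken from the variables used in the function body shown in this diff:

import torch
from ml_tools.ML_evaluation import shap_summary_plot

model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))
shap_summary_plot(
    model=model,
    background_data=torch.randn(64, 4),
    instances_to_explain=torch.randn(16, 4),
    feature_names=["f0", "f1", "f2", "f3"],
    save_dir="outputs/shap",
    explainer_type="deep",   # pass explicitly; 'kernel' is now the default
)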
 
@@ -270,13 +415,13 @@ def shap_summary_plot(model,
270
415
  save_dir (str | Path): Directory to save SHAP artifacts.
271
416
  device (torch.device): The torch device for SHAP calculations.
272
417
  explainer_type (Literal['deep', 'kernel']): The explainer to use.
273
- - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient for
418
+ - 'deep': Uses shap.DeepExplainer. Fast and efficient for
274
419
  PyTorch models.
275
420
  - 'kernel': Uses shap.KernelExplainer. Model-agnostic but EXTREMELY
276
421
  slow and memory-intensive.
277
422
  """
278
423
 
279
- print(f"\n--- SHAP Value Explanation Using {explainer_type.upper()} Explainer ---")
424
+ _LOGGER.info(f"📊 Running SHAP Value Explanation Using {explainer_type.upper()} Explainer")
280
425
 
281
426
  model.eval()
282
427
  # model.cpu() # Run explanations on CPU
@@ -285,7 +430,7 @@ def shap_summary_plot(model,
285
430
  instances_to_explain_np = None
286
431
 
287
432
  if explainer_type == 'deep':
288
- # --- 1. Use DeepExplainer (Preferred) ---
433
+ # --- 1. Use DeepExplainer ---
289
434
 
290
435
  # Ensure data is torch.Tensor
291
436
  if isinstance(background_data, np.ndarray):
@@ -309,10 +454,9 @@ def shap_summary_plot(model,
309
454
  instances_to_explain_np = instances_to_explain.cpu().numpy()
310
455
 
311
456
  elif explainer_type == 'kernel':
312
- # --- 2. Use KernelExplainer (Slow Fallback) ---
457
+ # --- 2. Use KernelExplainer ---
313
458
  _LOGGER.warning(
314
- "Using KernelExplainer. This is memory-intensive and slow. "
315
- "Consider reducing 'n_samples' if the process terminates unexpectedly."
459
+ "KernelExplainer is memory-intensive and slow. Consider reducing the number of instances to explain if the process terminates unexpectedly."
316
460
  )
317
461
 
318
462
  # Ensure data is np.ndarray
@@ -348,14 +492,26 @@ def shap_summary_plot(model,
348
492
  else:
349
493
  _LOGGER.error(f"Invalid explainer_type: '{explainer_type}'. Must be 'deep' or 'kernel'.")
350
494
  raise ValueError()
495
+
496
+ if not isinstance(shap_values, list) and shap_values.ndim == 3 and shap_values.shape[2] == 1: # type: ignore
497
+ # _LOGGER.info("Squeezing SHAP values from (N, F, 1) to (N, F) for regression plot.")
498
+ shap_values = shap_values.squeeze(-1) # type: ignore
351
499
 
352
500
  # --- 3. Plotting and Saving ---
353
501
  save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
354
502
  plt.ioff()
355
503
 
504
+ # Convert instances to a DataFrame. robust way to ensure SHAP correctly maps values to feature names.
505
+ if feature_names is None:
506
+ # Create generic names if none were provided
507
+ num_features = instances_to_explain_np.shape[1]
508
+ feature_names = [f'feature_{i}' for i in range(num_features)]
509
+
510
+ instances_df = pd.DataFrame(instances_to_explain_np, columns=feature_names)
511
+
356
512
  # Save Bar Plot
357
513
  bar_path = save_dir_path / "shap_bar_plot.svg"
358
- shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="bar", show=False)
514
+ shap.summary_plot(shap_values, instances_df, plot_type="bar", show=False)
359
515
  ax = plt.gca()
360
516
  ax.set_xlabel("SHAP Value Impact", labelpad=10)
361
517
  plt.title("SHAP Feature Importance")
@@ -366,7 +522,7 @@ def shap_summary_plot(model,
366
522
 
367
523
  # Save Dot Plot
368
524
  dot_path = save_dir_path / "shap_dot_plot.svg"
369
- shap.summary_plot(shap_values, instances_to_explain_np, feature_names=feature_names, plot_type="dot", show=False)
525
+ shap.summary_plot(shap_values, instances_df, plot_type="dot", show=False)
370
526
  ax = plt.gca()
371
527
  ax.set_xlabel("SHAP Value Impact", labelpad=10)
372
528
  if plt.gcf().axes and len(plt.gcf().axes) > 1:
@@ -389,9 +545,6 @@ def shap_summary_plot(model,
389
545
  mean_abs_shap = np.abs(shap_values).mean(axis=0)
390
546
 
391
547
  mean_abs_shap = mean_abs_shap.flatten()
392
-
393
- if feature_names is None:
394
- feature_names = [f'feature_{i}' for i in range(len(mean_abs_shap))]
395
548
 
396
549
  summary_df = pd.DataFrame({
397
550
  SHAPKeys.FEATURE_COLUMN: feature_names,
@@ -401,7 +554,7 @@ def shap_summary_plot(model,
401
554
  summary_df.to_csv(summary_path, index=False)
402
555
 
403
556
  _LOGGER.info(f"📝 SHAP summary data saved as '{summary_path.name}'")
404
- plt.ion()
557
+ plt.ion()
405
558
 
406
559
 
407
560
  def plot_attention_importance(weights: List[torch.Tensor], feature_names: Optional[List[str]], save_dir: Union[str, Path], top_n: int = 10):
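A sketch of the summary CSV assembled across the two hunks above (random numbers; the real column names come from SHAPKeys in ml_tools/keys.py, so the literal strings below are assumptions):

import numpy as np
import pandas as pd

shap_values = np.random.rand(16, 4)                       # already squeezed to (N, F)
mean_abs_shap = np.abs(shap_values).mean(axis=0).flatten()
summary_df = pd.DataFrame({
    "feature": [f"feature_{i}" for i in range(4)],        # SHAPKeys.FEATURE_COLUMN in the package
    "mean_abs_shap": mean_abs_shap,                       # second SHAPKeys column (name not shown in this diff)
})
summary_df.to_csv("shap_summary.csv", index=False)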
@@ -447,7 +600,7 @@ def plot_attention_importance(weights: List[torch.Tensor], feature_names: Option
447
600
  # --- Step 3: Create and save the plot for top N features ---
448
601
  plot_df = summary_df.head(top_n).sort_values('mean_attention', ascending=True)
449
602
 
450
- plt.figure(figsize=(10, 8), dpi=100)
603
+ plt.figure(figsize=(10, 8), dpi=DPI_value)
451
604
 
452
605
  # Create horizontal bar plot with error bars
453
606
  plt.barh(
@@ -34,6 +34,8 @@ __all__ = [
34
34
  "multi_target_shap_summary_plot",
35
35
  ]
36
36
 
37
+ DPI_value = 250
38
+
37
39
 
38
40
  def multi_target_regression_metrics(
39
41
  y_true: np.ndarray,
@@ -90,7 +92,7 @@ def multi_target_regression_metrics(
90
92
 
91
93
  # --- Save Residual Plot ---
92
94
  residuals = true_i - pred_i
93
- fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
95
+ fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=DPI_value)
94
96
  ax_res.scatter(pred_i, residuals, alpha=0.6, edgecolors='k', s=50)
95
97
  ax_res.axhline(0, color='red', linestyle='--')
96
98
  ax_res.set_xlabel("Predicted Values")
@@ -103,7 +105,7 @@ def multi_target_regression_metrics(
103
105
  plt.close(fig_res)
104
106
 
105
107
  # --- Save True vs. Predicted Plot ---
106
- fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
108
+ fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=DPI_value)
107
109
  ax_tvp.scatter(true_i, pred_i, alpha=0.6, edgecolors='k', s=50)
108
110
  ax_tvp.plot([true_i.min(), true_i.max()], [true_i.min(), true_i.max()], 'k--', lw=2)
109
111
  ax_tvp.set_xlabel('True Values')
@@ -127,7 +129,10 @@ def multi_label_classification_metrics(
127
129
  y_prob: np.ndarray,
128
130
  target_names: List[str],
129
131
  save_dir: Union[str, Path],
130
- threshold: float = 0.5
132
+ threshold: float = 0.5,
133
+ ROC_PR_line: str='darkorange',
134
+ cmap: str = "Blues",
135
+ font_size: int = 16
131
136
  ):
132
137
  """
133
138
  Calculates and saves classification metrics for each label individually.
@@ -158,6 +163,10 @@ def multi_label_classification_metrics(
158
163
 
159
164
  # Generate binary predictions from probabilities
160
165
  y_pred = (y_prob >= threshold).astype(int)
166
+
167
+ # --- Save current RC params and update font size ---
168
+ original_rc_params = plt.rcParams.copy()
169
+ plt.rcParams.update({'font.size': font_size})
161
170
 
162
171
  _LOGGER.info("--- Multi-Label Classification Evaluation ---")
163
172
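A standalone sketch of the two additions above: per-label predictions come from thresholding the probability matrix, and the font size is applied by saving, updating, and later restoring matplotlib's rcParams (random data, three labels):

import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
y_prob = rng.random((8, 3))               # (n_samples, n_labels) probabilities
y_pred = (y_prob >= 0.5).astype(int)      # per-label binary predictions

original_rc_params = plt.rcParams.copy()  # save ...
plt.rcParams.update({"font.size": 16})    # ... enlarge fonts for the per-label plots ...
# ... per-label plotting happens here ...
plt.rcParams.update(original_rc_params)   # ... and restore afterwards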
 
@@ -174,7 +183,7 @@ def multi_label_classification_metrics(
174
183
  f"Jaccard Score (macro): {j_score_macro:.4f}\n"
175
184
  f"--------------------------------------------------\n"
176
185
  )
177
- print(overall_report)
186
+ # print(overall_report)
178
187
  overall_report_path = save_dir_path / "classification_report_overall.txt"
179
188
  overall_report_path.write_text(overall_report)
180
189
 
@@ -192,8 +201,26 @@ def multi_label_classification_metrics(
192
201
  report_path.write_text(report_text) # type: ignore
193
202
 
194
203
  # --- Save Confusion Matrix ---
195
- fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
196
- ConfusionMatrixDisplay.from_predictions(true_i, pred_i, cmap="Blues", ax=ax_cm)
204
+ fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=DPI_value)
205
+ disp_ = ConfusionMatrixDisplay.from_predictions(true_i,
206
+ pred_i,
207
+ cmap=cmap,
208
+ ax=ax_cm,
209
+ normalize='true',
210
+ labels=[0, 1],
211
+ display_labels=["Negative", "Positive"])
212
+
213
+ disp_.im_.set_clim(vmin=0.0, vmax=1.0)
214
+
215
+ # Turn off gridlines
216
+ ax_cm.grid(False)
217
+
218
+ # Manually update font size of cell texts
219
+ for text in ax_cm.texts:
220
+ text.set_fontsize(font_size)
221
+
222
+ fig_cm.tight_layout()
223
+
197
224
  ax_cm.set_title(f"Confusion Matrix for '{name}'")
198
225
  cm_path = save_dir_path / f"confusion_matrix_{sanitized_name}.svg"
199
226
  plt.savefig(cm_path)
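A standalone sketch of the per-label confusion matrix configured above: normalize='true' makes each row sum to 1, so the colour range can be clamped to [0, 1] and matrices stay comparable across labels (toy data):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

true_i = np.array([0, 1, 1, 0, 1, 0, 1, 1])
pred_i = np.array([0, 1, 0, 0, 1, 1, 1, 1])

fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay.from_predictions(
    true_i, pred_i,
    normalize="true",
    labels=[0, 1],
    display_labels=["Negative", "Positive"],
    cmap="Blues",
    ax=ax,
)
disp.im_.set_clim(vmin=0.0, vmax=1.0)   # fix the colour range across labels
ax.grid(False)
fig.savefig("confusion_matrix_example.svg")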
@@ -202,8 +229,8 @@ def multi_label_classification_metrics(
202
229
  # --- Save ROC Curve ---
203
230
  fpr, tpr, _ = roc_curve(true_i, prob_i)
204
231
  auc = roc_auc_score(true_i, prob_i)
205
- fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
206
- ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
232
+ fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=DPI_value)
233
+ ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}', color=ROC_PR_line)
207
234
  ax_roc.plot([0, 1], [0, 1], 'k--')
208
235
  ax_roc.set_title(f'ROC Curve for "{name}"')
209
236
  ax_roc.set_xlabel('False Positive Rate'); ax_roc.set_ylabel('True Positive Rate')
@@ -215,14 +242,17 @@ def multi_label_classification_metrics(
215
242
  # --- Save Precision-Recall Curve ---
216
243
  precision, recall, _ = precision_recall_curve(true_i, prob_i)
217
244
  ap_score = average_precision_score(true_i, prob_i)
218
- fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=100)
219
- ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}')
245
+ fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=DPI_value)
246
+ ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}', color=ROC_PR_line)
220
247
  ax_pr.set_title(f'Precision-Recall Curve for "{name}"')
221
248
  ax_pr.set_xlabel('Recall'); ax_pr.set_ylabel('Precision')
222
249
  ax_pr.legend(loc='lower left'); ax_pr.grid(True, linestyle='--', alpha=0.6)
223
250
  pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
224
251
  plt.savefig(pr_path)
225
252
  plt.close(fig_pr)
253
+
254
+ # restore RC params
255
+ plt.rcParams.update(original_rc_params)
226
256
 
227
257
  _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
228
258
 
@@ -235,7 +265,7 @@ def multi_target_shap_summary_plot(
235
265
  target_names: List[str],
236
266
  save_dir: Union[str, Path],
237
267
  device: torch.device = torch.device('cpu'),
238
- explainer_type: Literal['deep', 'kernel'] = 'deep'
268
+ explainer_type: Literal['deep', 'kernel'] = 'kernel'
239
269
  ):
240
270
  """
241
271
  Calculates SHAP values for a multi-target model and saves summary plots and data for each target.
@@ -249,7 +279,7 @@ def multi_target_shap_summary_plot(
249
279
  save_dir (str | Path): Directory to save SHAP artifacts.
250
280
  device (torch.device): The torch device for SHAP calculations.
251
281
  explainer_type (Literal['deep', 'kernel']): The explainer to use.
252
- - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient.
282
+ - 'deep': Uses shap.DeepExplainer. Fast and efficient.
253
283
  - 'kernel': Uses shap.KernelExplainer. Model-agnostic but slow and memory-intensive.
254
284
  """
255
285
  _LOGGER.info(f"--- Multi-Target SHAP Value Explanation (Using: {explainer_type.upper()}Explainer) ---")
@@ -260,7 +290,7 @@ def multi_target_shap_summary_plot(
260
290
  instances_to_explain_np = None
261
291
 
262
292
  if explainer_type == 'deep':
263
- # --- 1. Use DeepExplainer (Preferred) ---
293
+ # --- 1. Use DeepExplainer ---
264
294
 
265
295
  # Ensure data is torch.Tensor
266
296
  if isinstance(background_data, np.ndarray):
@@ -285,10 +315,9 @@ def multi_target_shap_summary_plot(
285
315
  instances_to_explain_np = instances_to_explain.cpu().numpy()
286
316
 
287
317
  elif explainer_type == 'kernel':
288
- # --- 2. Use KernelExplainer (Slow Fallback) ---
318
+ # --- 2. Use KernelExplainer ---
289
319
  _LOGGER.warning(
290
- "Using KernelExplainer. This is memory-intensive and slow. "
291
- "Consider reducing 'n_samples' if the process terminates."
320
+ "KernelExplainer is memory-intensive and slow. Consider reducing the number of instances to explain if the process terminates unexpectedly."
292
321
  )
293
322
 
294
323
  # Convert all data to numpy
ml_tools/ML_inference.py CHANGED
@@ -82,7 +82,6 @@ class _BaseInferenceHandler(ABC):
82
82
  _LOGGER.warning("CUDA not available, switching to CPU.")
83
83
  device_lower = "cpu"
84
84
  elif device_lower == "mps" and not torch.backends.mps.is_available():
85
- # Your M-series Mac will appreciate this check!
86
85
  _LOGGER.warning("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
87
86
  device_lower = "cpu"
88
87
  return torch.device(device_lower)
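The removed line here is only a comment; the surrounding validation logic is unchanged. For reference, a minimal standalone sketch of the same CUDA/MPS fallback (the function name resolve_device is hypothetical, not the handler's actual method, and plain print stands in for the package logger):

import torch

def resolve_device(device: str) -> torch.device:
    device_lower = device.lower()
    if device_lower == "cuda" and not torch.cuda.is_available():
        print("CUDA not available, switching to CPU.")
        device_lower = "cpu"
    elif device_lower == "mps" and not torch.backends.mps.is_available():
        print("Apple Metal Performance Shaders (MPS) not available, switching to CPU.")
        device_lower = "cpu"
    return torch.device(device_lower)

print(resolve_device("mps"))  # falls back to cpu on machines without MPS support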