dragon-ml-toolbox 13.3.0__py3-none-any.whl → 16.2.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
Files changed (48)
  1. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/METADATA +20 -6
  2. dragon_ml_toolbox-16.2.0.dist-info/RECORD +51 -0
  3. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +10 -0
  4. ml_tools/ETL_cleaning.py +20 -20
  5. ml_tools/ETL_engineering.py +23 -25
  6. ml_tools/GUI_tools.py +20 -20
  7. ml_tools/MICE_imputation.py +207 -5
  8. ml_tools/ML_callbacks.py +43 -26
  9. ml_tools/ML_configuration.py +788 -0
  10. ml_tools/ML_datasetmaster.py +303 -448
  11. ml_tools/ML_evaluation.py +351 -93
  12. ml_tools/ML_evaluation_multi.py +139 -42
  13. ml_tools/ML_inference.py +290 -209
  14. ml_tools/ML_models.py +33 -106
  15. ml_tools/ML_models_advanced.py +323 -0
  16. ml_tools/ML_optimization.py +12 -12
  17. ml_tools/ML_scaler.py +11 -11
  18. ml_tools/ML_sequence_datasetmaster.py +341 -0
  19. ml_tools/ML_sequence_evaluation.py +219 -0
  20. ml_tools/ML_sequence_inference.py +391 -0
  21. ml_tools/ML_sequence_models.py +139 -0
  22. ml_tools/ML_trainer.py +1604 -179
  23. ml_tools/ML_utilities.py +351 -4
  24. ml_tools/ML_vision_datasetmaster.py +1540 -0
  25. ml_tools/ML_vision_evaluation.py +284 -0
  26. ml_tools/ML_vision_inference.py +405 -0
  27. ml_tools/ML_vision_models.py +641 -0
  28. ml_tools/ML_vision_transformers.py +284 -0
  29. ml_tools/PSO_optimization.py +6 -6
  30. ml_tools/SQL.py +4 -4
  31. ml_tools/_keys.py +171 -0
  32. ml_tools/_schema.py +1 -1
  33. ml_tools/custom_logger.py +37 -14
  34. ml_tools/data_exploration.py +502 -93
  35. ml_tools/ensemble_evaluation.py +54 -11
  36. ml_tools/ensemble_inference.py +7 -33
  37. ml_tools/ensemble_learning.py +1 -1
  38. ml_tools/math_utilities.py +1 -1
  39. ml_tools/optimization_tools.py +2 -2
  40. ml_tools/path_manager.py +5 -5
  41. ml_tools/serde.py +2 -2
  42. ml_tools/utilities.py +192 -4
  43. dragon_ml_toolbox-13.3.0.dist-info/RECORD +0 -41
  44. ml_tools/RNN_forecast.py +0 -56
  45. ml_tools/keys.py +0 -87
  46. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/WHEEL +0 -0
  47. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/licenses/LICENSE +0 -0
  48. {dragon_ml_toolbox-13.3.0.dist-info → dragon_ml_toolbox-16.2.0.dist-info}/top_level.txt +0 -0
ml_tools/ML_evaluation_multi.py
@@ -19,13 +19,17 @@ from sklearn.metrics import (
     jaccard_score
 )
 from pathlib import Path
-from typing import Union, List, Literal
+from typing import Union, List, Literal, Optional
 import warnings
 
 from .path_manager import make_fullpath, sanitize_filename
 from ._logger import _LOGGER
 from ._script_info import _script_info
-from .keys import SHAPKeys
+from ._keys import SHAPKeys
+from .ML_configuration import (MultiTargetRegressionMetricsFormat,
+                               _BaseRegressionFormat,
+                               MultiLabelBinaryClassificationMetricsFormat,
+                               _BaseMultiLabelFormat)
 
 
 __all__ = [
@@ -34,12 +38,15 @@ __all__ = [
     "multi_target_shap_summary_plot",
 ]
 
+DPI_value = 250
+
 
 def multi_target_regression_metrics(
     y_true: np.ndarray,
     y_pred: np.ndarray,
     target_names: List[str],
-    save_dir: Union[str, Path]
+    save_dir: Union[str, Path],
+    config: Optional[MultiTargetRegressionMetricsFormat] = None
 ):
     """
     Calculates and saves regression metrics for each target individually.
@@ -53,6 +60,7 @@ def multi_target_regression_metrics(
         y_pred (np.ndarray): Predicted values, shape (n_samples, n_targets).
         target_names (List[str]): A list of names for the target variables.
         save_dir (str | Path): Directory to save plots and the report.
+        config (object): Formatting configuration object.
     """
     if y_true.ndim != 2 or y_pred.ndim != 2:
         _LOGGER.error("y_true and y_pred must be 2D arrays for multi-target regression.")
@@ -66,8 +74,19 @@ def multi_target_regression_metrics(
 
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
     metrics_summary = []
+
+    # --- Parse Config or use defaults ---
+    if config is None:
+        # Create a default config if one wasn't provided
+        format_config = _BaseRegressionFormat()
+    else:
+        format_config = config
+
+    # --- Set Matplotlib font size ---
+    original_rc_params = plt.rcParams.copy()
+    plt.rcParams.update({'font.size': format_config.font_size})
 
-    _LOGGER.info("--- Multi-Target Regression Evaluation ---")
+    _LOGGER.debug("--- Multi-Target Regression Evaluation ---")
 
     for i, name in enumerate(target_names):
         print(f" -> Evaluating target: '{name}'")
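The hunk above keeps the new config argument fully optional: when it is omitted, a default _BaseRegressionFormat is instantiated, so existing call sites continue to work unchanged. Below is a minimal, hedged sketch of how a caller might use the new keyword; the constructor arguments of MultiTargetRegressionMetricsFormat are assumptions inferred from the attributes the function body reads (the class itself lives in the new ML_configuration.py, which is not shown in this excerpt).

    import numpy as np
    from ml_tools.ML_configuration import MultiTargetRegressionMetricsFormat
    from ml_tools.ML_evaluation_multi import multi_target_regression_metrics

    # Toy predictions for two targets, shape (n_samples, n_targets).
    y_true = np.random.rand(100, 2)
    y_pred = y_true + np.random.normal(scale=0.1, size=y_true.shape)

    # Hypothetical keyword arguments; check ML_configuration.py for the real fields.
    fmt = MultiTargetRegressionMetricsFormat(font_size=12, scatter_color="steelblue")

    multi_target_regression_metrics(
        y_true=y_true,
        y_pred=y_pred,
        target_names=["target_a", "target_b"],
        save_dir="reports/regression_multi",
        config=fmt,  # omit to fall back to the library defaults
    )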
@@ -90,9 +109,13 @@ def multi_target_regression_metrics(
 
         # --- Save Residual Plot ---
         residuals = true_i - pred_i
-        fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=100)
-        ax_res.scatter(pred_i, residuals, alpha=0.6, edgecolors='k', s=50)
-        ax_res.axhline(0, color='red', linestyle='--')
+        fig_res, ax_res = plt.subplots(figsize=(8, 6), dpi=DPI_value)
+        ax_res.scatter(pred_i, residuals,
+                       alpha=format_config.scatter_alpha,
+                       edgecolors='k',
+                       s=50,
+                       color=format_config.scatter_color)  # Use config color
+        ax_res.axhline(0, color=format_config.residual_line_color, linestyle='--')  # Use config color
         ax_res.set_xlabel("Predicted Values")
         ax_res.set_ylabel("Residuals (True - Predicted)")
         ax_res.set_title(f"Residual Plot for '{name}'")
@@ -103,9 +126,16 @@ def multi_target_regression_metrics(
         plt.close(fig_res)
 
         # --- Save True vs. Predicted Plot ---
-        fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=100)
-        ax_tvp.scatter(true_i, pred_i, alpha=0.6, edgecolors='k', s=50)
-        ax_tvp.plot([true_i.min(), true_i.max()], [true_i.min(), true_i.max()], 'k--', lw=2)
+        fig_tvp, ax_tvp = plt.subplots(figsize=(8, 6), dpi=DPI_value)
+        ax_tvp.scatter(true_i, pred_i,
+                       alpha=format_config.scatter_alpha,
+                       edgecolors='k',
+                       s=50,
+                       color=format_config.scatter_color)  # Use config color
+        ax_tvp.plot([true_i.min(), true_i.max()], [true_i.min(), true_i.max()],
+                    linestyle='--',
+                    lw=2,
+                    color=format_config.ideal_line_color)  # Use config color
         ax_tvp.set_xlabel('True Values')
         ax_tvp.set_ylabel('Predicted Values')
         ax_tvp.set_title(f'True vs. Predicted Values for "{name}"')
@@ -120,14 +150,18 @@ def multi_target_regression_metrics(
     report_path = save_dir_path / "regression_report_multi.csv"
     summary_df.to_csv(report_path, index=False)
     _LOGGER.info(f"Full regression report saved to '{report_path.name}'")
+
+    # --- Restore RC params ---
+    plt.rcParams.update(original_rc_params)
 
 
 def multi_label_classification_metrics(
     y_true: np.ndarray,
+    y_pred: np.ndarray,
     y_prob: np.ndarray,
     target_names: List[str],
     save_dir: Union[str, Path],
-    threshold: float = 0.5
+    config: Optional[MultiLabelBinaryClassificationMetricsFormat] = None
 ):
     """
     Calculates and saves classification metrics for each label individually.
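Because the threshold parameter is gone, callers of multi_label_classification_metrics now binarize the probabilities themselves and pass the result as y_pred. A minimal migration sketch that reproduces the 0.5 cut-off the previous version applied internally (per-label thresholds, such as the Youden's J values the function now saves, can be substituted):

    import numpy as np

    # y_prob: (n_samples, n_labels) predicted probabilities from the model.
    y_prob = np.array([[0.9, 0.2],
                       [0.4, 0.7]])

    # Same rule the old code applied internally with threshold=0.5.
    y_pred = (y_prob >= 0.5).astype(int)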
@@ -138,17 +172,17 @@ def multi_label_classification_metrics(
 
     Args:
         y_true (np.ndarray): Ground truth binary labels, shape (n_samples, n_labels).
+        y_pred (np.ndarray): Predicted binary labels, shape (n_samples, n_labels).
         y_prob (np.ndarray): Predicted probabilities, shape (n_samples, n_labels).
         target_names (List[str]): A list of names for the labels.
         save_dir (str | Path): Directory to save plots and reports.
-        threshold (float): The probability threshold to convert probabilities into
-            binary predictions for metrics like the confusion matrix.
+        config (object): Formatting configuration object.
     """
-    if y_true.ndim != 2 or y_prob.ndim != 2:
-        _LOGGER.error("y_true and y_prob must be 2D arrays for multi-label classification.")
+    if y_true.ndim != 2 or y_prob.ndim != 2 or y_pred.ndim != 2:
+        _LOGGER.error("y_true, y_pred, and y_prob must be 2D arrays for multi-label classification.")
         raise ValueError()
-    if y_true.shape != y_prob.shape:
-        _LOGGER.error("Shapes of y_true and y_prob must match.")
+    if y_true.shape != y_prob.shape or y_true.shape != y_pred.shape:
+        _LOGGER.error("Shapes of y_true, y_pred, and y_prob must match.")
         raise ValueError()
     if y_true.shape[1] != len(target_names):
         _LOGGER.error("Number of target names must match the number of columns in y_true.")
@@ -156,25 +190,35 @@ def multi_label_classification_metrics(
 
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
 
-    # Generate binary predictions from probabilities
-    y_pred = (y_prob >= threshold).astype(int)
+    # --- Parse Config or use defaults ---
+    if config is None:
+        # Create a default config if one wasn't provided
+        format_config = _BaseMultiLabelFormat()
+    else:
+        format_config = config
+
+    # y_pred is now passed in directly, no threshold needed.
+
+    # --- Save current RC params and update font size ---
+    original_rc_params = plt.rcParams.copy()
+    plt.rcParams.update({'font.size': format_config.font_size})
 
-    _LOGGER.info("--- Multi-Label Classification Evaluation ---")
+    # _LOGGER.info("--- Multi-Label Classification Evaluation ---")
 
-    # --- Calculate and Save Overall Metrics ---
+    # --- Calculate and Save Overall Metrics (using y_pred) ---
     h_loss = hamming_loss(y_true, y_pred)
     j_score_micro = jaccard_score(y_true, y_pred, average='micro')
     j_score_macro = jaccard_score(y_true, y_pred, average='macro')
 
     overall_report = (
-        f"Overall Multi-Label Metrics (Threshold = {threshold}):\n"
+        f"Overall Multi-Label Metrics:\n"  # No threshold to report here
         f"--------------------------------------------------\n"
         f"Hamming Loss: {h_loss:.4f}\n"
         f"Jaccard Score (micro): {j_score_micro:.4f}\n"
        f"Jaccard Score (macro): {j_score_macro:.4f}\n"
         f"--------------------------------------------------\n"
     )
-    print(overall_report)
+    # print(overall_report)
     overall_report_path = save_dir_path / "classification_report_overall.txt"
     overall_report_path.write_text(overall_report)
 
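For reference, the overall metrics written to classification_report_overall.txt come straight from scikit-learn; the small self-contained example below (not code from the package) shows how they behave on a toy multi-label array:

    import numpy as np
    from sklearn.metrics import hamming_loss, jaccard_score

    y_true = np.array([[1, 0, 1],
                       [0, 1, 0]])
    y_pred = np.array([[1, 0, 0],
                       [0, 1, 0]])

    # Fraction of individually wrong label cells: 1 of 6 -> ~0.1667
    print(hamming_loss(y_true, y_pred))
    # Pooled TP / (TP + FP + FN) = 2 / 3 -> ~0.6667
    print(jaccard_score(y_true, y_pred, average="micro"))
    # Mean of per-label Jaccard scores (1.0, 1.0, 0.0) -> ~0.6667
    print(jaccard_score(y_true, y_pred, average="macro"))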
@@ -182,28 +226,79 @@ def multi_label_classification_metrics(
     for i, name in enumerate(target_names):
         print(f" -> Evaluating label: '{name}'")
         true_i = y_true[:, i]
-        pred_i = y_pred[:, i]
-        prob_i = y_prob[:, i]
+        pred_i = y_pred[:, i]  # Use passed-in y_pred
+        prob_i = y_prob[:, i]  # Use passed-in y_prob
         sanitized_name = sanitize_filename(name)
 
-        # --- Save Classification Report for the label ---
+        # --- Save Classification Report for the label (uses y_pred) ---
         report_text = classification_report(true_i, pred_i)
         report_path = save_dir_path / f"classification_report_{sanitized_name}.txt"
         report_path.write_text(report_text)  # type: ignore
 
-        # --- Save Confusion Matrix ---
-        fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=100)
-        ConfusionMatrixDisplay.from_predictions(true_i, pred_i, cmap="Blues", ax=ax_cm)
+        # --- Save Confusion Matrix (uses y_pred) ---
+        fig_cm, ax_cm = plt.subplots(figsize=(6, 6), dpi=DPI_value)
+        disp_ = ConfusionMatrixDisplay.from_predictions(true_i,
+                                                        pred_i,
+                                                        cmap=format_config.cmap,  # Use config cmap
+                                                        ax=ax_cm,
+                                                        normalize='true',
+                                                        labels=[0, 1],
+                                                        display_labels=["Negative", "Positive"])
+
+        disp_.im_.set_clim(vmin=0.0, vmax=1.0)
+
+        # Turn off gridlines
+        ax_cm.grid(False)
+
+        # Manually update font size of cell texts
+        for text in ax_cm.texts:
+            text.set_fontsize(format_config.font_size)  # Use config font_size
+
+        fig_cm.tight_layout()
+
         ax_cm.set_title(f"Confusion Matrix for '{name}'")
         cm_path = save_dir_path / f"confusion_matrix_{sanitized_name}.svg"
         plt.savefig(cm_path)
         plt.close(fig_cm)
 
-        # --- Save ROC Curve ---
-        fpr, tpr, _ = roc_curve(true_i, prob_i)
+        # --- Save ROC Curve (uses y_prob) ---
+        fpr, tpr, thresholds = roc_curve(true_i, prob_i)
+
+        try:
+            # Calculate Youden's J statistic (tpr - fpr)
+            J = tpr - fpr
+            # Find the index of the best threshold
+            best_index = np.argmax(J)
+            optimal_threshold = thresholds[best_index]
+            best_tpr = tpr[best_index]
+            best_fpr = fpr[best_index]
+
+            # Define the filename
+            threshold_filename = f"best_threshold_{sanitized_name}.txt"
+            threshold_path = save_dir_path / threshold_filename
+
+            # The class name is the target_name for this label
+            class_name = name
+
+            # Create content for the file
+            file_content = (
+                f"Optimal Classification Threshold (Youden's J Statistic)\n"
+                f"Class/Label: {class_name}\n"
+                f"--------------------------------------------------\n"
+                f"Threshold: {optimal_threshold:.6f}\n"
+                f"True Positive Rate (TPR): {best_tpr:.6f}\n"
+                f"False Positive Rate (FPR): {best_fpr:.6f}\n"
+            )
+
+            threshold_path.write_text(file_content, encoding="utf-8")
+            _LOGGER.info(f"💾 Optimal threshold for '{name}' saved to '{threshold_path.name}'")
+
+        except Exception as e:
+            _LOGGER.warning(f"Could not calculate or save optimal threshold for '{name}': {e}")
+
         auc = roc_auc_score(true_i, prob_i)
-        fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=100)
-        ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}')
+        fig_roc, ax_roc = plt.subplots(figsize=(6, 6), dpi=DPI_value)
+        ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}', color=format_config.ROC_PR_line)  # Use config color
         ax_roc.plot([0, 1], [0, 1], 'k--')
         ax_roc.set_title(f'ROC Curve for "{name}"')
         ax_roc.set_xlabel('False Positive Rate'); ax_roc.set_ylabel('True Positive Rate')
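The new try block above selects a per-label operating point with Youden's J statistic, J = TPR - FPR (equivalently sensitivity + specificity - 1), evaluated at every candidate threshold returned by roc_curve; the maximising threshold is written to best_threshold_<label>.txt. A standalone sketch of the same calculation on toy data:

    import numpy as np
    from sklearn.metrics import roc_curve

    y_true = np.array([0, 0, 1, 1, 1, 0])
    y_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    J = tpr - fpr                 # Youden's J at each candidate threshold
    best_index = np.argmax(J)     # threshold that maximises TPR - FPR
    print(thresholds[best_index], tpr[best_index], fpr[best_index])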
@@ -212,17 +307,20 @@ def multi_label_classification_metrics(
         plt.savefig(roc_path)
         plt.close(fig_roc)
 
-        # --- Save Precision-Recall Curve ---
+        # --- Save Precision-Recall Curve (uses y_prob) ---
         precision, recall, _ = precision_recall_curve(true_i, prob_i)
         ap_score = average_precision_score(true_i, prob_i)
-        fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=100)
-        ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}')
+        fig_pr, ax_pr = plt.subplots(figsize=(6, 6), dpi=DPI_value)
+        ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}', color=format_config.ROC_PR_line)  # Use config color
         ax_pr.set_title(f'Precision-Recall Curve for "{name}"')
         ax_pr.set_xlabel('Recall'); ax_pr.set_ylabel('Precision')
         ax_pr.legend(loc='lower left'); ax_pr.grid(True, linestyle='--', alpha=0.6)
         pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
         plt.savefig(pr_path)
         plt.close(fig_pr)
+
+    # restore RC params
+    plt.rcParams.update(original_rc_params)
 
     _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
 
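Both evaluation functions now bracket their plotting with a manual copy and restore of plt.rcParams, so a caller's global Matplotlib settings are left untouched. For comparison only (this is not the package's code), matplotlib.pyplot.rc_context gives the same guarantee as a context manager and also restores the settings if plotting raises:

    import matplotlib.pyplot as plt

    # The font size applies only inside the block and is reverted automatically.
    with plt.rc_context({'font.size': 12}):
        fig, ax = plt.subplots(figsize=(6, 6), dpi=250)
        ax.plot([0, 1], [0, 1])
        fig.savefig("example.svg")
        plt.close(fig)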
@@ -235,7 +333,7 @@ def multi_target_shap_summary_plot(
     target_names: List[str],
     save_dir: Union[str, Path],
     device: torch.device = torch.device('cpu'),
-    explainer_type: Literal['deep', 'kernel'] = 'deep'
+    explainer_type: Literal['deep', 'kernel'] = 'kernel'
 ):
     """
     Calculates SHAP values for a multi-target model and saves summary plots and data for each target.
@@ -249,7 +347,7 @@ def multi_target_shap_summary_plot(
         save_dir (str | Path): Directory to save SHAP artifacts.
         device (torch.device): The torch device for SHAP calculations.
         explainer_type (Literal['deep', 'kernel']): The explainer to use.
-            - 'deep': (Default) Uses shap.DeepExplainer. Fast and efficient.
+            - 'deep': Uses shap.DeepExplainer. Fast and efficient.
             - 'kernel': Uses shap.KernelExplainer. Model-agnostic but slow and memory-intensive.
     """
     _LOGGER.info(f"--- Multi-Target SHAP Value Explanation (Using: {explainer_type.upper()}Explainer) ---")
@@ -260,7 +358,7 @@ def multi_target_shap_summary_plot(
     instances_to_explain_np = None
 
     if explainer_type == 'deep':
-        # --- 1. Use DeepExplainer (Preferred) ---
+        # --- 1. Use DeepExplainer ---
 
         # Ensure data is torch.Tensor
         if isinstance(background_data, np.ndarray):
@@ -285,10 +383,9 @@ def multi_target_shap_summary_plot(
         instances_to_explain_np = instances_to_explain.cpu().numpy()
 
     elif explainer_type == 'kernel':
-        # --- 2. Use KernelExplainer (Slow Fallback) ---
+        # --- 2. Use KernelExplainer ---
         _LOGGER.warning(
-            "Using KernelExplainer. This is memory-intensive and slow. "
-            "Consider reducing 'n_samples' if the process terminates."
+            "KernelExplainer is memory-intensive and slow. Consider reducing the number of instances to explain if the process terminates unexpectedly."
         )
 
         # Convert all data to numpy
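The reworded warning refers to shap.KernelExplainer, which this version also makes the default explainer_type. A hedged sketch of the general pattern it relies on, with a toy model standing in for the real network (names and sizes are illustrative, not the package's internals):

    import numpy as np
    import shap
    import torch
    import torch.nn as nn

    # Toy two-output model in place of the real multi-target network.
    model = nn.Linear(4, 2)
    background = np.random.rand(50, 4).astype(np.float32)  # small background set keeps memory bounded
    instances = np.random.rand(10, 4).astype(np.float32)   # instances to explain

    def predict_numpy(x_np: np.ndarray) -> np.ndarray:
        # NumPy in, NumPy out: KernelExplainer treats the model as a black box.
        with torch.no_grad():
            return model(torch.from_numpy(x_np).float()).numpy()

    explainer = shap.KernelExplainer(predict_numpy, background)
    # Per-output SHAP values (a list or a stacked array, depending on the shap version).
    shap_values = explainer.shap_values(instances)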