dragon-ml-toolbox 20.7.1__py3-none-any.whl → 20.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/METADATA +3 -1
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/RECORD +19 -14
- ml_tools/ML_configuration/_metrics.py +17 -10
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +53 -3
- ml_tools/ML_evaluation/_classification.py +83 -8
- ml_tools/ML_evaluation/_helpers.py +41 -0
- ml_tools/ML_evaluation/_regression.py +5 -0
- ml_tools/data_exploration/__init__.py +5 -1
- ml_tools/data_exploration/_analysis.py +149 -0
- ml_tools/data_exploration/_features.py +76 -0
- ml_tools/keys/_keys.py +1 -0
- ml_tools/resampling/__init__.py +19 -0
- ml_tools/resampling/_base_resampler.py +49 -0
- ml_tools/resampling/_multi_resampling.py +184 -0
- ml_tools/resampling/_single_resampling.py +113 -0
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dragon-ml-toolbox
-Version: 20.7.1
+Version: 20.9.0
 Summary: Complete pipelines and helper tools for data science and machine learning projects.
 Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
 License-Expression: MIT
@@ -174,6 +174,7 @@ ML_vision_transformers
 optimization_tools
 path_manager
 plot_fonts
+resampling
 schema
 serde
 SQL
@@ -206,6 +207,7 @@ optimization_tools
 path_manager
 plot_fonts
 PSO_optimization
+resampling
 schema
 serde
 SQL
```
{dragon_ml_toolbox-20.7.1.dist-info → dragon_ml_toolbox-20.9.0.dist-info}/RECORD
CHANGED
```diff
@@ -1,5 +1,5 @@
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.9.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.9.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
 ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
@@ -30,19 +30,20 @@ ml_tools/ML_chain/_update_schema.py,sha256=z1Us7lv6hy6GwSu1mcid50Jmqq3sh91hMQ0Ln
 ml_tools/ML_configuration/__init__.py,sha256=ogktFnYxz5jWJkhHS4DVaMldHkt3lT2gw9jx5PQ3d78,2755
 ml_tools/ML_configuration/_base_model_config.py,sha256=95L3IfobNFMtnNr79zYpDGerC1q1v7M05tWZvTS2cwE,2247
 ml_tools/ML_configuration/_finalize.py,sha256=l_n13bLu0avMdJ8hNRrH8V_wOBQZM1UGsTydKBkTysM,15047
-ml_tools/ML_configuration/_metrics.py,sha256=
+ml_tools/ML_configuration/_metrics.py,sha256=KJM7HQeoEmJUUUrxNa4wYf2N9NawGPJoy7AGdNO3gxQ,24059
 ml_tools/ML_configuration/_models.py,sha256=lvuuqvD6DWUzOa3i06NZfrdfOi9bu2e26T_QO6BGMSw,7629
 ml_tools/ML_configuration/_training.py,sha256=_M_TwouHFNbGrZQtQNAvyG_poSVpmN99cbyUonZsHhk,8969
 ml_tools/ML_datasetmaster/__init__.py,sha256=UltQzuXnlXVCkD-aeA5TW4IcMVLnQf1_aglawg4WyrI,580
-ml_tools/ML_datasetmaster/_base_datasetmaster.py,sha256=
+ml_tools/ML_datasetmaster/_base_datasetmaster.py,sha256=IgyVzRY3mlKDyBDklawvPF9SMjZFu8T2red6M-3MlQ4,16074
 ml_tools/ML_datasetmaster/_datasetmaster.py,sha256=Oy2UE3YJpKTaFwQF5TkQLgLB54-BFw_5b8wIPTxZIKU,19157
 ml_tools/ML_datasetmaster/_sequence_datasetmaster.py,sha256=cW3fuILZWs-7Yuo4T2fgGfTC4vwho3Gp4ohIKJYS7O0,18452
 ml_tools/ML_datasetmaster/_vision_datasetmaster.py,sha256=kvSqXYeNBN1JSRfSEEXYeIcsqy9HsJAl_EwFWClqlsw,67025
 ml_tools/ML_evaluation/__init__.py,sha256=e3c8JNP0tt4Kxc7QSQpGcOgrxf8JAucH4UkJvJxUL2E,1122
-ml_tools/ML_evaluation/_classification.py,sha256=
+ml_tools/ML_evaluation/_classification.py,sha256=0URqIhNEgWedy-SYRmIJ2ejLKqatiuOU7qelJ6Cv3OE,33939
 ml_tools/ML_evaluation/_feature_importance.py,sha256=mTwi3LKom_axu6UFKunELj30APDdhG9GQC2w7I9mYhI,17137
+ml_tools/ML_evaluation/_helpers.py,sha256=kE1TSYIOAAcYI1EjdudyTfFeU47Wrl0E9eNL1EOwbKg,1217
 ml_tools/ML_evaluation/_loss.py,sha256=1a4O25i3Ya_3naNZNL7ELLUL46BY86g1scA7d7q2UFM,3625
-ml_tools/ML_evaluation/_regression.py,sha256=
+ml_tools/ML_evaluation/_regression.py,sha256=UZA7_fg85ZKJQWszioWDtmkplSiXeHJk2fBYR5bRXHY,11225
 ml_tools/ML_evaluation/_sequence.py,sha256=gUk9Uvmy7MrXkfrriMnfypkgJU5XERHdqekTa2gBaOM,8004
 ml_tools/ML_evaluation/_vision.py,sha256=abBHQ6Z2GunHNusL3wcLgfI1FVNA6hBUBTq1eOA8FSA,11489
 ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py,sha256=6g3ymSxJGHXxwIN7WCD2Zi9zxKWEv-Qskd2cCGQQJ5Y,18439
@@ -103,10 +104,10 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
 ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
 ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
 ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
-ml_tools/data_exploration/__init__.py,sha256=
-ml_tools/data_exploration/_analysis.py,sha256=
+ml_tools/data_exploration/__init__.py,sha256=efUBsruHL56B429tUadl3PdG73zAF639Y430uMQRfko,1917
+ml_tools/data_exploration/_analysis.py,sha256=PJNrEBz5ZZXHoUlQ6fh9Y86nzPQrLpVPv2Ye4NfOxgs,14181
 ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
-ml_tools/data_exploration/_features.py,sha256=
+ml_tools/data_exploration/_features.py,sha256=Z1noJfDxBzFRfusFp6NlpLF2NItuZuzFHq4ssWFqny4,26273
 ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
 ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
 ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
@@ -118,7 +119,7 @@ ml_tools/ensemble_learning/_ensemble_learning.py,sha256=MHDZBR20_nStlSSeThFI3bSu
 ml_tools/excel_handler/__init__.py,sha256=AaWM3n_dqBhJLTs3OEA57ex5YykKXNOwVCyHlVsdnqI,530
 ml_tools/excel_handler/_excel_handler.py,sha256=TODudmeQgDSdxUKzLfAzizs--VL-g8WxDOfQ4sgxxLs,13965
 ml_tools/keys/__init__.py,sha256=-0c2pmrhyfROc-oQpEjJGLBMhSagA3CyFijQaaqZRqU,399
-ml_tools/keys/_keys.py,sha256=
+ml_tools/keys/_keys.py,sha256=56hlyPl2VUMsq7cFFLBypWHr-JU6ehWGwZG38l6IjI0,9389
 ml_tools/math_utilities/__init__.py,sha256=K7Obkkc4rPKj4EbRZf1BsXHfiCg7FXYv_aN9Yc2Z_Vg,400
 ml_tools/math_utilities/_math_utilities.py,sha256=BYHIVcM9tuKIhVrkgLLiM5QalJ39zx7dXYy_M9aGgiM,9012
 ml_tools/optimization_tools/__init__.py,sha256=KD8JXpfGuPndO4AHnjJGu6uV1GRwhOfboD0KZV45kzw,658
@@ -129,6 +130,10 @@ ml_tools/path_manager/_dragonmanager.py,sha256=q9wHTKPmdzywEz6N14ipUoeR3MmW0bzB4
 ml_tools/path_manager/_path_tools.py,sha256=LcZE31QlkzZWUR8g1MW_N_mPY2DpKBJLA45VJz7ZYsw,11905
 ml_tools/plot_fonts/__init__.py,sha256=KIxXRCjQ3SliEoLhEcqs7zDVZbVTn38bmSdL-yR1Q2w,187
 ml_tools/plot_fonts/_plot_fonts.py,sha256=mfjXNT9P59ymHoTI85Q8CcvfxfK5BIFBWtTZH-hNIC4,2209
+ml_tools/resampling/__init__.py,sha256=WB1YlNQgOIdSSQn-9eCIaiB0AHLSCkziFufqa-1QBG0,278
+ml_tools/resampling/_base_resampler.py,sha256=8IqkEJ7uiAiC9bqbKfsC-5vIvrN3EwH7lLVDlRKQzM8,1617
+ml_tools/resampling/_multi_resampling.py,sha256=m_iVvXPAu3p_EoBt2VZpgjhPLY1LmKa8fGtQo5E0pWk,7199
+ml_tools/resampling/_single_resampling.py,sha256=zKL4Br7Lm4Jq90X-ewQ6AKTsP923bq9RIMnTxIxtXBc,3896
 ml_tools/schema/__init__.py,sha256=K6uiZ9f0GCQ7etw1yl2-dQVLhU7RkL3KHesO3HNX6v4,334
 ml_tools/schema/_feature_schema.py,sha256=MuPf6Nf7tDhUTGyX7tcFHZh-lLSNsJkLmlf9IxdF4O4,9660
 ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
@@ -138,7 +143,7 @@ ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9
 ml_tools/utilities/_translate.py,sha256=U8hRPa3PmTpIf9n9yR3gBGmp_hkcsjQLwjAHSHc0WHs,10325
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.9.0.dist-info/METADATA,sha256=ehKhp6BpCkHcZnWpcoZU53rn4T0yI0Dboq3eH2vx8LU,7888
+dragon_ml_toolbox-20.9.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-20.9.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.9.0.dist-info/RECORD,,
```
ml_tools/ML_configuration/_metrics.py
CHANGED
```diff
@@ -98,10 +98,11 @@ class _BaseMultiLabelFormat:
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
                  calibration_bins: Union[int, Literal['auto']]='auto',
-                 font_size: int =
-                 xtick_size: int=
-                 ytick_size: int=
-                 legend_size: int=
+                 font_size: int = 26,
+                 xtick_size: int=22,
+                 ytick_size: int=22,
+                 legend_size: int=26,
+                 cm_font_size: int=26) -> None:
         """
         Initializes the formatting configuration for multi-label classification metrics.
 
@@ -127,6 +128,8 @@ class _BaseMultiLabelFormat:
 
             legend_size (int): Font size for plot legends.
 
+            cm_font_size (int): Font size for the confusion matrix.
+
         <br>
 
         ### [Matplotlib Colormaps](https://matplotlib.org/stable/users/explain/colors/colormaps.html)
@@ -142,6 +145,7 @@ class _BaseMultiLabelFormat:
         self.xtick_size = xtick_size
         self.ytick_size = ytick_size
         self.legend_size = legend_size
+        self.cm_font_size = cm_font_size
 
     def __repr__(self) -> str:
         parts = [
@@ -151,7 +155,8 @@ class _BaseMultiLabelFormat:
             f"font_size={self.font_size}",
             f"xtick_size={self.xtick_size}",
             f"ytick_size={self.ytick_size}",
-            f"legend_size={self.legend_size}"
+            f"legend_size={self.legend_size}",
+            f"cm_font_size={self.cm_font_size}"
         ]
         return f"{self.__class__.__name__}({', '.join(parts)})"
 
@@ -520,10 +525,11 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
                  calibration_bins: Union[int, Literal['auto']]='auto',
-                 font_size: int =
-                 xtick_size: int=
-                 ytick_size: int=
-                 legend_size: int=
+                 font_size: int = 26,
+                 xtick_size: int=22,
+                 ytick_size: int=22,
+                 legend_size: int=26,
+                 cm_font_size: int=26
                  ) -> None:
         super().__init__(cmap=cmap,
                          ROC_PR_line=ROC_PR_line,
@@ -531,7 +537,8 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                          font_size=font_size,
                          xtick_size=xtick_size,
                          ytick_size=ytick_size,
-                         legend_size=legend_size
+                         legend_size=legend_size,
+                         cm_font_size=cm_font_size)
 
 
 # Segmentation
```
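The new `cm_font_size` knob threads through both the base formatter and the public config class. A minimal construction sketch, assuming the class is re-exported from `ml_tools.ML_configuration` (the diff only shows the private module `_metrics.py`):

```python
# Hypothetical import path; the diff only shows ml_tools/ML_configuration/_metrics.py.
from ml_tools.ML_configuration import FormatMultiLabelBinaryClassificationMetrics

fmt = FormatMultiLabelBinaryClassificationMetrics(
    cmap="BuGn",
    ROC_PR_line="darkorange",
    font_size=26,
    cm_font_size=30,  # new in 20.9.0: font size for confusion-matrix/heatmap text
)
print(fmt)  # __repr__ now reports cm_font_size as well
```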
ml_tools/ML_datasetmaster/_base_datasetmaster.py
CHANGED
```diff
@@ -133,7 +133,7 @@ class _BaseDatasetMaker(ABC):
 
         # Get continuous feature indices *from the schema*
         if schema.continuous_feature_names:
-            if verbose >=
+            if verbose >= 3:
                 _LOGGER.info("Getting continuous feature indices from schema.")
             try:
                 # Convert columns to a standard list for .index()
@@ -189,7 +189,7 @@ class _BaseDatasetMaker(ABC):
         # ------------------------------------------------------------------
 
         if self.target_scaler is None:
-            if verbose >=
+            if verbose >= 3:
                 _LOGGER.info("Fitting a new DragonScaler on training targets.")
             # Convert to float tensor for calculation
             y_train_tensor = torch.tensor(y_train_arr, dtype=torch.float32)
@@ -202,6 +202,9 @@ class _BaseDatasetMaker(ABC):
             y_val_tensor = self.target_scaler.transform(torch.tensor(y_val_arr, dtype=torch.float32))
             y_test_tensor = self.target_scaler.transform(torch.tensor(y_test_arr, dtype=torch.float32))
             return y_train_tensor.numpy(), y_val_tensor.numpy(), y_test_tensor.numpy()
+
+        if verbose >= 2:
+            _LOGGER.info("Target scaling transformation complete.")
 
         return y_train_arr, y_val_arr, y_test_arr
 
@@ -214,6 +217,9 @@ class _BaseDatasetMaker(ABC):
 
     @property
     def train_dataset(self) -> Dataset:
+        """
+        Returns the training dataset.
+        """
         if self._train_ds is None:
             _LOGGER.error("Train Dataset not yet created.")
             raise RuntimeError()
@@ -221,6 +227,9 @@ class _BaseDatasetMaker(ABC):
 
     @property
     def validation_dataset(self) -> Dataset:
+        """
+        Returns the validation dataset.
+        """
         if self._val_ds is None:
             _LOGGER.error("Validation Dataset not yet created.")
             raise RuntimeError()
@@ -228,6 +237,9 @@ class _BaseDatasetMaker(ABC):
 
     @property
     def test_dataset(self) -> Dataset:
+        """
+        Returns the test dataset.
+        """
         if self._test_ds is None:
             _LOGGER.error("Test Dataset not yet created.")
             raise RuntimeError()
@@ -235,30 +247,50 @@ class _BaseDatasetMaker(ABC):
 
     @property
     def feature_names(self) -> list[str]:
+        """
+        Returns a list with the feature names.
+        """
         return self._feature_names
 
     @property
     def target_names(self) -> list[str]:
+        """
+        Returns a list with the target names.
+        """
         return self._target_names
 
     @property
     def number_of_features(self) -> int:
+        """
+        Returns the number of features.
+        """
         return len(self._feature_names)
 
     @property
     def number_of_targets(self) -> int:
+        """
+        Returns the number of targets.
+        """
         return len(self._target_names)
 
     @property
     def id(self) -> Optional[str]:
+        """
+        Returns the dataset ID if set, otherwise None.
+        """
         return self._id
 
     @id.setter
     def id(self, dataset_id: str):
-        if not isinstance(dataset_id, str):
+        if not isinstance(dataset_id, str):
+            _LOGGER.error("Dataset ID must be a string.")
+            raise ValueError()
         self._id = dataset_id
 
     def dataframes_info(self) -> None:
+        """
+        Prints the shapes of the dataframes after the split.
+        """
         print("--- DataFrame Shapes After Split ---")
         print(f"  X_train shape: {self._X_train_shape}, y_train shape: {self._y_train_shape}")
         print(f"  X_val shape: {self._X_val_shape}, y_val shape: {self._y_val_shape}")
@@ -266,12 +298,26 @@ class _BaseDatasetMaker(ABC):
         print("------------------------------------")
 
     def save_feature_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """
+        Saves the feature names to a text file.
+
+        Args:
+            directory (str | Path): Directory to save the feature names.
+            verbose (bool): Whether to print log messages.
+        """
         save_list_strings(list_strings=self._feature_names,
                           directory=directory,
                           filename=DatasetKeys.FEATURE_NAMES,
                           verbose=verbose)
 
     def save_target_names(self, directory: Union[str, Path], verbose: bool=True) -> None:
+        """
+        Saves the target names to a text file.
+
+        Args:
+            directory (str | Path): Directory to save the target names.
+            verbose (bool): Whether to print log messages.
+        """
         save_list_strings(list_strings=self._target_names,
                           directory=directory,
                           filename=DatasetKeys.TARGET_NAMES,
@@ -281,6 +327,10 @@ class _BaseDatasetMaker(ABC):
         """
         Saves both feature and target scalers (if they exist) to a single .pth file
         using a dictionary structure.
+
+        Args:
+            directory (str | Path): Directory to save the scaler.
+            verbose (bool): Whether to print log messages.
         """
         if self.feature_scaler is None and self.target_scaler is None:
             _LOGGER.warning("No scalers (feature or target) were fitted. Nothing to save.")
```
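The `id` setter now validates its input with the library's log-then-raise pattern. A standalone sketch of that pattern, using an illustrative class and logger rather than the package's own:

```python
import logging
from typing import Optional

_LOGGER = logging.getLogger("demo")

class DatasetIdHolder:
    """Illustrative stand-in for _BaseDatasetMaker's id property."""
    def __init__(self) -> None:
        self._id: Optional[str] = None

    @property
    def id(self) -> Optional[str]:
        """Returns the dataset ID if set, otherwise None."""
        return self._id

    @id.setter
    def id(self, dataset_id: str) -> None:
        # Log a human-readable message, then raise with an empty exception,
        # mirroring the style added in the diff above.
        if not isinstance(dataset_id, str):
            _LOGGER.error("Dataset ID must be a string.")
            raise ValueError()
        self._id = dataset_id
```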
ml_tools/ML_evaluation/_classification.py
CHANGED
```diff
@@ -28,6 +28,8 @@ from ..path_manager import make_fullpath, sanitize_filename
 from .._core import get_logger
 from ..keys._keys import _EvaluationConfig
 
+from ._helpers import check_and_abbreviate_name
+
 
 _LOGGER = get_logger("Classification Metrics")
 
@@ -85,7 +87,8 @@ def classification_metrics(save_dir: Union[str, Path],
     try:
         sorted_items = sorted(class_map.items(), key=lambda item: item[1])
         map_labels = [item[1] for item in sorted_items]
-
+        # Abbreviate display labels if needed
+        map_display_labels = [check_and_abbreviate_name(item[0]) for item in sorted_items]
     except Exception as e:
         _LOGGER.warning(f"Could not parse 'class_map': {e}")
         map_labels = None
@@ -397,6 +400,10 @@ def classification_metrics(save_dir: Union[str, Path],
     # --- Step 1: Get binned data directly ---
     # calculates reliability diagram data without needing a temporary plot
     prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)
+
+    # Anchor the plot to (0,0) and (1,1) to ensure the line spans the full diagonal
+    prob_true = np.concatenate(([0.0], prob_true, [1.0]))
+    prob_pred = np.concatenate(([0.0], prob_pred, [1.0]))
 
     # --- Step 2: Plot ---
     ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
@@ -467,6 +474,9 @@ def multi_label_classification_metrics(
 
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
 
+    # --- Pre-process target names for abbreviation ---
+    target_names = [check_and_abbreviate_name(name) for name in target_names]
+
     # --- Parse Config or use defaults ---
     if config is None:
         # Create a default config if one wasn't provided
@@ -481,6 +491,10 @@ def multi_label_classification_metrics(
     ytick_size = format_config.ytick_size
     legend_size = format_config.legend_size
     base_font_size = format_config.font_size
+
+    # config font size for heatmap
+    cm_font_size = format_config.cm_font_size
+    cm_tick_size = cm_font_size - 4
 
     # --- Calculate and Save Overall Metrics (using y_pred) ---
     h_loss = hamming_loss(y_true, y_pred)
@@ -488,7 +502,7 @@ def multi_label_classification_metrics(
     j_score_macro = jaccard_score(y_true, y_pred, average='macro')
 
     overall_report = (
-        f"Overall Multi-Label Metrics:\n"
+        f"Overall Multi-Label Metrics:\n"
         f"--------------------------------------------------\n"
         f"Hamming Loss: {h_loss:.4f}\n"
         f"Jaccard Score (micro): {j_score_micro:.4f}\n"
@@ -499,9 +513,65 @@ def multi_label_classification_metrics(
     overall_report_path = save_dir_path / "classification_report.txt"
     overall_report_path.write_text(overall_report)
 
+    # --- Save Classification Report Heatmap (Multi-label) ---
+    try:
+        # Generate full report as dict
+        full_report_dict = classification_report(y_true, y_pred, target_names=target_names, output_dict=True)
+        report_df = pd.DataFrame(full_report_dict)
+
+        # Cleanup
+        # Remove 'accuracy' column if it exists
+        report_df = report_df.drop(columns=['accuracy'], errors='ignore')
+
+        # Remove 'support' row explicitly
+        if 'support' in report_df.index:
+            report_df = report_df.drop(index='support')
+
+        # Transpose: Rows = Classes/Averages, Cols = Metrics
+        plot_df = report_df.T
+
+        # Dynamic Height
+        fig_height = max(5.0, len(plot_df.index) * 0.5 + 4.0)
+        fig_width = 8.0
+
+        fig_heat, ax_heat = plt.subplots(figsize=(fig_width, fig_height), dpi=_EvaluationConfig.DPI)
+
+        # Plot
+        sns.heatmap(plot_df,
+                    annot=True,
+                    cmap=format_config.cmap,
+                    fmt='.2f',
+                    vmin=0.0,
+                    vmax=1.0,
+                    cbar_kws={'shrink': 0.9})
+
+        ax_heat.set_title("Classification Report Heatmap", pad=_EvaluationConfig.LABEL_PADDING, fontsize=cm_font_size)
+
+        # manually increase the font size of the elements
+        for text in ax_heat.texts:
+            text.set_fontsize(cm_tick_size)
+
+        cbar = ax_heat.collections[0].colorbar
+        cbar.ax.tick_params(labelsize=cm_tick_size - 4) # type: ignore
+
+        ax_heat.tick_params(axis='x', labelsize=cm_tick_size, pad=_EvaluationConfig.LABEL_PADDING)
+        ax_heat.tick_params(axis='y', labelsize=cm_tick_size, pad=_EvaluationConfig.LABEL_PADDING, rotation=0)
+
+        plt.tight_layout()
+        heatmap_path = save_dir_path / "classification_report_heatmap.svg"
+        plt.savefig(heatmap_path)
+        _LOGGER.info(f"📊 Report heatmap saved as '{heatmap_path.name}'")
+        plt.close(fig_heat)
+
+    except Exception as e:
+        _LOGGER.error(f"Could not generate multi-label classification report heatmap: {e}")
+
     # --- Per-Label Metrics and Plots ---
     for i, name in enumerate(target_names):
-
+        # strip whitespace from name
+        name = name.strip()
+
+        # print(f" -> Evaluating label: '{name}'")
         true_i = y_true[:, i]
         pred_i = y_pred[:, i]  # Use passed-in y_pred
         prob_i = y_prob[:, i]  # Use passed-in y_prob
@@ -537,7 +607,7 @@ def multi_label_classification_metrics(
         ax_cm.tick_params(axis='y', labelsize=ytick_size)
 
         # Set titles and labels with padding
-        ax_cm.set_title(f"Confusion Matrix
+        ax_cm.set_title(f"Confusion Matrix - {name}", pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
         ax_cm.set_xlabel(ax_cm.get_xlabel(), labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
         ax_cm.set_ylabel(ax_cm.get_ylabel(), labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
 
@@ -594,7 +664,7 @@ def multi_label_classification_metrics(
         ax_roc.plot(fpr, tpr, label=f'AUC = {auc:.2f}', color=format_config.ROC_PR_line)  # Use config color
         ax_roc.plot([0, 1], [0, 1], 'k--')
 
-        ax_roc.set_title(f'ROC Curve
+        ax_roc.set_title(f'ROC Curve - {name}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
         ax_roc.set_xlabel('False Positive Rate', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
         ax_roc.set_ylabel('True Positive Rate', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
 
@@ -616,7 +686,7 @@ def multi_label_classification_metrics(
         ap_score = average_precision_score(true_i, prob_i)
         fig_pr, ax_pr = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
         ax_pr.plot(recall, precision, label=f'AP = {ap_score:.2f}', color=format_config.ROC_PR_line)  # Use config color
-        ax_pr.set_title(f'
+        ax_pr.set_title(f'PR Curve - {name}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
         ax_pr.set_xlabel('Recall', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
         ax_pr.set_ylabel('Precision', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
 
@@ -654,15 +724,20 @@ def multi_label_classification_metrics(
         # Calculate calibration curve for this specific label
         prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
 
+        # Anchor the plot to (0,0) and (1,1)
+        prob_true = np.concatenate(([0.0], prob_true, [1.0]))
+        prob_pred = np.concatenate(([0.0], prob_pred, [1.0]))
+
+        # Plot the calibration curve
         ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
         ax_cal.plot(prob_pred,
                     prob_true,
                     marker='o',
                     linewidth=2,
-                    label=f"Calibration
+                    label=f"Model Calibration",
                     color=format_config.ROC_PR_line)
 
-        ax_cal.set_title(f'
+        ax_cal.set_title(f'Calibration - {name}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
         ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
         ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
```
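Both calibration plots now anchor the reliability curve at (0,0) and (1,1). A self-contained sketch of the same trick on synthetic data (all names here are illustrative, not from the package):

```python
import numpy as np
from sklearn.calibration import calibration_curve

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=500)
y_prob = np.clip(0.6 * y_true + rng.normal(0.2, 0.2, size=500), 0.0, 1.0)

prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)

# calibration_curve only returns per-bin means, so the raw curve starts and ends
# mid-plot; prepending (0,0) and appending (1,1) makes it span the full diagonal.
prob_true = np.concatenate(([0.0], prob_true, [1.0]))
prob_pred = np.concatenate(([0.0], prob_pred, [1.0]))
```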
ml_tools/ML_evaluation/_helpers.py
ADDED
```diff
@@ -0,0 +1,41 @@
+from ..keys._keys import _EvaluationConfig
+from ..path_manager import sanitize_filename
+from .._core import get_logger
+
+
+_LOGGER = get_logger("Metrics Helper")
+
+
+def check_and_abbreviate_name(name: str) -> str:
+    """
+    Checks if a name exceeds the NAME_LIMIT. If it does, creates an abbreviation
+    (initials of words) or truncates it if the abbreviation is empty.
+
+    Args:
+        name (str): The original label or target name.
+
+    Returns:
+        str: The potentially abbreviated name.
+    """
+    limit = _EvaluationConfig.NAME_LIMIT
+
+    # Strip whitespace
+    name = name.strip()
+
+    if len(name) <= limit:
+        return name
+
+    # Attempt abbreviation: First letter of each word (split by space or underscore)
+    parts = [w for w in name.replace("_", " ").split() if w]
+    abbr = "".join(p[0].upper() for p in parts)
+
+    # Keep only alphanumeric characters
+    abbr = "".join(ch for ch in abbr if ch.isalnum())
+
+    # Fallback if abbreviation failed or is empty
+    if not abbr:
+        sanitized = sanitize_filename(name)
+        abbr = sanitized[:limit]
+
+    _LOGGER.warning(f"Label '{name}' is too long. Abbreviating to '{abbr}'.")
+    return abbr
```
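A quick sketch of the helper's behavior with `NAME_LIMIT = 15` (the direct import of the private module is shown only for illustration):

```python
from ml_tools.ML_evaluation._helpers import check_and_abbreviate_name

check_and_abbreviate_name("short_label")                   # -> "short_label" (within limit)
check_and_abbreviate_name("very_long_target_column_name")  # -> "VLTCN" (word initials), with a warning logged
```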
ml_tools/ML_evaluation/_regression.py
CHANGED
```diff
@@ -19,6 +19,8 @@ from ..path_manager import make_fullpath, sanitize_filename
 from .._core import get_logger
 from ..keys._keys import _EvaluationConfig
 
+from ._helpers import check_and_abbreviate_name
+
 
 _LOGGER = get_logger("Regression Metrics")
 
@@ -180,6 +182,9 @@ def multi_target_regression_metrics(
     if y_true.shape[1] != len(target_names):
         _LOGGER.error("Number of target names must match the number of columns in y_true.")
         raise ValueError()
+
+    # --- Pre-process target names for abbreviation ---
+    target_names = [check_and_abbreviate_name(name) for name in target_names]
 
     save_dir_path = make_fullpath(save_dir, make=True, enforce="directory")
     metrics_summary = []
```
ml_tools/data_exploration/__init__.py
CHANGED
```diff
@@ -2,6 +2,7 @@ from ._analysis import (
     summarize_dataframe,
     show_null_columns,
     match_and_filter_columns_by_regex,
+    check_class_balance,
 )
 
 from ._cleaning import (
@@ -28,6 +29,7 @@ from ._features import (
     split_continuous_binary,
     split_continuous_categorical_targets,
     encode_categorical_features,
+    encode_classification_target,
     reconstruct_one_hot,
     reconstruct_binary,
     reconstruct_multibinary,
@@ -44,7 +46,6 @@ from .._core import _imprimir_disponibles
 
 __all__ = [
     "summarize_dataframe",
-    "show_null_columns",
     "drop_constant_columns",
     "drop_rows_with_missing_data",
     "drop_columns_with_missing_data",
@@ -61,10 +62,13 @@ __all__ = [
     "plot_categorical_vs_target",
     "plot_correlation_heatmap",
    "encode_categorical_features",
+    "encode_classification_target",
     "finalize_feature_schema",
     "apply_feature_schema",
     "reconstruct_from_schema",
     "match_and_filter_columns_by_regex",
+    "show_null_columns",
+    "check_class_balance",
     "standardize_percentages",
     "reconstruct_one_hot",
     "reconstruct_binary",
```
ml_tools/data_exploration/_analysis.py
CHANGED
```diff
@@ -16,6 +16,7 @@ __all__ = [
     "summarize_dataframe",
     "show_null_columns",
     "match_and_filter_columns_by_regex",
+    "check_class_balance",
 ]
 
 
@@ -212,3 +213,151 @@ def match_and_filter_columns_by_regex(
 
     return filtered_df, matched_columns
 
+
+def check_class_balance(
+    df: pd.DataFrame,
+    target: Union[str, list[str]],
+    plot_to_dir: Optional[Union[str, Path]] = None,
+    plot_filename: str = "Class_Balance"
+) -> pd.DataFrame:
+    """
+    Analyzes the class balance for classification targets.
+
+    Handles two cases:
+    1. Single Column (Binary/Multi-class): Calculates frequency of each unique value.
+    2. List of Columns (Multi-label Binary): Calculates the frequency of positive values (1) per column.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        target (str | list[str]): The target column name (for single/multi-class classification)
+            or list of column names (for multi-label-binary classification).
+        plot_to_dir (str | Path | None): Directory to save the balance plot.
+        plot_filename (str): Filename for the plot (without extension).
+
+    Returns:
+        pd.DataFrame: Summary table of counts and percentages.
+    """
+    # Early fail for empty DataFrame and handle list of targets with only one item
+    if df.empty:
+        _LOGGER.error("Input DataFrame is empty.")
+        raise ValueError()
+
+    if isinstance(target, list):
+        if len(target) == 0:
+            _LOGGER.error("Target list is empty.")
+            raise ValueError()
+        elif len(target) == 1:
+            target = target[0]  # Simplify to single column case
+
+    # Case 1: Single Target (Binary or Multi-class)
+    if isinstance(target, str):
+        if target not in df.columns:
+            _LOGGER.error(f"Target column '{target}' not found in DataFrame.")
+            raise ValueError()
+
+        # Calculate stats
+        counts = df[target].value_counts(dropna=False).sort_index()
+        percents = df[target].value_counts(normalize=True, dropna=False).sort_index() * 100
+
+        summary = pd.DataFrame({
+            'Count': counts,
+            'Percentage': percents.round(2)
+        })
+        summary.index.name = "Class"
+
+        # Plotting
+        if plot_to_dir:
+            try:
+                save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")
+
+                plt.figure(figsize=(10, 6))
+                # Convert index to str to handle numeric classes cleanly on x-axis
+                x_labels = summary.index.astype(str)
+                bars = plt.bar(x_labels, summary['Count'], color='lightgreen', edgecolor='black', alpha=0.7)
+
+                plt.title(f"Class Balance: {target}")
+                plt.xlabel(target)
+                plt.ylabel("Count")
+                plt.grid(axis='y', linestyle='--', alpha=0.5)
+
+                # Add percentage labels on top of bars
+                for bar, pct in zip(bars, summary['Percentage']):
+                    height = bar.get_height()
+                    plt.text(bar.get_x() + bar.get_width()/2, height,
+                             f'{pct:.1f}%', ha='center', va='bottom', fontsize=10)
+
+                plt.tight_layout()
+                full_filename = sanitize_filename(plot_filename) + ".svg"
+                plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
+                plt.close()
+                _LOGGER.info(f"Saved class balance plot: '{full_filename}'")
+            except Exception as e:
+                _LOGGER.error(f"Failed to plot class balance. Error: {e}")
+                plt.close()
+
+        return summary
+
+    # Case 2: Multi-label (List of binary columns)
+    elif isinstance(target, list):
+        missing_cols = [t for t in target if t not in df.columns]
+        if missing_cols:
+            _LOGGER.error(f"Target columns not found: {missing_cols}")
+            raise ValueError()
+
+        stats = []
+        for col in target:
+            # Assume 0/1 or False/True. Sum gives the count of positives.
+            # We enforce numeric to be safe
+            try:
+                numeric_series = pd.to_numeric(df[col], errors='coerce').fillna(0)
+                pos_count = numeric_series.sum()
+                total_count = len(df)
+                pct = (pos_count / total_count) * 100
+            except Exception:
+                _LOGGER.warning(f"Column '{col}' could not be processed as numeric. Assuming 0 positives.")
+                pos_count = 0
+                pct = 0.0
+
+            stats.append({
+                'Label': col,
+                'Positive_Count': int(pos_count),
+                'Positive_Percentage': round(pct, 2)
+            })
+
+        summary = pd.DataFrame(stats).set_index("Label").sort_values("Positive_Percentage", ascending=True)
+
+        # Plotting
+        if plot_to_dir:
+            try:
+                save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")
+
+                # Dynamic height for many labels
+                height = max(6, len(target) * 0.4)
+                plt.figure(figsize=(10, height))
+
+                bars = plt.barh(summary.index, summary['Positive_Percentage'], color='lightgreen', edgecolor='black', alpha=0.7)
+
+                plt.title(f"Multi-label Binary Class Balance")
+                plt.xlabel("Positive Class Percentage (%)")
+                plt.xlim(0, 100)
+                plt.grid(axis='x', linestyle='--', alpha=0.5)
+
+                # Add count labels at the end of bars
+                for bar, count in zip(bars, summary['Positive_Count']):
+                    width = bar.get_width()
+                    plt.text(width + 1, bar.get_y() + bar.get_height()/2, f'{width:.1f}%', ha='left', va='center', fontsize=9)
+
+                plt.tight_layout()
+                full_filename = sanitize_filename(plot_filename) + ".svg"
+                plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
+                plt.close()
+                _LOGGER.info(f"Saved multi-label balance plot: '{full_filename}'")
+            except Exception as e:
+                _LOGGER.error(f"Failed to plot class balance. Error: {e}")
+                plt.close()
+
+        return summary.sort_values("Positive_Percentage", ascending=False)
+
+    else:
+        _LOGGER.error("Target must be a string or a list of strings.")
+        raise TypeError()
```
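A brief usage sketch for both modes, on an invented DataFrame (`check_class_balance` is re-exported from `ml_tools.data_exploration` per the `__init__.py` change above):

```python
import pandas as pd
from ml_tools.data_exploration import check_class_balance

df = pd.DataFrame({
    "label":      ["cat", "dog", "cat", "cat", "bird"],
    "is_toxic":   [0, 1, 0, 0, 1],
    "is_flagged": [1, 1, 0, 0, 0],
})

# Single multi-class target: per-class counts and percentages
summary = check_class_balance(df, target="label")

# Multi-label binary targets: positive counts per column, plus an SVG bar plot
summary_ml = check_class_balance(df, target=["is_toxic", "is_flagged"], plot_to_dir="reports")
```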
ml_tools/data_exploration/_features.py
CHANGED
```diff
@@ -3,7 +3,10 @@ from pandas.api.types import is_numeric_dtype, is_object_dtype
 import numpy as np
 from typing import Any, Optional, Union
 import re
+import json
+from pathlib import Path
 
+from ..path_manager import make_fullpath
 from .._core import get_logger
 
 
@@ -15,6 +18,7 @@ __all__ = [
     "split_continuous_binary",
     "split_continuous_categorical_targets",
     "encode_categorical_features",
+    "encode_classification_target",
     "reconstruct_one_hot",
     "reconstruct_binary",
     "reconstruct_multibinary",
@@ -263,6 +267,78 @@ def encode_categorical_features(
     return df_encoded, mappings
 
 
+def encode_classification_target(
+    df: pd.DataFrame,
+    target_col: str,
+    save_dir: Union[str, Path],
+    verbose: int = 2
+) -> tuple[pd.DataFrame, dict[str, int]]:
+    """
+    Encodes a target classification column into integers (0, 1, 2...) and saves the mapping to a JSON file.
+
+    This ensures that the target variable is in the correct numeric format for training
+    and provides a persistent artifact (the JSON file) to map predictions back to labels later.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame.
+        target_col (str): Name of the target column to encode.
+        save_dir (str | Path): Directory where the class map JSON will be saved.
+        verbose (int): Verbosity level for logging.
+
+    Returns:
+        Tuple (Dataframe, Dict):
+        - A new DataFrame with the target column encoded as integers.
+        - The dictionary mapping original labels (str) to integers (int).
+    """
+    if target_col not in df.columns:
+        _LOGGER.error(f"Target column '{target_col}' not found in DataFrame.")
+        raise ValueError()
+
+    # Validation: Check for missing values in target
+    if df[target_col].isnull().any():
+        n_missing = df[target_col].isnull().sum()
+        _LOGGER.error(f"Target column '{target_col}' contains {n_missing} missing values. Please handle them before encoding.")
+        raise ValueError()
+
+    # Ensure directory exists
+    save_path = make_fullpath(save_dir, make=True, enforce="directory")
+    file_path = save_path / "class_map.json"
+
+    # Get unique values and sort them to ensure deterministic encoding (0, 1, 2...)
+    # Convert to string to ensure the keys in JSON are strings
+    unique_labels = sorted(df[target_col].astype(str).unique())
+
+    # Create mapping: { Label -> Integer }
+    class_map = {label: idx for idx, label in enumerate(unique_labels)}
+
+    # Apply mapping
+    # cast column to string to match the keys in class_map
+    df_encoded = df.copy()
+    df_encoded[target_col] = df_encoded[target_col].astype(str).map(class_map)
+
+    # Save to JSON
+    try:
+        with open(file_path, 'w', encoding='utf-8') as f:
+            json.dump(class_map, f, indent=4)
+
+        if verbose >= 2:
+            _LOGGER.info(f"Class mapping saved to: '{file_path}'")
+
+        if verbose >= 3:
+            _LOGGER.info(f"Target '{target_col}' encoded with {len(class_map)} classes.")
+            # Print a preview
+            if len(class_map) <= 10:
+                print(f"  Mapping: {class_map}")
+            else:
+                print(f"  Mapping (first 5): {dict(list(class_map.items())[:5])} ...")
+
+    except Exception as e:
+        _LOGGER.error(f"Failed to save class map JSON. Error: {e}")
+        raise IOError()
+
+    return df_encoded, class_map
+
+
 def reconstruct_one_hot(
     df: pd.DataFrame,
     features_to_reconstruct: list[Union[str, tuple[str, Optional[str]]]],
```
ml_tools/keys/_keys.py
CHANGED
```diff
@@ -306,6 +306,7 @@ class _EvaluationConfig:
     LOSS_PLOT_LEGEND_SIZE = 24
     # CM settings
     CM_SIZE = (9, 8) # used for multi label binary classification confusion matrix
+    NAME_LIMIT = 15 # max number of characters for feature/label names in plots
 
 class _OneHotOtherPlaceholder:
     """Used internally by GUI_tools."""
```
ml_tools/resampling/__init__.py
ADDED
```diff
@@ -0,0 +1,19 @@
+from ._single_resampling import (
+    DragonResampler,
+)
+
+from ._multi_resampling import (
+    DragonMultiResampler,
+)
+
+from .._core import _imprimir_disponibles
+
+
+__all__ = [
+    "DragonResampler",
+    "DragonMultiResampler",
+]
+
+
+def info():
+    _imprimir_disponibles(__all__)
```
ml_tools/resampling/_base_resampler.py
ADDED
```diff
@@ -0,0 +1,49 @@
+import polars as pl
+import pandas as pd
+from typing import Union
+from abc import ABC, abstractmethod
+
+
+__all__ = ["_DragonBaseResampler"]
+
+
+class _DragonBaseResampler(ABC):
+    """
+    Base class for Dragon resamplers handling common I/O and state.
+    """
+    def __init__(self,
+                 return_pandas: bool = False,
+                 seed: int = 42):
+        self.return_pandas = return_pandas
+        self.seed = seed
+
+    def _convert_to_polars(self, df: Union[pd.DataFrame, pl.DataFrame]) -> pl.DataFrame:
+        """Standardizes input to Polars DataFrame."""
+        if isinstance(df, pd.DataFrame):
+            return pl.from_pandas(df)
+        return df
+
+    def _convert_to_pandas(self, df: pl.DataFrame) -> pd.DataFrame:
+        """Converts Polars DataFrame back to Pandas."""
+        return df.to_pandas(use_pyarrow_extension_array=False)
+
+    def _process_return(self, df: pl.DataFrame, shuffle: bool = True) -> Union[pd.DataFrame, pl.DataFrame]:
+        """
+        Finalizes the DataFrame:
+        1. Global Shuffle (optional but recommended for ML).
+        2. Conversion to Pandas (if requested).
+        """
+        if shuffle:
+            # Random shuffle of the final dataset
+            df = df.sample(fraction=1.0, seed=self.seed, with_replacement=False)
+
+        if self.return_pandas:
+            return self._convert_to_pandas(df)
+        return df
+
+    @abstractmethod
+    def describe_balance(self, df: Union[pd.DataFrame, pl.DataFrame], top_n: int = 10) -> None:
+        """
+        Prints a statistical summary of the target distribution.
+        """
+        pass
```
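The base class centralizes pandas/polars conversion and the final shuffle. A minimal, hypothetical subclass sketch showing how the helpers compose; only the `DragonMultiResampler` and `DragonResampler` below actually ship:

```python
import pandas as pd
import polars as pl
from typing import Union
# Private module; imported here only for illustration.
from ml_tools.resampling._base_resampler import _DragonBaseResampler

class IdentityResampler(_DragonBaseResampler):
    """Hypothetical no-op resampler demonstrating the shared I/O helpers."""

    def resample(self, df: Union[pd.DataFrame, pl.DataFrame]) -> Union[pd.DataFrame, pl.DataFrame]:
        df_pl = self._convert_to_polars(df)  # accepts pandas or polars input
        return self._process_return(df_pl)   # shuffles, then honors return_pandas

    def describe_balance(self, df: Union[pd.DataFrame, pl.DataFrame], top_n: int = 10) -> None:
        print(self._convert_to_polars(df).head(top_n))
```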
ml_tools/resampling/_multi_resampling.py
ADDED
```diff
@@ -0,0 +1,184 @@
+import polars as pl
+import pandas as pd
+import numpy as np
+from typing import Union, Optional
+
+from .._core import get_logger
+
+from ._base_resampler import _DragonBaseResampler
+
+
+_LOGGER = get_logger("DragonMultiResampler")
+
+
+__all__ = [
+    "DragonMultiResampler",
+]
+
+
+class DragonMultiResampler(_DragonBaseResampler):
+    """
+    A robust resampler for multi-label binary classification tasks using Polars.
+
+    It provides methods to downsample "all-negative" rows and balance the dataset
+    based on unique label combinations (Powerset).
+    """
+    def __init__(self,
+                 target_columns: list[str],
+                 return_pandas: bool = False,
+                 seed: int = 42):
+        """
+        Args:
+            target_columns (List[str]): The list of binary target column names.
+            return_pandas (bool): Whether to return results as pandas DataFrame.
+            seed (int): Random seed for reproducibility.
+        """
+        super().__init__(return_pandas=return_pandas, seed=seed)
+        self.targets = target_columns
+
+    def downsample_all_negatives(self,
+                                 df: Union[pd.DataFrame, pl.DataFrame],
+                                 negative_ratio: float = 1.0,
+                                 verbose: int = 2) -> Union[pd.DataFrame, pl.DataFrame]:
+        """
+        Downsamples rows where ALL target columns are 0 ("background" class).
+
+        Args:
+            df (pd.DataFrame | pl.DataFrame): Input DataFrame.
+            negative_ratio (float): Ratio of negatives to positives to retain.
+            verbose (int): Verbosity level for logging.
+
+        Returns:
+            Dataframe: Resampled DataFrame.
+        """
+        df_pl = self._convert_to_polars(df)
+
+        # 1. Identify "All Negative" vs "Has Signal"
+        fold_expr = pl.sum_horizontal(pl.col(self.targets)).cast(pl.UInt32)
+
+        df_pos = df_pl.filter(fold_expr > 0)
+        df_neg = df_pl.filter(fold_expr == 0)
+
+        n_pos = df_pos.height
+        n_neg_original = df_neg.height
+
+        if n_pos == 0:
+            if verbose >= 1:
+                _LOGGER.warning("No positive cases found in any label. Returning original DataFrame.")
+            return self._process_return(df_pl, shuffle=False)
+
+        # 2. Calculate target count for negatives
+        target_n_neg = int(n_pos * negative_ratio)
+
+        # 3. Sample if necessary
+        if n_neg_original > target_n_neg:
+            if verbose >= 2:
+                _LOGGER.info(f"📉 Downsampling 'All-Negative' rows from {n_neg_original} to {target_n_neg}")
+
+            # Here we use standard sampling because we are not grouping
+            df_neg_sampled = df_neg.sample(n=target_n_neg, seed=self.seed, with_replacement=False)
+            df_resampled = pl.concat([df_pos, df_neg_sampled])
+
+            return self._process_return(df_resampled)
+        else:
+            if verbose >= 1:
+                _LOGGER.warning(f"Negative count ({n_neg_original}) is already below target ({target_n_neg}). No downsampling applied.")
+            return self._process_return(df_pl, shuffle=False)
+
+    def balance_powerset(self,
+                         df: Union[pd.DataFrame, pl.DataFrame],
+                         max_samples_per_combination: Optional[int] = None,
+                         quantile_limit: float = 0.90,
+                         verbose: int = 2) -> Union[pd.DataFrame, pl.DataFrame]:
+        """
+        Groups data by unique label combinations (Powerset) and downsamples
+        majority combinations.
+
+        Args:
+            df (pd.DataFrame | pl.DataFrame): Input DataFrame.
+            max_samples_per_combination (int | None): Fixed cap per combination.
+                If None, uses quantile_limit to determine cap.
+            quantile_limit (float): Quantile to determine cap if max_samples_per_combination is None.
+            verbose (int): Verbosity level for logging.
+
+        Returns:
+            Dataframe: Resampled DataFrame.
+        """
+        df_pl = self._convert_to_polars(df)
+
+        # 1. Create a hash/structural representation of the targets for grouping
+        df_lazy = df_pl.lazy().with_columns(
+            pl.concat_list(pl.col(self.targets)).alias("_powerset_key")
+        )
+
+        # 2. Calculate frequencies
+        # We need to collect partially to calculate the quantile cap
+        combo_counts = df_lazy.group_by("_powerset_key").len().collect()
+
+        # Determine the Cap
+        if max_samples_per_combination is None:
+            # Handle potential None from quantile (satisfies linter)
+            q_val = combo_counts["len"].quantile(quantile_limit)
+
+            if q_val is None:
+                if verbose >= 1:
+                    _LOGGER.warning("Data empty or insufficient to calculate quantile. Returning original.")
+                return self._process_return(df_pl, shuffle=False)
+
+            cap_size = int(q_val)
+
+            if verbose >= 3:
+                _LOGGER.info(f"📊 Auto-calculated Powerset Cap: {cap_size} samples (based on {quantile_limit} quantile).")
+        else:
+            cap_size = max_samples_per_combination
+
+        # 3. Apply Stratified Sampling / Capping (Randomized)
+        df_balanced = (
+            df_lazy
+            .filter(
+                pl.int_range(0, pl.len())
+                .shuffle(seed=self.seed)
+                .over("_powerset_key")
+                < cap_size
+            )
+            .drop("_powerset_key")
+            .collect()
+        )
+
+        if verbose >= 2:
+            original_count = df_pl.height
+            new_count = df_balanced.height
+            _LOGGER.info(f"⚖️ Powerset Balancing: Reduced from {original_count} to {new_count} rows.")
+
+        return self._process_return(df_balanced)
+
+    def describe_balance(self, df: Union[pd.DataFrame, pl.DataFrame], top_n: int = 10) -> None:
+        df_pl = self._convert_to_polars(df)
+        total_rows = df_pl.height
+
+        message_1 = f"\n📊 --- Target Balance Report ({total_rows} samples) ---\n🎯 Multi-Targets: {len(self.targets)} columns"
+
+        # A. Individual Label Counts
+        sums = df_pl.select([
+            pl.sum(col).alias(col) for col in self.targets
+        ]).transpose(include_header=True, header_name="Label", column_names=["Count"])
+
+        sums = sums.with_columns(
+            (pl.col("Count") / total_rows * 100).round(2).alias("Percentage(%)")
+        ).sort("Count", descending=True)
+
+        message_1 += "\n🔹 Individual Label Frequencies:"
+
+        # B. Powerset (Combination) Counts
+        message_2 = f"🔹 Top {top_n} Label Combinations (Powerset):"
+
+        combo_stats = (
+            df_pl.group_by(self.targets)
+            .len(name="Count")
+            .sort("Count", descending=True)
+            .with_columns(
+                (pl.col("Count") / total_rows * 100).round(2).alias("Percentage(%)")
+            )
+        )
+
+        _LOGGER.info(f"{message_1}\n{sums.head(top_n)}\n{message_2}\n{combo_stats.head(top_n)}")
```
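A usage sketch with an invented multi-label frame; both resamplers are exported from `ml_tools.resampling` per the new `__init__.py` above:

```python
import polars as pl
from ml_tools.resampling import DragonMultiResampler

df = pl.DataFrame({
    "feature": [0.1, 0.5, 0.9, 0.2, 0.7, 0.3],
    "tag_a":   [0, 1, 0, 0, 1, 0],
    "tag_b":   [0, 0, 1, 0, 1, 0],
})

resampler = DragonMultiResampler(target_columns=["tag_a", "tag_b"], seed=42)
resampler.describe_balance(df)  # logs per-label frequencies and top powerset combinations

# Keep one all-negative row per positive row, then cap each label combination
df_down = resampler.downsample_all_negatives(df, negative_ratio=1.0)
df_bal = resampler.balance_powerset(df_down, quantile_limit=0.90)
```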
ml_tools/resampling/_single_resampling.py
ADDED
```diff
@@ -0,0 +1,113 @@
+import polars as pl
+import pandas as pd
+import numpy as np
+from typing import Union
+
+from .._core import get_logger
+
+from ._base_resampler import _DragonBaseResampler
+
+
+_LOGGER = get_logger("DragonResampler")
+
+
+__all__ = [
+    "DragonResampler",
+]
+
+
+class DragonResampler(_DragonBaseResampler):
+    """
+    A resampler for Single-Target Classification tasks (Binary or Multiclass).
+
+    It balances classes by downsampling majority classes relative to the size of the minority class.
+    """
+    def __init__(self,
+                 target_column: str,
+                 return_pandas: bool = False,
+                 seed: int = 42):
+        """
+        Args:
+            target_column (str): The name of the single target column.
+            return_pandas (bool): Whether to return results as pandas DataFrame.
+            seed (int): Random seed for reproducibility.
+        """
+        super().__init__(return_pandas=return_pandas, seed=seed)
+        self.target = target_column
+
+    def balance_classes(self,
+                        df: Union[pd.DataFrame, pl.DataFrame],
+                        majority_ratio: float = 1.0,
+                        verbose: int = 2) -> Union[pd.DataFrame, pl.DataFrame]:
+        """
+        Downsamples all classes to match the minority class size (scaled by a ratio).
+        """
+        df_pl = self._convert_to_polars(df)
+
+        # 1. Calculate Class Counts
+        counts = df_pl.group_by(self.target).len().sort("len")
+
+        if counts.height == 0:
+            _LOGGER.error("DataFrame is empty or target column missing.")
+            return self._process_return(df_pl, shuffle=False)
+
+        # 2. Identify Statistics
+        min_val = counts["len"].min()
+        max_val = counts["len"].max()
+
+        if min_val is None or max_val is None:
+            _LOGGER.error("Failed to calculate class statistics (unexpected None).")
+            raise ValueError()
+
+        minority_count: int = min_val # type: ignore
+        majority_count: int = max_val # type: ignore
+
+        # Calculate the cap
+        cap_size = int(minority_count * majority_ratio)
+
+        if verbose >= 3:
+            _LOGGER.info(f"📊 Class Distribution:\n{counts}")
+            _LOGGER.info(f"🎯 Strategy: Cap majorities at {cap_size}")
+
+        # Optimization: If data is already balanced enough
+        if majority_count <= cap_size:
+            if verbose >= 2:
+                _LOGGER.info("Data is already within the requested balance ratio.")
+            return self._process_return(df_pl, shuffle=False)
+
+        # 3. Apply Downsampling (Randomized)
+        # We generate a random range index per group and filter by it.
+        # This ensures we pick a random subset, not the first N rows.
+        df_balanced = (
+            df_pl.lazy()
+            .filter(
+                pl.int_range(0, pl.len())
+                .shuffle(seed=self.seed)
+                .over(self.target)
+                < cap_size
+            )
+            .collect()
+        )
+
+        if verbose >= 2:
+            reduced_count = df_balanced.height
+            _LOGGER.info(f"⚖️ Balancing Complete: {df_pl.height} -> {reduced_count} rows.")
+
+        return self._process_return(df_balanced)
+
+    def describe_balance(self, df: Union[pd.DataFrame, pl.DataFrame], top_n: int = 10) -> None:
+        df_pl = self._convert_to_polars(df)
+        total_rows = df_pl.height
+
+        message = f"\n📊 --- Target Balance Report ({total_rows} samples) ---\n🎯 Single Target: '{self.target}'"
+
+        stats = (
+            df_pl.group_by(self.target)
+            .len(name="Count")
+            .sort("Count", descending=True)
+            .with_columns(
+                (pl.col("Count") / total_rows * 100).round(2).alias("Percentage(%)")
+            )
+        )
+
+        _LOGGER.info(f"{message}\n{stats.head(top_n)}")
```
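And the single-target counterpart, again on invented data:

```python
import polars as pl
from ml_tools.resampling import DragonResampler

df = pl.DataFrame({
    "x":     list(range(10)),
    "label": ["a"] * 7 + ["b"] * 3,
})

resampler = DragonResampler(target_column="label", return_pandas=True, seed=42)
resampler.describe_balance(df)

# Majority classes are capped at int(minority_count * majority_ratio) = int(3 * 1.5) = 4 rows
df_balanced = resampler.balance_classes(df, majority_ratio=1.5)
```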
The remaining files (WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, top_level.txt) are unchanged between versions.