dragon-ml-toolbox 20.5.0__py3-none-any.whl → 20.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/METADATA +1 -1
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/RECORD +14 -13
- ml_tools/ETL_cleaning/__init__.py +3 -1
- ml_tools/ETL_cleaning/_clean_tools.py +109 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +72 -19
- ml_tools/ML_configuration/_metrics.py +16 -8
- ml_tools/ML_evaluation/_classification.py +76 -30
- ml_tools/keys/_keys.py +1 -0
- ml_tools/utilities/__init__.py +10 -0
- ml_tools/utilities/_translate.py +292 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/top_level.txt +0 -0
{dragon_ml_toolbox-20.5.0.dist-info → dragon_ml_toolbox-20.7.0.dist-info}/RECORD CHANGED

@@ -1,11 +1,11 @@
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+dragon_ml_toolbox-20.7.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
 ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
-ml_tools/ETL_cleaning/__init__.py,sha256=
+ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
 ml_tools/ETL_cleaning/_basic_clean.py,sha256=2_FhWP-xYgl8s51H3OjYb_sqsW2yX_QZ4kmyrKjbSsc,13892
-ml_tools/ETL_cleaning/_clean_tools.py,sha256=
-ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=
+ml_tools/ETL_cleaning/_clean_tools.py,sha256=7aIC4w0CLK93E2nWC8h8YbI8bW_3Na9myD9VBMA-9zQ,9575
+ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=WvDHtdQTQldYwRWkmr3MlqFgWPl8rrEHp6m1uqgH0ho,13291
 ml_tools/ETL_engineering/__init__.py,sha256=EVIU0skxaH4ZDk8tEkOrxhTMSSA2LI_glhIpzFSxxlg,1007
 ml_tools/ETL_engineering/_dragon_engineering.py,sha256=D-D6tmhyQ3I9-cXgxLVVbQBRTZoNsWaKPsvcTUaetws,10810
 ml_tools/ETL_engineering/_transforms.py,sha256=qOxa_vjh3gzS4IiGFqq_0Wnh0ilQO41jRiIp-6Ej4vw,47079
@@ -30,7 +30,7 @@ ml_tools/ML_chain/_update_schema.py,sha256=z1Us7lv6hy6GwSu1mcid50Jmqq3sh91hMQ0Ln
 ml_tools/ML_configuration/__init__.py,sha256=ogktFnYxz5jWJkhHS4DVaMldHkt3lT2gw9jx5PQ3d78,2755
 ml_tools/ML_configuration/_base_model_config.py,sha256=95L3IfobNFMtnNr79zYpDGerC1q1v7M05tWZvTS2cwE,2247
 ml_tools/ML_configuration/_finalize.py,sha256=l_n13bLu0avMdJ8hNRrH8V_wOBQZM1UGsTydKBkTysM,15047
-ml_tools/ML_configuration/_metrics.py,sha256=
+ml_tools/ML_configuration/_metrics.py,sha256=xKtEKzphtidwwU8UuUpGv4B8Y6Bv0tAOjEFUYfz8Ehc,23758
 ml_tools/ML_configuration/_models.py,sha256=lvuuqvD6DWUzOa3i06NZfrdfOi9bu2e26T_QO6BGMSw,7629
 ml_tools/ML_configuration/_training.py,sha256=_M_TwouHFNbGrZQtQNAvyG_poSVpmN99cbyUonZsHhk,8969
 ml_tools/ML_datasetmaster/__init__.py,sha256=UltQzuXnlXVCkD-aeA5TW4IcMVLnQf1_aglawg4WyrI,580
@@ -39,7 +39,7 @@ ml_tools/ML_datasetmaster/_datasetmaster.py,sha256=Oy2UE3YJpKTaFwQF5TkQLgLB54-BF
 ml_tools/ML_datasetmaster/_sequence_datasetmaster.py,sha256=cW3fuILZWs-7Yuo4T2fgGfTC4vwho3Gp4ohIKJYS7O0,18452
 ml_tools/ML_datasetmaster/_vision_datasetmaster.py,sha256=kvSqXYeNBN1JSRfSEEXYeIcsqy9HsJAl_EwFWClqlsw,67025
 ml_tools/ML_evaluation/__init__.py,sha256=e3c8JNP0tt4Kxc7QSQpGcOgrxf8JAucH4UkJvJxUL2E,1122
-ml_tools/ML_evaluation/_classification.py,sha256=
+ml_tools/ML_evaluation/_classification.py,sha256=8bKQejKrgMipnxU1T12ted7p60xvJS0d0MvHtdNBCBM,30971
 ml_tools/ML_evaluation/_feature_importance.py,sha256=mTwi3LKom_axu6UFKunELj30APDdhG9GQC2w7I9mYhI,17137
 ml_tools/ML_evaluation/_loss.py,sha256=1a4O25i3Ya_3naNZNL7ELLUL46BY86g1scA7d7q2UFM,3625
 ml_tools/ML_evaluation/_regression.py,sha256=hnT2B2_6AnQ7aA7uk-X2lZL9G5JFGCduDXyZbr1gFCA,11037
@@ -118,7 +118,7 @@ ml_tools/ensemble_learning/_ensemble_learning.py,sha256=MHDZBR20_nStlSSeThFI3bSu
 ml_tools/excel_handler/__init__.py,sha256=AaWM3n_dqBhJLTs3OEA57ex5YykKXNOwVCyHlVsdnqI,530
 ml_tools/excel_handler/_excel_handler.py,sha256=TODudmeQgDSdxUKzLfAzizs--VL-g8WxDOfQ4sgxxLs,13965
 ml_tools/keys/__init__.py,sha256=-0c2pmrhyfROc-oQpEjJGLBMhSagA3CyFijQaaqZRqU,399
-ml_tools/keys/_keys.py,sha256=
+ml_tools/keys/_keys.py,sha256=lL9NlijxOEAhfDPPqK_wL3QhjalrYK_fWM-KNniSIOA,9308
 ml_tools/math_utilities/__init__.py,sha256=K7Obkkc4rPKj4EbRZf1BsXHfiCg7FXYv_aN9Yc2Z_Vg,400
 ml_tools/math_utilities/_math_utilities.py,sha256=BYHIVcM9tuKIhVrkgLLiM5QalJ39zx7dXYy_M9aGgiM,9012
 ml_tools/optimization_tools/__init__.py,sha256=KD8JXpfGuPndO4AHnjJGu6uV1GRwhOfboD0KZV45kzw,658
@@ -134,10 +134,11 @@ ml_tools/schema/_feature_schema.py,sha256=MuPf6Nf7tDhUTGyX7tcFHZh-lLSNsJkLmlf9Ix
 ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
 ml_tools/serde/__init__.py,sha256=IDirr8i-qjUHB71hmHO6lGiODhUoOnUcXYrvb_XgrzE,292
 ml_tools/serde/_serde.py,sha256=8QnYK8ZG21zdNaC0v63iSz2bhgwOKRKAWxTVQvMV0A8,5525
-ml_tools/utilities/__init__.py,sha256=
+ml_tools/utilities/__init__.py,sha256=h4lE3SQstg-opcQj6QSKhu-HkqSbmHExsWoM9vC5D9U,1035
+ml_tools/utilities/_translate.py,sha256=t5Z7s9X3KTHn-jpe49yRdhYkzAfYzzU4EsIJiUdRnEk,10296
 ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
 ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
-dragon_ml_toolbox-20.
+dragon_ml_toolbox-20.7.0.dist-info/METADATA,sha256=MfguicRfdmedIMRUMM6qVIelIr56Mrqdjv4dvTPhB6Y,7866
+dragon_ml_toolbox-20.7.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dragon_ml_toolbox-20.7.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+dragon_ml_toolbox-20.7.0.dist-info/RECORD,,
ml_tools/ETL_cleaning/__init__.py CHANGED

@@ -10,7 +10,8 @@ from ._dragon_cleaner import (
 )
 
 from ._clean_tools import (
-    save_unique_values
+    save_unique_values,
+    save_category_counts,
 )
 
 from .._core import _imprimir_disponibles
@@ -20,6 +21,7 @@ __all__ = [
     "DragonColumnCleaner",
     "DragonDataFrameCleaner",
     "save_unique_values",
+    "save_category_counts",
     "basic_clean",
     "basic_clean_drop",
     "drop_macro_polars",
ml_tools/ETL_cleaning/_clean_tools.py CHANGED

@@ -13,6 +13,7 @@ _LOGGER = get_logger("ETL Clean Tools")
 
 __all__ = [
     "save_unique_values",
+    "save_category_counts",
 ]
 
 
@@ -126,3 +127,111 @@ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
             counter += 1
 
     _LOGGER.info(f"{counter} files of unique values created.")
+
+
+################ Category Counts per column #################
+def save_category_counts(csv_path_or_df: Union[str, Path, pl.DataFrame],
+                         output_dir: Union[str, Path],
+                         use_columns: Optional[list[str]] = None,
+                         verbose: bool = False,
+                         keep_column_order: bool = True) -> None:
+    """
+    Calculates the frequency and percentage of each unique value in the specified columns
+    and saves the distribution report to a text file.
+
+    Useful for checking class balance or identifying rare categories.
+
+    Args:
+        csv_path_or_df (str | Path | pl.DataFrame):
+            The file path to the input CSV file or a Polars DataFrame.
+        output_dir (str | Path):
+            The directory where the report files will be saved.
+        use_columns (List[str] | None):
+            Columns to analyze. If None, all columns are processed.
+        verbose (bool):
+            If True, prints progress info.
+        keep_column_order (bool):
+            If True, prepends a numeric prefix to filenames to maintain order.
+    """
+    # 1. Handle Input
+    if isinstance(csv_path_or_df, pl.DataFrame):
+        df = csv_path_or_df
+        if use_columns:
+            valid_cols = [c for c in use_columns if c in df.columns]
+            if not valid_cols:
+                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+                raise ValueError()
+            df = df.select(valid_cols)
+    else:
+        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+    output_path = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+    total_rows = df.height
+
+    if total_rows == 0:
+        _LOGGER.warning("Input DataFrame is empty. No counts to save.")
+        return
+
+    counter = 0
+
+    # 2. Process Each Column
+    for i, col_name in enumerate(df.columns):
+        try:
+            # Group by, count, and calculate percentage
+            # We treat nulls as a category here to see missing data frequency
+            stats = (
+                df.select(pl.col(col_name))
+                .group_by(col_name, maintain_order=False)
+                .len(name="count")
+                .with_columns(
+                    (pl.col("count") / total_rows * 100).alias("pct")
+                )
+                .sort("count", descending=True)
+            )
+
+            # Collect to python list of dicts for writing
+            rows = stats.iter_rows(named=True)
+            unique_count = stats.height
+
+            # Check thresholds for warning
+            is_high_cardinality = (unique_count > 300) or ((unique_count / total_rows) > 0.5)
+
+        except Exception:
+            _LOGGER.error(f"Could not calculate counts for column '{col_name}'.")
+            continue
+
+        # 3. Write to File
+        sanitized_name = sanitize_filename(col_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+
+        prefix = f"{i + 1}_" if keep_column_order else ''
+        file_path = output_path / f"{prefix}{sanitized_name}_counts.txt"
+
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Distribution for column: '{col_name}'\n")
+                f.write(f"# Total Rows: {total_rows} | Unique Values: {unique_count}\n")
+
+                if is_high_cardinality:
+                    f.write(f"# WARNING: High cardinality detected (Unique/Total ratio: {unique_count/total_rows:.2%}).\n")
+
+                f.write("-" * 65 + "\n")
+                f.write(f"{'Count':<10} | {'Percentage':<12} | {'Value'}\n")
+                f.write("-" * 65 + "\n")
+
+                for row in rows:
+                    val = str(row[col_name])
+                    count = row["count"]
+                    pct = row["pct"]
+                    f.write(f"{count:<10} | {pct:>10.2f}% | {val}\n")
+
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            if verbose:
+                print(f"  Saved distribution for '{col_name}'.")
+            counter += 1
+
+    _LOGGER.info(f"{counter} distribution files created.")
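A minimal usage sketch of the `save_category_counts` helper added above, assuming the 20.7.0 package layout; the CSV path, output directory, and column names are hypothetical placeholders.

```python
from ml_tools.ETL_cleaning import save_category_counts

# Writes one "<n>_<column>_counts.txt" report per column, sorted by frequency,
# flagging high-cardinality columns (>300 uniques or a >50% unique/total ratio).
save_category_counts(
    csv_path_or_df="data/raw_survey.csv",   # placeholder CSV path
    output_dir="reports/category_counts",   # placeholder output directory
    use_columns=["country", "material"],    # None processes every column
    verbose=True,
    keep_column_order=True,                 # prefixes filenames with 1_, 2_, ...
)
```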
ml_tools/ETL_cleaning/_dragon_cleaner.py CHANGED

@@ -1,13 +1,13 @@
 import polars as pl
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 
 from ..utilities import save_dataframe_filename, load_dataframe
 
 from .._core import get_logger
 from ..path_manager import make_fullpath
 
-from ._clean_tools import save_unique_values
+from ._clean_tools import save_unique_values, save_category_counts
 
 
 _LOGGER = get_logger("DragonCleaner")
@@ -33,12 +33,18 @@ class DragonColumnCleaner:
     """
     def __init__(self,
                  column_name: str,
-
+                 exact_matches: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
+                 rules: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
                  case_insensitive: bool = False):
         """
         Args:
             column_name (str):
                 The name of the column to be cleaned.
+            exact_matches (Dict[str, str | None]):
+                A dictionary of EXACT string matches to replacement strings.
+                - Uses a hash map, which is significantly faster than regex.
+                - Used for simple 1-to-1 mappings (e.g., {'Aluminum': 'Al'}).
+                - Runs BEFORE the regex rules.
             rules (Dict[str, str | None]):
                 A dictionary of regex patterns to replacement strings.
                 - Replacement can be None to indicate that matching values should be converted to null.
@@ -61,25 +67,47 @@ class DragonColumnCleaner:
         if not isinstance(column_name, str) or not column_name:
             _LOGGER.error("The 'column_name' must be a non-empty string.")
             raise TypeError()
-
-
-
-
-
-            if not isinstance(pattern, str):
-                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+
+        # Validate Regex Rules
+        if rules is not None:
+            if not isinstance(rules, dict):
+                _LOGGER.error("The 'rules' argument must be a dictionary.")
                 raise TypeError()
-
-
+            for pattern, replacement in rules.items():
+                if not isinstance(pattern, str):
+                    _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+                    raise TypeError()
+                if replacement is not None and not isinstance(replacement, str):
+                    _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+                    raise TypeError()
+
+        # Validate Exact Matches
+        if exact_matches is not None:
+            if not isinstance(exact_matches, dict):
+                _LOGGER.error("The 'exact_matches' argument must be a dictionary.")
                 raise TypeError()
+            for key, val in exact_matches.items():
+                if not isinstance(key, str):
+                    _LOGGER.error("All keys in 'exact_matches' must be strings.")
+                    raise TypeError()
+                if val is not None and not isinstance(val, str):
+                    _LOGGER.error("All values in 'exact_matches' must be strings or None.")
+                    raise TypeError()
+
+        # Raise if both are None or empty
+        if not rules and not exact_matches:
+            _LOGGER.error("At least one of 'rules' or 'exact_matches' must be provided.")
+            raise ValueError()
 
         self.column_name = column_name
-        self.rules = rules
+        self.rules = rules if rules else {}
+        self.exact_matches = exact_matches if exact_matches else {}
         self.case_insensitive = case_insensitive
 
     def preview(self,
                 csv_path: Union[str, Path],
                 report_dir: Union[str, Path],
+                show_distribution: bool = True,
                 add_value_separator: bool=False,
                 rule_batch_size: int = 150):
         """
@@ -90,6 +118,8 @@ class DragonColumnCleaner:
                 The path to the CSV file containing the data to clean.
             report_dir (str | Path):
                 The directory where the preview report will be saved.
+            show_distribution (bool):
+                If True, generates a category count report for the column after cleaning.
             add_value_separator (bool):
                 If True, adds a separator line between each unique value in the report.
             rule_batch_size (int):
@@ -101,13 +131,21 @@ class DragonColumnCleaner:
         preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
         df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)
 
-        # Apply cleaning rules
+        # Apply cleaning rules and save reports
         save_unique_values(csv_path_or_df=df_preview,
                            output_dir=report_dir,
                            use_columns=[self.column_name],
                            verbose=False,
                            keep_column_order=False,
                            add_value_separator=add_value_separator)
+
+        # Optionally save category counts
+        if show_distribution:
+            save_category_counts(csv_path_or_df=df_preview,
+                                 output_dir=report_dir,
+                                 use_columns=[self.column_name],
+                                 verbose=False,
+                                 keep_column_order=False)
 
 
 class DragonDataFrameCleaner:
@@ -181,16 +219,23 @@ class DragonDataFrameCleaner:
         for cleaner in self.cleaners:
             col_name = cleaner.column_name
 
-            #
+            # Start expression for this batch
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # --- PHASE 1: EXACT MATCHES ---
+            # Apply dictionary-based replacement first (faster than regex)
+            if cleaner.exact_matches:
+                # 'replace' handles dictionary mapping safely. If value is mapped to None, it becomes null.
+                col_expr = col_expr.replace(cleaner.exact_matches)
+
+            # --- PHASE 2: REGEX PATTERNS ---
             all_rules = list(cleaner.rules.items())
 
             # Process in batches of 'rule_batch_size'
             for i in range(0, len(all_rules), rule_batch_size):
                 rule_batch = all_rules[i : i + rule_batch_size]
 
-                #
-                col_expr = pl.col(col_name).cast(pl.String)
-
+                # continue chaining operations on the same col_expr
                 for pattern, replacement in rule_batch:
                     final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
 
@@ -202,6 +247,15 @@ class DragonDataFrameCleaner:
                         col_expr = col_expr.str.replace_all(final_pattern, replacement)
 
                 # Apply this batch of rules to the LazyFrame
+                # apply partially here to keep the logical plan size under control
+                final_lf = final_lf.with_columns(col_expr.alias(col_name))
+
+                # Reset col_expr for the next batch, but pointing to the 'new' column
+                # This ensures the next batch works on the result of the previous batch
+                col_expr = pl.col(col_name)
+
+            # If we had exact matches but NO regex rules, we still need to apply the expression once
+            if cleaner.exact_matches and not all_rules:
                 final_lf = final_lf.with_columns(col_expr.alias(col_name))
 
             # 3. Collect Results
@@ -242,4 +296,3 @@ class DragonDataFrameCleaner:
         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
 
         return None
-
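A sketch of how the new `exact_matches` phase might be combined with regex `rules`; the column, mappings, and data are hypothetical placeholders. Exact matches are applied as a hash-map lookup before any regex, so cheap 1-to-1 substitutions no longer pay the regex cost.

```python
import polars as pl
from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

metal_cleaner = DragonColumnCleaner(
    column_name="metal",
    # Phase 1: exact string replacements (mapping a value to None nullifies it)
    exact_matches={"Aluminum": "Al", "aluminium": "Al", "N/A": None},
    # Phase 2: regex patterns, applied in batches after the exact matches
    rules={r"\s*\(pure\)\s*": "", r"^unknown.*": None},
    case_insensitive=True,
)

df = pl.DataFrame({"metal": ["Aluminum", "Iron (pure)", "unknown alloy", "N/A"]})
cleaned = DragonDataFrameCleaner(cleaners=[metal_cleaner]).clean(df, rule_batch_size=150)
print(cleaned)
```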
ml_tools/ML_configuration/_metrics.py CHANGED

@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Literal
 
 
 __all__ = [
@@ -26,7 +26,7 @@ class _BaseClassificationFormat:
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  xtick_size: int=22,
                  ytick_size: int=22,
                  legend_size: int=26,
@@ -46,8 +46,8 @@ class _BaseClassificationFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
-            calibration_bins (int): The number of bins to use when
-
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plot. If 'auto', the number will be dynamically determined based on the number of samples.
+            - Typical int values: 10, 15, 20
 
             font_size (int): The base font size to apply to the plots.
 
@@ -97,6 +97,7 @@ class _BaseMultiLabelFormat:
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -115,6 +116,9 @@ class _BaseMultiLabelFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plots for each label. If 'auto', the number will be dynamically determined based on the number of samples.
+            - Typical int values: 10, 15, 20
+
             font_size (int): The base font size to apply to the plots.
 
             xtick_size (int): Font size for x-axis tick labels.
@@ -133,6 +137,7 @@ class _BaseMultiLabelFormat:
         """
         self.cmap = cmap
         self.ROC_PR_line = ROC_PR_line
+        self.calibration_bins = calibration_bins
         self.font_size = font_size
         self.xtick_size = xtick_size
         self.ytick_size = ytick_size
@@ -142,6 +147,7 @@ class _BaseMultiLabelFormat:
         parts = [
             f"cmap='{self.cmap}'",
             f"ROC_PR_line='{self.ROC_PR_line}'",
+            f"calibration_bins={self.calibration_bins}",
             f"font_size={self.font_size}",
             f"xtick_size={self.xtick_size}",
             f"ytick_size={self.ytick_size}",
@@ -416,7 +422,7 @@ class FormatBinaryClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -440,7 +446,7 @@ class FormatMultiClassClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -464,7 +470,7 @@ class FormatBinaryImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -488,7 +494,7 @@ class FormatMultiClassImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -513,6 +519,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -520,6 +527,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                 ) -> None:
         super().__init__(cmap=cmap,
                          ROC_PR_line=ROC_PR_line,
+                         calibration_bins=calibration_bins,
                          font_size=font_size,
                          xtick_size=xtick_size,
                          ytick_size=ytick_size,
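Construction sketch for the new `calibration_bins` option, assuming the Format* classes are re-exported from `ml_tools.ML_configuration` (otherwise import them from `ml_tools.ML_configuration._metrics`). `'auto'`, now the default, defers bin selection to the evaluation code; a positive int keeps the previous fixed-bin behaviour.

```python
from ml_tools.ML_configuration import FormatBinaryClassificationMetrics

fmt_auto = FormatBinaryClassificationMetrics(calibration_bins="auto")  # sample-size based bins
fmt_fixed = FormatBinaryClassificationMetrics(calibration_bins=15)     # fixed number of bins
```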
ml_tools/ML_evaluation/_classification.py CHANGED

@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.calibration import
+from sklearn.calibration import calibration_curve
 from sklearn.metrics import (
     classification_report,
     ConfusionMatrixDisplay,
@@ -378,42 +378,42 @@ def classification_metrics(save_dir: Union[str, Path],
 
     # --- Save Calibration Plot ---
     fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+    user_chosen_bins = format_config.calibration_bins
+
+    # --- Automate Bin Selection ---
+    if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+        # Determine bins based on number of samples
+        n_samples = y_true.shape[0]
+        if n_samples < 200:
+            dynamic_bins = 5
+        elif n_samples < 1000:
+            dynamic_bins = 10
+        else:
+            dynamic_bins = 15
+    else:
+        dynamic_bins = user_chosen_bins
+
+    # --- Step 1: Get binned data directly ---
+    # calculates reliability diagram data without needing a temporary plot
+    prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)
 
-    # --- Step
-    with plt.ioff(): # Suppress showing the temporary plot
-        fig_temp, ax_temp = plt.subplots()
-        cal_display_temp = CalibrationDisplay.from_predictions(
-            y_true_binary, # Use binarized labels
-            y_score,
-            n_bins=format_config.calibration_bins,
-            ax=ax_temp,
-            name="temp" # Add a name to suppress potential warnings
-        )
-        # Get the x, y coordinates of the binned data
-        line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
-        plt.close(fig_temp) # Close the temporary plot
-
-    # --- Step 2: Build the plot from scratch ---
+    # --- Step 2: Plot ---
     ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
 
-
-
-
-
-
-
-
-            'color': format_config.ROC_PR_line,
-            'linestyle': '--',
-            'linewidth': 2,
-        }
-    )
+    # Plot the actual calibration curve (connect points with a line)
+    ax_cal.plot(prob_pred,
+                prob_true,
+                marker='o', # Add markers to see bin locations
+                linewidth=2,
+                label="Model calibration",
+                color=format_config.ROC_PR_line)
 
     ax_cal.set_title(f'Reliability Curve{plot_title}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size + 2)
     ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
     ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
 
-    # --- Step 3: Set final limits
+    # --- Step 3: Set final limits ---
     ax_cal.set_ylim(0.0, 1.0)
     ax_cal.set_xlim(0.0, 1.0)
 
@@ -428,7 +428,7 @@ def classification_metrics(save_dir: Union[str, Path],
     cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
     plt.savefig(cal_path)
     plt.close(fig_cal)
-
+
     _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")
 
 
@@ -632,6 +632,52 @@ def multi_label_classification_metrics(
         pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
         plt.savefig(pr_path)
         plt.close(fig_pr)
+
+        # --- Save Calibration Plot (New Feature) ---
+        fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+        user_chosen_bins = format_config.calibration_bins
+
+        # --- Automate Bin Selection ---
+        if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+            # Determine bins based on number of samples
+            n_samples = y_true.shape[0]
+            if n_samples < 200:
+                dynamic_bins = 5
+            elif n_samples < 1000:
+                dynamic_bins = 10
+            else:
+                dynamic_bins = 15
+        else:
+            dynamic_bins = user_chosen_bins
+
+        # Calculate calibration curve for this specific label
+        prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
+
+        ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+        ax_cal.plot(prob_pred,
+                    prob_true,
+                    marker='o',
+                    linewidth=2,
+                    label=f"Calibration for '{name}'",
+                    color=format_config.ROC_PR_line)
+
+        ax_cal.set_title(f'Reliability Curve for "{name}"', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
+        ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+        ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+
+        ax_cal.set_ylim(0.0, 1.0)
+        ax_cal.set_xlim(0.0, 1.0)
+
+        ax_cal.tick_params(axis='x', labelsize=xtick_size)
+        ax_cal.tick_params(axis='y', labelsize=ytick_size)
+        ax_cal.legend(loc='lower right', fontsize=legend_size)
+        ax_cal.grid(True)
+
+        plt.tight_layout()
+        cal_path = save_dir_path / f"calibration_plot_{sanitized_name}.svg"
+        plt.savefig(cal_path)
+        plt.close(fig_cal)
 
     _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
 
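A standalone sketch of the bin-selection heuristic the new calibration code applies when `calibration_bins` is `'auto'` (thresholds taken from the hunks above); `pick_calibration_bins` and the toy labels/scores are illustrative, not part of the package.

```python
import numpy as np
from sklearn.calibration import calibration_curve

def pick_calibration_bins(n_samples: int, user_bins="auto") -> int:
    # Mirrors the diff: an explicit positive int wins, otherwise scale with sample count.
    if isinstance(user_bins, int) and user_bins > 0:
        return user_bins
    if n_samples < 200:
        return 5
    if n_samples < 1000:
        return 10
    return 15

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=500)
y_score = np.clip(y_true * 0.6 + rng.normal(0.2, 0.2, size=500), 0.0, 1.0)

n_bins = pick_calibration_bins(len(y_true))          # 10 for 500 samples
prob_true, prob_pred = calibration_curve(y_true, y_score, n_bins=n_bins)
```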
ml_tools/keys/_keys.py CHANGED

ml_tools/utilities/__init__.py CHANGED

@@ -15,6 +15,13 @@ from ._utility_tools import (
     train_dataset_yielder
 )
 
+from ._translate import (
+    translate_dataframe_columns,
+    create_translation_template,
+    audit_column_translation
+)
+
+
 from .._core import _imprimir_disponibles
 
 
@@ -27,6 +34,9 @@ __all__ = [
     "save_dataframe",
     "save_dataframe_with_schema",
     "merge_dataframes",
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation",
    "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"
ml_tools/utilities/_translate.py ADDED

@@ -0,0 +1,292 @@
+import json
+import pandas as pd
+import polars as pl
+from pathlib import Path
+from typing import Union, Literal
+
+from ..path_manager import make_fullpath
+from .._core import get_logger
+
+from ._utility_save_load import load_dataframe
+
+
+_LOGGER = get_logger("Translation Tools")
+
+
+__all__ = [
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation"
+]
+
+
+def translate_dataframe_columns(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    mapper: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"] = "A_to_B",
+    verbose: int = 3
+) -> Union[pd.DataFrame, pl.DataFrame]:
+    """
+    Translates the column names of a DataFrame (Pandas or Polars) using a provided mapping source.
+
+    The mapping can be a python dictionary, a JSON file, or a CSV file.
+
+    Translation Logic:
+    -----------------
+    The DataFrame currently has columns in 'Language A'.
+
+    - "A_to_B" (Standard):
+        The mapper is structured as {Language A : Language B}.
+        Keys match the current DataFrame columns.
+
+    - "B_to_A" (Inverted Source):
+        The mapper is structured as {Language B : Language A}.
+        Values match the current DataFrame columns.
+
+    Parameters
+    ----------
+    df : (pd.DataFrame | pl.DataFrame)
+        The input DataFrame to be translated.
+    mapper : (dict[str, str] | str | Path)
+        The source of the translation mapping:
+        - Dict: {'original_name': 'new_name'}
+        - JSON path: File containing a single JSON object (dict).
+        - CSV path: File with two columns.
+    direction : Literal["A_to_B", "B_to_A"]
+        Specifies the structure of the provided mapper relative to the DataFrame.
+    verbose : int
+        Whether to log warnings and information about the process.
+
+    Returns
+    -------
+    Dataframe:
+        The polars or pandas DataFrame with renamed columns.
+    """
+    # df type validation
+    if not isinstance(df, (pd.DataFrame, pl.DataFrame)):
+        _LOGGER.error(f"Input df must be a pandas or polars DataFrame. Got: {type(df)}")
+        raise TypeError()
+
+    # 1. Load and Standardize the Mapping
+    translation_map = _load_translation_mapping(mapper, direction)
+
+    # 2. Validation: Check intersection between DF columns and Map keys
+    df_cols = set(df.columns)
+    map_keys = set(translation_map.keys())
+
+    # Calculate overlap
+    common_cols = df_cols.intersection(map_keys)
+
+    if not common_cols:
+        if verbose >= 1:
+            _LOGGER.warning("No column names matched the provided translation mapping. Returning original DataFrame.")
+        return df
+
+    missing_in_map = df_cols - map_keys
+    if missing_in_map and verbose >= 1:
+        _LOGGER.warning(f"Columns not found in translation map: {list(missing_in_map)}")
+
+    if verbose >= 3:
+        _LOGGER.info(f"Translating {len(common_cols)} columns...")
+
+    # 3. Apply Translation
+    try:
+        if isinstance(df, pd.DataFrame):
+            return df.rename(columns=translation_map)
+        elif isinstance(df, pl.DataFrame):
+            return df.rename(translation_map)
+    except Exception as e:
+        _LOGGER.error(f"Failed to rename columns: {e}")
+        raise e
+
+    if verbose >= 2:
+        _LOGGER.info(f"Successfully translated {len(common_cols)} columns.")
+
+
+def create_translation_template(
+    df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+    save_path: Union[str, Path],
+    verbose: bool = True
+) -> None:
+    """
+    Generates a JSON translation template from a DataFrame's column names.
+
+    Creates a 'translation_template.json' file where keys are the dataframe column names and values
+    are empty strings, ready for manual translation.
+
+    Parameters
+    ----------
+    df_or_path : [DataFrame | str | Path]
+        The DataFrame or path to a CSV file to extract column names from.
+    save_path : [str | Path]
+        The destination directory for the .json template.
+    """
+    # 1. Get Columns
+    if isinstance(df_or_path, (str, Path)):
+        df, _ = load_dataframe(df_or_path, kind="pandas", verbose=False)
+        columns = df.columns.tolist()
+    elif isinstance(df_or_path, pd.DataFrame):
+        columns = df_or_path.columns.tolist()
+    elif isinstance(df_or_path, pl.DataFrame):
+        columns = df_or_path.columns
+    else:
+        _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+        raise TypeError()
+
+    # 2. Create Dictionary {ColName : ""}
+    template_dict = {col: "" for col in columns}
+
+    # 3. Save to JSON
+    out_path = make_fullpath(save_path, enforce="directory")
+    full_out_path = out_path / "translation_template.json"
+
+    try:
+        with open(full_out_path, 'w', encoding='utf-8') as f:
+            json.dump(template_dict, f, indent=4, ensure_ascii=False)
+
+        if verbose:
+            _LOGGER.info(f"Translation template created at '{out_path.name}' with {len(columns)} entries.")
+    except Exception as e:
+        _LOGGER.error(f"Failed to save template: {e}")
+        raise e
+
+
+def audit_column_translation(
+    df_or_path: Union[pd.DataFrame, pl.DataFrame, str, Path],
+    mapper: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"] = "A_to_B"
+) -> None:
+    """
+    Audits the coverage of a translation map against a DataFrame WITHOUT applying changes.
+
+    Logs a detailed report of:
+    - How many columns will be renamed.
+    - Which DataFrame columns are NOT in the map (will remain unchanged).
+    - Which Map keys are NOT in the DataFrame (unused mappings).
+
+    Parameters
+    ----------
+    df_or_path : [DataFrame | str | Path]
+        The target dataset to audit.
+    mapper : [Dict | str | Path]
+        The translation source.
+    direction : ["A_to_B" | "B_to_A"]
+        Direction logic (see translate_dataframe_columns).
+    """
+    # 1. Get DataFrame Columns
+    if isinstance(df_or_path, (str, Path)):
+        df, df_name = load_dataframe(df_or_path, kind="pandas", verbose=False)
+        cols = set(df.columns)
+        source_name = f"File: '{df_name}'"
+    elif isinstance(df_or_path, pd.DataFrame):
+        cols = set(df_or_path.columns)
+        source_name = "DataFrame (Pandas)"
+    elif isinstance(df_or_path, pl.DataFrame):
+        cols = set(df_or_path.columns)
+        source_name = "DataFrame (Polars)"
+    else:
+        _LOGGER.error("Input must be a DataFrame or a path to a dataset.")
+        raise TypeError()
+
+    # 2. Load Map
+    try:
+        trans_map = _load_translation_mapping(mapper, direction)
+        map_keys = set(trans_map.keys())
+    except Exception as e:
+        _LOGGER.error(f"Could not load mapper. {e}")
+        return
+
+    # 3. Analyze Sets
+    matched = cols.intersection(map_keys)
+    missing_in_map = cols - map_keys
+    unused_map_keys = map_keys - cols
+
+    coverage_pct = (len(matched) / len(cols) * 100) if len(cols) > 0 else 0.0
+
+    # 4. Report
+    report_string = f"--- 🔍 Translation Audit Report: {source_name} ---\n \
+        Direction: {direction}\n \
+        Total Columns: {len(cols)}\n \
+        Map Coverage: {len(matched)} / {len(cols)} ({coverage_pct:.1f}%)\n"
+
+    if matched:
+        report_string += f"\n✅ Will Translate: {len(matched)} columns"
+
+    if missing_in_map:
+        report_string += f"\n⚠️ Not in Map: {len(missing_in_map)} columns: {list(missing_in_map)}"
+
+    if unused_map_keys:
+        report_string += f"\n➡️ Unused Map Keys: {len(unused_map_keys)}"
+
+    _LOGGER.info(report_string)
+
+
+def _load_translation_mapping(
+    source: Union[dict[str, str], str, Path],
+    direction: Literal["A_to_B", "B_to_A"]
+) -> dict[str, str]:
+    """
+    Internal helper to load mapping from Dict, JSON, or CSV and handle direction inversion.
+    """
+    raw_map: dict[str, str] = {}
+
+    # --- Load Source ---
+    if isinstance(source, dict):
+        raw_map = source.copy()
+
+    elif isinstance(source, (str, Path)):
+        path = make_fullpath(source, enforce="file")
+
+        if path.suffix.lower() == ".json":
+            with open(path, 'r', encoding='utf-8') as f:
+                content = json.load(f)
+            if not isinstance(content, dict):
+                _LOGGER.error(f"JSON file '{path.name}' does not contain a dictionary.")
+                raise ValueError()
+            raw_map = content
+
+        elif path.suffix.lower() == ".csv":
+            # Load CSV using pandas for robustness
+            try:
+                df_map = pd.read_csv(path)
+
+                # STRICT VALIDATION: Must be exactly 2 columns
+                if df_map.shape[1] != 2:
+                    _LOGGER.error(f"CSV file '{path.name}' must have exactly 2 columns for mapping. Found {df_map.shape[1]}.")
+                    raise ValueError()
+
+                key_col = df_map.columns[0]
+                val_col = df_map.columns[1]
+
+                # Convert to dictionary (drop NaNs to be safe)
+                raw_map = df_map.dropna(subset=[key_col, val_col]).set_index(key_col)[val_col].to_dict()
+
+            except Exception as e:
+                _LOGGER.error(f"Error reading CSV mapping file: {e}")
+                raise e
+        else:
+            _LOGGER.error(f"Unsupported file extension for mapping source: {path.suffix}")
+            raise ValueError()
+    else:
+        _LOGGER.error("Mapper must be a Dictionary, or a Path/String to a JSON/CSV file.")
+        raise TypeError()
+
+    # --- Handle Direction ---
+    # Case: The mapper is A->B, and DF is A. (Keys match DF). Return as is.
+    if direction == "A_to_B":
+        return raw_map
+
+    # Case: The mapper is B->A, but DF is A. (Values match DF).
+    # swap the mapper to A->B so the Keys match the DF.
+    elif direction == "B_to_A":
+        # Inversion requires unique values to be lossless
+        reversed_map = {v: k for k, v in raw_map.items()}
+
+        if len(reversed_map) < len(raw_map):
+            _LOGGER.warning("Direction 'B_to_A' resulted in fewer keys than original. Duplicate target values existed in the source map; some collisions were overwritten.")
+
+        return reversed_map
+
+    else:
+        _LOGGER.error("Direction must be 'A_to_B' or 'B_to_A'.")
+        raise ValueError()
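A usage sketch of the three new translation helpers exported from `ml_tools.utilities`; the DataFrame, mapping, and output directory are hypothetical placeholders (the directory passed to `create_translation_template` is expected to exist).

```python
import pandas as pd
from ml_tools.utilities import (
    create_translation_template,
    audit_column_translation,
    translate_dataframe_columns,
)

df = pd.DataFrame({"edad": [30, 41], "peso_kg": [70.2, 81.5]})

# 1. Dump {"edad": "", "peso_kg": ""} to translation_template.json for manual editing.
create_translation_template(df, save_path="reports")

# 2. Check coverage of a filled-in mapping without renaming anything.
mapping = {"edad": "age", "peso_kg": "weight_kg"}
audit_column_translation(df, mapper=mapping, direction="A_to_B")

# 3. Apply it (a JSON or two-column CSV path would also be accepted as the mapper).
df_en = translate_dataframe_columns(df, mapper=mapping, direction="A_to_B")
print(df_en.columns.tolist())  # ['age', 'weight_kg']
```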
Files without changes (4): WHEEL, licenses/LICENSE, licenses/LICENSE-THIRD-PARTY.md, and top_level.txt under dragon_ml_toolbox-*.dist-info.