dragon-ml-toolbox 20.4.0__py3-none-any.whl → 20.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dragon-ml-toolbox
- Version: 20.4.0
+ Version: 20.6.0
  Summary: Complete pipelines and helper tools for data science and machine learning projects.
  Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
  License-Expression: MIT
@@ -1,11 +1,11 @@
- dragon_ml_toolbox-20.4.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
- dragon_ml_toolbox-20.4.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
+ dragon_ml_toolbox-20.6.0.dist-info/licenses/LICENSE,sha256=L35WDmmLZNTlJvxF6Vy7Uy4SYNi6rCfWUqlTHpoRMoU,1081
+ dragon_ml_toolbox-20.6.0.dist-info/licenses/LICENSE-THIRD-PARTY.md,sha256=0-HBRMMgKuwtGy6nMJZvIn1fLxhx_ksyyVB2U_iyYZU,2818
  ml_tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ml_tools/constants.py,sha256=3br5Rk9cL2IUo638eJuMOGdbGQaWssaUecYEvSeRBLM,3322
- ml_tools/ETL_cleaning/__init__.py,sha256=8dsHiguUkI6Ix1759IPdGU3IXcjMz4DyaSCkdYhxxg8,490
+ ml_tools/ETL_cleaning/__init__.py,sha256=gLRHF-qzwpqKTvbbn9chIQELeUDh_XGpBRX28j-5IqI,545
  ml_tools/ETL_cleaning/_basic_clean.py,sha256=2_FhWP-xYgl8s51H3OjYb_sqsW2yX_QZ4kmyrKjbSsc,13892
- ml_tools/ETL_cleaning/_clean_tools.py,sha256=pizTBK69zHt7HpZc_bcX9KoX2loLDcyQJddf_Kl-Ldo,5129
- ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=dge7KQSO4IdeXV4pCCJCb5lhAzR8rmwZPoCscm1A9KY,10272
+ ml_tools/ETL_cleaning/_clean_tools.py,sha256=7aIC4w0CLK93E2nWC8h8YbI8bW_3Na9myD9VBMA-9zQ,9575
+ ml_tools/ETL_cleaning/_dragon_cleaner.py,sha256=WvDHtdQTQldYwRWkmr3MlqFgWPl8rrEHp6m1uqgH0ho,13291
  ml_tools/ETL_engineering/__init__.py,sha256=EVIU0skxaH4ZDk8tEkOrxhTMSSA2LI_glhIpzFSxxlg,1007
  ml_tools/ETL_engineering/_dragon_engineering.py,sha256=D-D6tmhyQ3I9-cXgxLVVbQBRTZoNsWaKPsvcTUaetws,10810
  ml_tools/ETL_engineering/_transforms.py,sha256=qOxa_vjh3gzS4IiGFqq_0Wnh0ilQO41jRiIp-6Ej4vw,47079
@@ -30,7 +30,7 @@ ml_tools/ML_chain/_update_schema.py,sha256=z1Us7lv6hy6GwSu1mcid50Jmqq3sh91hMQ0Ln
  ml_tools/ML_configuration/__init__.py,sha256=ogktFnYxz5jWJkhHS4DVaMldHkt3lT2gw9jx5PQ3d78,2755
  ml_tools/ML_configuration/_base_model_config.py,sha256=95L3IfobNFMtnNr79zYpDGerC1q1v7M05tWZvTS2cwE,2247
  ml_tools/ML_configuration/_finalize.py,sha256=l_n13bLu0avMdJ8hNRrH8V_wOBQZM1UGsTydKBkTysM,15047
- ml_tools/ML_configuration/_metrics.py,sha256=PqBGPO1Y_6ImmYI3TEBJhzipULE854vbvE0AbP5m8zQ,22888
+ ml_tools/ML_configuration/_metrics.py,sha256=xKtEKzphtidwwU8UuUpGv4B8Y6Bv0tAOjEFUYfz8Ehc,23758
  ml_tools/ML_configuration/_models.py,sha256=lvuuqvD6DWUzOa3i06NZfrdfOi9bu2e26T_QO6BGMSw,7629
  ml_tools/ML_configuration/_training.py,sha256=_M_TwouHFNbGrZQtQNAvyG_poSVpmN99cbyUonZsHhk,8969
  ml_tools/ML_datasetmaster/__init__.py,sha256=UltQzuXnlXVCkD-aeA5TW4IcMVLnQf1_aglawg4WyrI,580
@@ -39,7 +39,7 @@ ml_tools/ML_datasetmaster/_datasetmaster.py,sha256=Oy2UE3YJpKTaFwQF5TkQLgLB54-BF
  ml_tools/ML_datasetmaster/_sequence_datasetmaster.py,sha256=cW3fuILZWs-7Yuo4T2fgGfTC4vwho3Gp4ohIKJYS7O0,18452
  ml_tools/ML_datasetmaster/_vision_datasetmaster.py,sha256=kvSqXYeNBN1JSRfSEEXYeIcsqy9HsJAl_EwFWClqlsw,67025
  ml_tools/ML_evaluation/__init__.py,sha256=e3c8JNP0tt4Kxc7QSQpGcOgrxf8JAucH4UkJvJxUL2E,1122
- ml_tools/ML_evaluation/_classification.py,sha256=xXCh87RE9_VXYalc7l6CbakYfB0rijGrY76RZIrqLBk,28922
+ ml_tools/ML_evaluation/_classification.py,sha256=8bKQejKrgMipnxU1T12ted7p60xvJS0d0MvHtdNBCBM,30971
  ml_tools/ML_evaluation/_feature_importance.py,sha256=mTwi3LKom_axu6UFKunELj30APDdhG9GQC2w7I9mYhI,17137
  ml_tools/ML_evaluation/_loss.py,sha256=1a4O25i3Ya_3naNZNL7ELLUL46BY86g1scA7d7q2UFM,3625
  ml_tools/ML_evaluation/_regression.py,sha256=hnT2B2_6AnQ7aA7uk-X2lZL9G5JFGCduDXyZbr1gFCA,11037
@@ -76,7 +76,7 @@ ml_tools/ML_models_vision/_image_classification.py,sha256=miwMNoTXpmmZSiqeXvDKpx
  ml_tools/ML_models_vision/_image_segmentation.py,sha256=NRjn91bDD2OJWSJFrrNW9s41qgg5w7pw68Q61-kg-As,4157
  ml_tools/ML_models_vision/_object_detection.py,sha256=AOGER5bx0REc-FfBtspJmyLJxn3GdwDSPwFGveobR94,5608
  ml_tools/ML_optimization/__init__.py,sha256=No18Dsw6Q9zPt8B9fpG0bWomuXmwDC7DiokiaPuwmRI,485
- ml_tools/ML_optimization/_multi_dragon.py,sha256=R0G91Y-TK49coCE0NAZdQuEqI0kTEaKuIuZ6QGE99lg,38525
+ ml_tools/ML_optimization/_multi_dragon.py,sha256=zQhDxFY8FNxUlcbSnHMVArfojzYjgNa21jSE3pJmRW0,38956
  ml_tools/ML_optimization/_single_dragon.py,sha256=jh5-SK6NKAzbheQhquiYoROozk-RzUv1jiFkIzK_AFg,7288
  ml_tools/ML_optimization/_single_manual.py,sha256=h-_k9JmRqPkjTra1nu7AyYbSyWkYZ1R3utiNmW06WFs,21809
  ml_tools/ML_scaler/_ML_scaler.py,sha256=P75X0Sx8N-VxC2Qy8aG7mWaZlkTfjspiZDi1YiMQD1I,8872
@@ -103,12 +103,12 @@ ml_tools/_core/__init__.py,sha256=m-VP0RW0tOTm9N5NI3kFNcpM7WtVgs0RK9pK3ZJRZQQ,14
  ml_tools/_core/_logger.py,sha256=xzhn_FouMDRVNwXGBGlPC9Ruq6i5uCrmNaS5jesguMU,4972
  ml_tools/_core/_schema_load_ops.py,sha256=KLs9vBzANz5ESe2wlP-C41N4VlgGil-ywcfvWKSOGss,1551
  ml_tools/_core/_script_info.py,sha256=LtFGt10gEvCnhIRMKJPi2yXkiGLcdr7lE-oIP2XGHzQ,234
- ml_tools/data_exploration/__init__.py,sha256=ahCjELrum2aIj_cLK-sdGbJjTvvolf3US_oaB97rOQg,1736
+ ml_tools/data_exploration/__init__.py,sha256=nYKg1bPBgXibC5nhmNKPw3VaKFeVtlNGL_YpHixW-Pg,1795
  ml_tools/data_exploration/_analysis.py,sha256=H6LryV56FFCHWjvQdkhZbtprZy6aP8EqU_hC2Cf9CLE,7832
  ml_tools/data_exploration/_cleaning.py,sha256=pAZOXgGK35j7O8q6cnyTwYK1GLNnD04A8p2fSyMB1mg,20906
  ml_tools/data_exploration/_features.py,sha256=wW-M8n2aLIy05DR2z4fI8wjpPjn3mOAnm9aSGYbMKwI,23363
  ml_tools/data_exploration/_plotting.py,sha256=zH1dPcIoAlOuww23xIoBCsQOAshPPv9OyGposOA2RvI,19883
- ml_tools/data_exploration/_schema_ops.py,sha256=PoFeHaS9dXI9gfL0SRD-8uSP4owqmbQFbtfA-HxkLnY,7108
+ ml_tools/data_exploration/_schema_ops.py,sha256=Fd6fBGGv4OpxmJ1HG9pith6QL90z0tzssCvzkQxlEEQ,11083
  ml_tools/ensemble_evaluation/__init__.py,sha256=t4Gr8EGEk8RLatyc92-S0BzbQvdvodzoF-qDAH2qjVg,546
  ml_tools/ensemble_evaluation/_ensemble_evaluation.py,sha256=-sX9cLMaa0FOQDikmVv2lsCYtQ56Kftd3tILnNej0Hg,28346
  ml_tools/ensemble_inference/__init__.py,sha256=VMX-Kata2V0UmiURIU2jx6mRuZmvTWf-QXzCpHmVGZA,255
@@ -118,7 +118,7 @@ ml_tools/ensemble_learning/_ensemble_learning.py,sha256=MHDZBR20_nStlSSeThFI3bSu
  ml_tools/excel_handler/__init__.py,sha256=AaWM3n_dqBhJLTs3OEA57ex5YykKXNOwVCyHlVsdnqI,530
  ml_tools/excel_handler/_excel_handler.py,sha256=TODudmeQgDSdxUKzLfAzizs--VL-g8WxDOfQ4sgxxLs,13965
  ml_tools/keys/__init__.py,sha256=-0c2pmrhyfROc-oQpEjJGLBMhSagA3CyFijQaaqZRqU,399
- ml_tools/keys/_keys.py,sha256=kBcW3euNmD57_4aoRaAeqJP3FtU3iSuvgYv-BZqnEWw,9290
+ ml_tools/keys/_keys.py,sha256=lL9NlijxOEAhfDPPqK_wL3QhjalrYK_fWM-KNniSIOA,9308
  ml_tools/math_utilities/__init__.py,sha256=K7Obkkc4rPKj4EbRZf1BsXHfiCg7FXYv_aN9Yc2Z_Vg,400
  ml_tools/math_utilities/_math_utilities.py,sha256=BYHIVcM9tuKIhVrkgLLiM5QalJ39zx7dXYy_M9aGgiM,9012
  ml_tools/optimization_tools/__init__.py,sha256=KD8JXpfGuPndO4AHnjJGu6uV1GRwhOfboD0KZV45kzw,658
@@ -130,14 +130,14 @@ ml_tools/path_manager/_path_tools.py,sha256=LcZE31QlkzZWUR8g1MW_N_mPY2DpKBJLA45V
  ml_tools/plot_fonts/__init__.py,sha256=KIxXRCjQ3SliEoLhEcqs7zDVZbVTn38bmSdL-yR1Q2w,187
  ml_tools/plot_fonts/_plot_fonts.py,sha256=mfjXNT9P59ymHoTI85Q8CcvfxfK5BIFBWtTZH-hNIC4,2209
  ml_tools/schema/__init__.py,sha256=K6uiZ9f0GCQ7etw1yl2-dQVLhU7RkL3KHesO3HNX6v4,334
- ml_tools/schema/_feature_schema.py,sha256=aVY3AJt1j4D2mtusVy2l6lDR2SYzPMyfvG1o9zOn0Kw,8585
+ ml_tools/schema/_feature_schema.py,sha256=MuPf6Nf7tDhUTGyX7tcFHZh-lLSNsJkLmlf9IxdF4O4,9660
  ml_tools/schema/_gui_schema.py,sha256=IVwN4THAdFrvh2TpV4SFd_zlzMX3eioF-w-qcSVTndE,7245
  ml_tools/serde/__init__.py,sha256=IDirr8i-qjUHB71hmHO6lGiODhUoOnUcXYrvb_XgrzE,292
  ml_tools/serde/_serde.py,sha256=8QnYK8ZG21zdNaC0v63iSz2bhgwOKRKAWxTVQvMV0A8,5525
  ml_tools/utilities/__init__.py,sha256=iQb-S5JesEjGGI8983Vkj-14LCtchFxdWRhaziyvnoY,808
  ml_tools/utilities/_utility_save_load.py,sha256=EFvFaTaHahDQWdJWZr-j7cHqRbG_Xrpc96228JhV-bs,16773
  ml_tools/utilities/_utility_tools.py,sha256=bN0J9d1S0W5wNzNntBWqDsJcEAK7-1OgQg3X2fwXns0,6918
- dragon_ml_toolbox-20.4.0.dist-info/METADATA,sha256=5r7luC7aniRGcoQ5qy94fFLwme7UldbcfXFI-m_6hlA,7866
- dragon_ml_toolbox-20.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dragon_ml_toolbox-20.4.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
- dragon_ml_toolbox-20.4.0.dist-info/RECORD,,
+ dragon_ml_toolbox-20.6.0.dist-info/METADATA,sha256=HfSazpvNdCk-0TW27NgJuerpBdsrzGhmmUnO3g1FMe4,7866
+ dragon_ml_toolbox-20.6.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dragon_ml_toolbox-20.6.0.dist-info/top_level.txt,sha256=wm-oxax3ciyez6VoO4zsFd-gSok2VipYXnbg3TH9PtU,9
+ dragon_ml_toolbox-20.6.0.dist-info/RECORD,,
ml_tools/ETL_cleaning/__init__.py CHANGED
@@ -10,7 +10,8 @@ from ._dragon_cleaner import (
  )

  from ._clean_tools import (
- save_unique_values
+ save_unique_values,
+ save_category_counts,
  )

  from .._core import _imprimir_disponibles
@@ -20,6 +21,7 @@ __all__ = [
  "DragonColumnCleaner",
  "DragonDataFrameCleaner",
  "save_unique_values",
+ "save_category_counts",
  "basic_clean",
  "basic_clean_drop",
  "drop_macro_polars",
ml_tools/ETL_cleaning/_clean_tools.py CHANGED
@@ -13,6 +13,7 @@ _LOGGER = get_logger("ETL Clean Tools")

  __all__ = [
  "save_unique_values",
+ "save_category_counts",
  ]


@@ -126,3 +127,111 @@ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
  counter += 1

  _LOGGER.info(f"{counter} files of unique values created.")
+
+
+ ################ Category Counts per column #################
+ def save_category_counts(csv_path_or_df: Union[str, Path, pl.DataFrame],
+ output_dir: Union[str, Path],
+ use_columns: Optional[list[str]] = None,
+ verbose: bool = False,
+ keep_column_order: bool = True) -> None:
+ """
+ Calculates the frequency and percentage of each unique value in the specified columns
+ and saves the distribution report to a text file.
+
+ Useful for checking class balance or identifying rare categories.
+
+ Args:
+ csv_path_or_df (str | Path | pl.DataFrame):
+ The file path to the input CSV file or a Polars DataFrame.
+ output_dir (str | Path):
+ The directory where the report files will be saved.
+ use_columns (List[str] | None):
+ Columns to analyze. If None, all columns are processed.
+ verbose (bool):
+ If True, prints progress info.
+ keep_column_order (bool):
+ If True, prepends a numeric prefix to filenames to maintain order.
+ """
+ # 1. Handle Input
+ if isinstance(csv_path_or_df, pl.DataFrame):
+ df = csv_path_or_df
+ if use_columns:
+ valid_cols = [c for c in use_columns if c in df.columns]
+ if not valid_cols:
+ _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+ raise ValueError()
+ df = df.select(valid_cols)
+ else:
+ csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+ df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+ output_path = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+ total_rows = df.height
+
+ if total_rows == 0:
+ _LOGGER.warning("Input DataFrame is empty. No counts to save.")
+ return
+
+ counter = 0
+
+ # 2. Process Each Column
+ for i, col_name in enumerate(df.columns):
+ try:
+ # Group by, count, and calculate percentage
+ # We treat nulls as a category here to see missing data frequency
+ stats = (
+ df.select(pl.col(col_name))
+ .group_by(col_name, maintain_order=False)
+ .len(name="count")
+ .with_columns(
+ (pl.col("count") / total_rows * 100).alias("pct")
+ )
+ .sort("count", descending=True)
+ )
+
+ # Collect to python list of dicts for writing
+ rows = stats.iter_rows(named=True)
+ unique_count = stats.height
+
+ # Check thresholds for warning
+ is_high_cardinality = (unique_count > 300) or ((unique_count / total_rows) > 0.5)
+
+ except Exception:
+ _LOGGER.error(f"Could not calculate counts for column '{col_name}'.")
+ continue
+
+ # 3. Write to File
+ sanitized_name = sanitize_filename(col_name)
+ if not sanitized_name.strip('_'):
+ sanitized_name = f'column_{i}'
+
+ prefix = f"{i + 1}_" if keep_column_order else ''
+ file_path = output_path / f"{prefix}{sanitized_name}_counts.txt"
+
+ try:
+ with open(file_path, 'w', encoding='utf-8') as f:
+ f.write(f"# Distribution for column: '{col_name}'\n")
+ f.write(f"# Total Rows: {total_rows} | Unique Values: {unique_count}\n")
+
+ if is_high_cardinality:
+ f.write(f"# WARNING: High cardinality detected (Unique/Total ratio: {unique_count/total_rows:.2%}).\n")
+
+ f.write("-" * 65 + "\n")
+ f.write(f"{'Count':<10} | {'Percentage':<12} | {'Value'}\n")
+ f.write("-" * 65 + "\n")
+
+ for row in rows:
+ val = str(row[col_name])
+ count = row["count"]
+ pct = row["pct"]
+ f.write(f"{count:<10} | {pct:>10.2f}% | {val}\n")
+
+ except IOError:
+ _LOGGER.exception(f"Error writing to file {file_path}.")
+ else:
+ if verbose:
+ print(f" Saved distribution for '{col_name}'.")
+ counter += 1
+
+ _LOGGER.info(f"{counter} distribution files created.")
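The new `save_category_counts` helper mirrors `save_unique_values` but reports per-column value frequencies and percentages, which is useful for spotting class imbalance before training. A minimal usage sketch (the toy DataFrame and the output folder name below are invented for illustration; the import path follows the package's `ml_tools.ETL_cleaning` exports shown above):

import polars as pl
from ml_tools.ETL_cleaning import save_category_counts

# Toy data: a categorical column with an imbalanced distribution and a null.
df = pl.DataFrame({
    "material": ["Al", "Al", "Fe", "Al", "Cu", None],
    "batch": ["A", "B", "A", "A", "B", "B"],
})

# Writes one "<n>_<column>_counts.txt" report per column; nulls are counted as
# their own category and high-cardinality columns get a warning header.
save_category_counts(
    csv_path_or_df=df,
    output_dir="reports/category_counts",  # hypothetical output folder
    use_columns=["material", "batch"],
    verbose=True,
    keep_column_order=True,
)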
ml_tools/ETL_cleaning/_dragon_cleaner.py CHANGED
@@ -1,13 +1,13 @@
  import polars as pl
  from pathlib import Path
- from typing import Union
+ from typing import Union, Optional

  from ..utilities import save_dataframe_filename, load_dataframe

  from .._core import get_logger
  from ..path_manager import make_fullpath

- from ._clean_tools import save_unique_values
+ from ._clean_tools import save_unique_values, save_category_counts


  _LOGGER = get_logger("DragonCleaner")
@@ -33,12 +33,18 @@ class DragonColumnCleaner:
  """
  def __init__(self,
  column_name: str,
- rules: Union[dict[str, Union[str, None]], dict[str, str]],
+ exact_matches: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
+ rules: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
  case_insensitive: bool = False):
  """
  Args:
  column_name (str):
  The name of the column to be cleaned.
+ exact_matches (Dict[str, str | None]):
+ A dictionary of EXACT string matches to replacement strings.
+ - Uses a hash map, which is significantly faster than regex.
+ - Used for simple 1-to-1 mappings (e.g., {'Aluminum': 'Al'}).
+ - Runs BEFORE the regex rules.
  rules (Dict[str, str | None]):
  A dictionary of regex patterns to replacement strings.
  - Replacement can be None to indicate that matching values should be converted to null.
@@ -61,25 +67,47 @@ class DragonColumnCleaner:
  if not isinstance(column_name, str) or not column_name:
  _LOGGER.error("The 'column_name' must be a non-empty string.")
  raise TypeError()
- if not isinstance(rules, dict):
- _LOGGER.error("The 'rules' argument must be a dictionary.")
- raise TypeError()
- # validate rules
- for pattern, replacement in rules.items():
- if not isinstance(pattern, str):
- _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+
+ # Validate Regex Rules
+ if rules is not None:
+ if not isinstance(rules, dict):
+ _LOGGER.error("The 'rules' argument must be a dictionary.")
  raise TypeError()
- if replacement is not None and not isinstance(replacement, str):
- _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+ for pattern, replacement in rules.items():
+ if not isinstance(pattern, str):
+ _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+ raise TypeError()
+ if replacement is not None and not isinstance(replacement, str):
+ _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+ raise TypeError()
+
+ # Validate Exact Matches
+ if exact_matches is not None:
+ if not isinstance(exact_matches, dict):
+ _LOGGER.error("The 'exact_matches' argument must be a dictionary.")
  raise TypeError()
+ for key, val in exact_matches.items():
+ if not isinstance(key, str):
+ _LOGGER.error("All keys in 'exact_matches' must be strings.")
+ raise TypeError()
+ if val is not None and not isinstance(val, str):
+ _LOGGER.error("All values in 'exact_matches' must be strings or None.")
+ raise TypeError()
+
+ # Raise if both are None or empty
+ if not rules and not exact_matches:
+ _LOGGER.error("At least one of 'rules' or 'exact_matches' must be provided.")
+ raise ValueError()

  self.column_name = column_name
- self.rules = rules
+ self.rules = rules if rules else {}
+ self.exact_matches = exact_matches if exact_matches else {}
  self.case_insensitive = case_insensitive

  def preview(self,
  csv_path: Union[str, Path],
  report_dir: Union[str, Path],
+ show_distribution: bool = True,
  add_value_separator: bool=False,
  rule_batch_size: int = 150):
  """
@@ -90,6 +118,8 @@ class DragonColumnCleaner:
  The path to the CSV file containing the data to clean.
  report_dir (str | Path):
  The directory where the preview report will be saved.
+ show_distribution (bool):
+ If True, generates a category count report for the column after cleaning.
  add_value_separator (bool):
  If True, adds a separator line between each unique value in the report.
  rule_batch_size (int):
@@ -101,13 +131,21 @@ class DragonColumnCleaner:
  preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
  df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)

- # Apply cleaning rules to a copy of the column for preview
+ # Apply cleaning rules and save reports
  save_unique_values(csv_path_or_df=df_preview,
  output_dir=report_dir,
  use_columns=[self.column_name],
  verbose=False,
  keep_column_order=False,
  add_value_separator=add_value_separator)
+
+ # Optionally save category counts
+ if show_distribution:
+ save_category_counts(csv_path_or_df=df_preview,
+ output_dir=report_dir,
+ use_columns=[self.column_name],
+ verbose=False,
+ keep_column_order=False)


  class DragonDataFrameCleaner:
@@ -181,16 +219,23 @@ class DragonDataFrameCleaner:
  for cleaner in self.cleaners:
  col_name = cleaner.column_name

- # Get all rules as a list of items
+ # Start expression for this batch
+ col_expr = pl.col(col_name).cast(pl.String)
+
+ # --- PHASE 1: EXACT MATCHES ---
+ # Apply dictionary-based replacement first (faster than regex)
+ if cleaner.exact_matches:
+ # 'replace' handles dictionary mapping safely. If value is mapped to None, it becomes null.
+ col_expr = col_expr.replace(cleaner.exact_matches)
+
+ # --- PHASE 2: REGEX PATTERNS ---
  all_rules = list(cleaner.rules.items())

  # Process in batches of 'rule_batch_size'
  for i in range(0, len(all_rules), rule_batch_size):
  rule_batch = all_rules[i : i + rule_batch_size]

- # Start expression for this batch
- col_expr = pl.col(col_name).cast(pl.String)
-
+ # continue chaining operations on the same col_expr
  for pattern, replacement in rule_batch:
  final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern

@@ -202,6 +247,15 @@ class DragonDataFrameCleaner:
  col_expr = col_expr.str.replace_all(final_pattern, replacement)

  # Apply this batch of rules to the LazyFrame
+ # apply partially here to keep the logical plan size under control
+ final_lf = final_lf.with_columns(col_expr.alias(col_name))
+
+ # Reset col_expr for the next batch, but pointing to the 'new' column
+ # This ensures the next batch works on the result of the previous batch
+ col_expr = pl.col(col_name)
+
+ # If we had exact matches but NO regex rules, we still need to apply the expression once
+ if cleaner.exact_matches and not all_rules:
  final_lf = final_lf.with_columns(col_expr.alias(col_name))

  # 3. Collect Results
@@ -242,4 +296,3 @@ class DragonDataFrameCleaner:
  save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)

  return None
-
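To illustrate the new two-phase cleaning (exact hash-map matches first, regex rules second), here is a hedged sketch of how a cleaner might be configured. The column name, mappings, and patterns are invented for illustration, and it assumes `DragonDataFrameCleaner.clean()` accepts a Polars DataFrame, as the `preview()` path above suggests:

import polars as pl
from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

cleaner = DragonColumnCleaner(
    column_name="material",
    # Phase 1: exact, dictionary-based lookups (no regex cost); None nullifies the value.
    exact_matches={"Aluminum": "Al", "aluminium": "Al", "N/A": None},
    # Phase 2: regex rules, applied afterwards in batches.
    rules={r"\s+": " ", r"^unknown.*$": None},
    case_insensitive=True,
)

df = pl.DataFrame({"material": ["Aluminum", "  iron ", "N/A", "UNKNOWN alloy"]})

pipeline = DragonDataFrameCleaner(cleaners=[cleaner])
df_clean = pipeline.clean(df, rule_batch_size=150)
print(df_clean)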
ml_tools/ML_configuration/_metrics.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Union
+ from typing import Union, Literal


  __all__ = [
@@ -26,7 +26,7 @@ class _BaseClassificationFormat:
  def __init__(self,
  cmap: str="BuGn",
  ROC_PR_line: str='darkorange',
- calibration_bins: int=15,
+ calibration_bins: Union[int, Literal['auto']]='auto',
  xtick_size: int=22,
  ytick_size: int=22,
  legend_size: int=26,
@@ -46,8 +46,8 @@ class _BaseClassificationFormat:
  - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
  - Hex codes: '#FF6347', '#4682B4'

- calibration_bins (int): The number of bins to use when
- creating the calibration (reliability) plot.
+ calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plot. If 'auto', the number will be dynamically determined based on the number of samples.
+ - Typical int values: 10, 15, 20

  font_size (int): The base font size to apply to the plots.

@@ -97,6 +97,7 @@ class _BaseMultiLabelFormat:
  def __init__(self,
  cmap: str = "BuGn",
  ROC_PR_line: str='darkorange',
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int = 25,
  xtick_size: int=20,
  ytick_size: int=20,
@@ -115,6 +116,9 @@ class _BaseMultiLabelFormat:
  - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
  - Hex codes: '#FF6347', '#4682B4'

+ calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plots for each label. If 'auto', the number will be dynamically determined based on the number of samples.
+ - Typical int values: 10, 15, 20
+
  font_size (int): The base font size to apply to the plots.

  xtick_size (int): Font size for x-axis tick labels.
@@ -133,6 +137,7 @@ class _BaseMultiLabelFormat:
  """
  self.cmap = cmap
  self.ROC_PR_line = ROC_PR_line
+ self.calibration_bins = calibration_bins
  self.font_size = font_size
  self.xtick_size = xtick_size
  self.ytick_size = ytick_size
@@ -142,6 +147,7 @@ class _BaseMultiLabelFormat:
  parts = [
  f"cmap='{self.cmap}'",
  f"ROC_PR_line='{self.ROC_PR_line}'",
+ f"calibration_bins={self.calibration_bins}",
  f"font_size={self.font_size}",
  f"xtick_size={self.xtick_size}",
  f"ytick_size={self.ytick_size}",
@@ -416,7 +422,7 @@ class FormatBinaryClassificationMetrics(_BaseClassificationFormat):
  def __init__(self,
  cmap: str="BuGn",
  ROC_PR_line: str='darkorange',
- calibration_bins: int=15,
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int=26,
  xtick_size: int=22,
  ytick_size: int=22,
@@ -440,7 +446,7 @@ class FormatMultiClassClassificationMetrics(_BaseClassificationFormat):
  def __init__(self,
  cmap: str="BuGn",
  ROC_PR_line: str='darkorange',
- calibration_bins: int=15,
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int=26,
  xtick_size: int=22,
  ytick_size: int=22,
@@ -464,7 +470,7 @@ class FormatBinaryImageClassificationMetrics(_BaseClassificationFormat):
  def __init__(self,
  cmap: str="BuGn",
  ROC_PR_line: str='darkorange',
- calibration_bins: int=15,
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int=26,
  xtick_size: int=22,
  ytick_size: int=22,
@@ -488,7 +494,7 @@ class FormatMultiClassImageClassificationMetrics(_BaseClassificationFormat):
  def __init__(self,
  cmap: str="BuGn",
  ROC_PR_line: str='darkorange',
- calibration_bins: int=15,
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int=26,
  xtick_size: int=22,
  ytick_size: int=22,
@@ -513,6 +519,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
  def __init__(self,
  cmap: str = "BuGn",
  ROC_PR_line: str='darkorange',
+ calibration_bins: Union[int, Literal['auto']]='auto',
  font_size: int = 25,
  xtick_size: int=20,
  ytick_size: int=20,
@@ -520,6 +527,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
  ) -> None:
  super().__init__(cmap=cmap,
  ROC_PR_line=ROC_PR_line,
+ calibration_bins=calibration_bins,
  font_size=font_size,
  xtick_size=xtick_size,
  ytick_size=ytick_size,
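With the default switched from 15 bins to 'auto', a configuration object can either pin an explicit bin count or defer to the sample-size heuristic. A hedged sketch; the import path below assumes these classes are re-exported from `ml_tools.ML_configuration` (they are defined in the private `_metrics` module shown above):

from ml_tools.ML_configuration import (  # assumed re-export location
    FormatBinaryClassificationMetrics,
    FormatMultiLabelBinaryClassificationMetrics,
)

# Defer bin selection to the sample-size heuristic ('auto' is now the default).
auto_fmt = FormatBinaryClassificationMetrics(calibration_bins="auto")

# Or pin an explicit bin count, e.g. to compare runs on an equal footing.
fixed_fmt = FormatBinaryClassificationMetrics(calibration_bins=15)

# Multi-label formats now accept (and forward) the same parameter.
multi_fmt = FormatMultiLabelBinaryClassificationMetrics(calibration_bins=10)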
ml_tools/ML_evaluation/_classification.py CHANGED
@@ -2,7 +2,7 @@ import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt
  import seaborn as sns
- from sklearn.calibration import CalibrationDisplay
+ from sklearn.calibration import calibration_curve
  from sklearn.metrics import (
  classification_report,
  ConfusionMatrixDisplay,
@@ -378,42 +378,42 @@ def classification_metrics(save_dir: Union[str, Path],

  # --- Save Calibration Plot ---
  fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+ user_chosen_bins = format_config.calibration_bins
+
+ # --- Automate Bin Selection ---
+ if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+ # Determine bins based on number of samples
+ n_samples = y_true.shape[0]
+ if n_samples < 200:
+ dynamic_bins = 5
+ elif n_samples < 1000:
+ dynamic_bins = 10
+ else:
+ dynamic_bins = 15
+ else:
+ dynamic_bins = user_chosen_bins
+
+ # --- Step 1: Get binned data directly ---
+ # calculates reliability diagram data without needing a temporary plot
+ prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)

- # --- Step 1: Get binned data *without* plotting ---
- with plt.ioff(): # Suppress showing the temporary plot
- fig_temp, ax_temp = plt.subplots()
- cal_display_temp = CalibrationDisplay.from_predictions(
- y_true_binary, # Use binarized labels
- y_score,
- n_bins=format_config.calibration_bins,
- ax=ax_temp,
- name="temp" # Add a name to suppress potential warnings
- )
- # Get the x, y coordinates of the binned data
- line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
- plt.close(fig_temp) # Close the temporary plot
-
- # --- Step 2: Build the plot from scratch ---
+ # --- Step 2: Plot ---
  ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')

- sns.regplot(
- x=line_x,
- y=line_y,
- ax=ax_cal,
- scatter=False,
- label=f"Model calibration",
- line_kws={
- 'color': format_config.ROC_PR_line,
- 'linestyle': '--',
- 'linewidth': 2,
- }
- )
+ # Plot the actual calibration curve (connect points with a line)
+ ax_cal.plot(prob_pred,
+ prob_true,
+ marker='o', # Add markers to see bin locations
+ linewidth=2,
+ label="Model calibration",
+ color=format_config.ROC_PR_line)

  ax_cal.set_title(f'Reliability Curve{plot_title}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size + 2)
  ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
  ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)

- # --- Step 3: Set final limits *after* plotting ---
+ # --- Step 3: Set final limits ---
  ax_cal.set_ylim(0.0, 1.0)
  ax_cal.set_xlim(0.0, 1.0)

@@ -428,7 +428,7 @@ def classification_metrics(save_dir: Union[str, Path],
  cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
  plt.savefig(cal_path)
  plt.close(fig_cal)
-
+
  _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")


@@ -632,6 +632,52 @@ def multi_label_classification_metrics(
  pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
  plt.savefig(pr_path)
  plt.close(fig_pr)
+
+ # --- Save Calibration Plot (New Feature) ---
+ fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+ user_chosen_bins = format_config.calibration_bins
+
+ # --- Automate Bin Selection ---
+ if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+ # Determine bins based on number of samples
+ n_samples = y_true.shape[0]
+ if n_samples < 200:
+ dynamic_bins = 5
+ elif n_samples < 1000:
+ dynamic_bins = 10
+ else:
+ dynamic_bins = 15
+ else:
+ dynamic_bins = user_chosen_bins
+
+ # Calculate calibration curve for this specific label
+ prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
+
+ ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+ ax_cal.plot(prob_pred,
+ prob_true,
+ marker='o',
+ linewidth=2,
+ label=f"Calibration for '{name}'",
+ color=format_config.ROC_PR_line)
+
+ ax_cal.set_title(f'Reliability Curve for "{name}"', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
+ ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+ ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+
+ ax_cal.set_ylim(0.0, 1.0)
+ ax_cal.set_xlim(0.0, 1.0)
+
+ ax_cal.tick_params(axis='x', labelsize=xtick_size)
+ ax_cal.tick_params(axis='y', labelsize=ytick_size)
+ ax_cal.legend(loc='lower right', fontsize=legend_size)
+ ax_cal.grid(True)
+
+ plt.tight_layout()
+ cal_path = save_dir_path / f"calibration_plot_{sanitized_name}.svg"
+ plt.savefig(cal_path)
+ plt.close(fig_cal)

  _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")

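The swap from `CalibrationDisplay.from_predictions` to `sklearn.calibration.calibration_curve`, combined with the sample-size bin heuristic, can be reproduced standalone. A minimal sketch with synthetic data (this is not the package's plotting code; the 200/1000 thresholds and 5/10/15 bin counts are copied from the hunk above, everything else is illustrative):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

rng = np.random.default_rng(0)
n_samples = 500
y_true = rng.integers(0, 2, size=n_samples)
# Noisy scores loosely correlated with the labels, clipped into [0, 1].
y_score = np.clip(y_true * 0.6 + rng.normal(0.2, 0.25, size=n_samples), 0.0, 1.0)

# Same heuristic as above: smaller samples get fewer, wider bins.
if n_samples < 200:
    n_bins = 5
elif n_samples < 1000:
    n_bins = 10
else:
    n_bins = 15

# Reliability-diagram data, no temporary figure needed.
prob_true, prob_pred = calibration_curve(y_true, y_score, n_bins=n_bins)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], "k--", label="Perfectly calibrated")
ax.plot(prob_pred, prob_true, marker="o", linewidth=2, label="Model calibration")
ax.set_xlabel("Mean Predicted Probability")
ax.set_ylabel("Fraction of Positives")
ax.legend(loc="lower right")
fig.savefig("calibration_plot_example.svg")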
ml_tools/ML_optimization/_multi_dragon.py CHANGED
@@ -170,9 +170,13 @@ class DragonParetoOptimizer:
  re_evaluate=False # model is deterministic
  )

- def run(self) -> pd.DataFrame:
+ def run(self,
+ plots_and_log: bool=True) -> pd.DataFrame:
  """
  Execute the optimization with progress tracking and periodic logging.
+
+ Args:
+ plots_and_log (bool): If True, generates plots and logs during optimization. Disable for multi-run scenarios.

  Returns:
  pd.DataFrame: A DataFrame containing the non-dominated solutions (Pareto Front).
@@ -189,9 +193,10 @@ class DragonParetoOptimizer:
  _LOGGER.info(f"🧬 Starting NSGA-II (GeneticAlgorithm) for {generations} generations...")

  # Initialize log file
- with open(log_file, "w") as f:
- f.write(f"Pareto Optimization Log - {generations} Generations\n")
- f.write("=" * 60 + "\n")
+ if plots_and_log:
+ with open(log_file, "w") as f:
+ f.write(f"Pareto Optimization Log - {generations} Generations\n")
+ f.write("=" * 60 + "\n")

  # History tracking for visualization
  history_records = []
@@ -201,43 +206,44 @@ class DragonParetoOptimizer:
  for gen in range(1, generations + 1):
  self.algorithm.step()

- # Capture stats for history (every generation for smooth plots)
- current_evals = self.algorithm.population.evals.clone() # type: ignore
-
- gen_stats = {}
- for i, target_name in enumerate(self.ordered_target_names):
- vals = current_evals[:, i]
- v_mean = float(vals.mean())
- v_min = float(vals.min())
- v_max = float(vals.max())
-
- # Store for plotting
- history_records.append({
- "Generation": gen,
- "Target": target_name,
- "Mean": v_mean,
- "Min": v_min,
- "Max": v_max
- })
-
- gen_stats[target_name] = (v_mean, v_min, v_max)
-
- # Periodic Logging of Population Stats to FILE
- if gen % log_interval == 0 or gen == generations:
- stats_msg = [f"Gen {gen}:"]
- for t_name, (v_mean, v_min, v_max) in gen_stats.items():
- stats_msg.append(f"{t_name}: {v_mean:.3f} (Range: {v_min:.3f}-{v_max:.3f})")
+ if plots_and_log:
+ # Capture stats for history (every generation for smooth plots)
+ current_evals = self.algorithm.population.evals.clone() # type: ignore

- log_line = " | ".join(stats_msg)
+ gen_stats = {}
+ for i, target_name in enumerate(self.ordered_target_names):
+ vals = current_evals[:, i]
+ v_mean = float(vals.mean())
+ v_min = float(vals.min())
+ v_max = float(vals.max())
+
+ # Store for plotting
+ history_records.append({
+ "Generation": gen,
+ "Target": target_name,
+ "Mean": v_mean,
+ "Min": v_min,
+ "Max": v_max
+ })
+
+ gen_stats[target_name] = (v_mean, v_min, v_max)

- # Write to file
- with open(log_file, "a") as f:
- f.write(log_line + "\n")
+ # Periodic Logging of Population Stats to FILE
+ if gen % log_interval == 0 or gen == generations:
+ stats_msg = [f"Gen {gen}:"]
+ for t_name, (v_mean, v_min, v_max) in gen_stats.items():
+ stats_msg.append(f"{t_name}: {v_mean:.3f} (Range: {v_min:.3f}-{v_max:.3f})")
+
+ log_line = " | ".join(stats_msg)
+
+ # Write to file
+ with open(log_file, "a") as f:
+ f.write(log_line + "\n")

  pbar.update(1)

  # --- Post-Optimization Visualization ---
- if history_records:
+ if plots_and_log and history_records:
  _LOGGER.debug("Generating optimization history plots...")
  history_df = pd.DataFrame(history_records)
  self._plot_optimization_history(history_df, save_path)
@@ -308,7 +314,8 @@ class DragonParetoOptimizer:
  _LOGGER.info(f"Optimization complete. Found {len(pareto_df)} non-dominated solutions.")

  # --- Plotting ---
- self._generate_plots(pareto_df, save_path)
+ if plots_and_log:
+ self._generate_plots(pareto_df, save_path)

  return pareto_df

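The new `plots_and_log` flag exists so that repeated optimizations do not each write a history log and a full set of plots. A hedged sketch of a multi-run wrapper; it relies only on `run()` returning a DataFrame of non-dominated solutions, and takes a user-supplied factory because the `DragonParetoOptimizer` constructor is not part of this diff:

import pandas as pd

def run_repeated(make_optimizer, n_runs: int = 5) -> pd.DataFrame:
    """Run several independent optimizations, silencing per-run plots and logs.

    `make_optimizer` is a zero-argument callable supplied by the caller that
    returns a freshly configured DragonParetoOptimizer (construction not shown here).
    """
    fronts = []
    for run_idx in range(n_runs):
        optimizer = make_optimizer()
        # plots_and_log=False skips the history log file and all plotting for this run.
        pareto_df = optimizer.run(plots_and_log=False).assign(run=run_idx)
        fronts.append(pareto_df)
    return pd.concat(fronts, ignore_index=True)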
ml_tools/data_exploration/__init__.py CHANGED
@@ -36,6 +36,7 @@ from ._features import (
  from ._schema_ops import (
  finalize_feature_schema,
  apply_feature_schema,
+ reconstruct_from_schema
  )

  from .._core import _imprimir_disponibles
@@ -62,6 +63,7 @@ __all__ = [
  "encode_categorical_features",
  "finalize_feature_schema",
  "apply_feature_schema",
+ "reconstruct_from_schema",
  "match_and_filter_columns_by_regex",
  "standardize_percentages",
  "reconstruct_one_hot",
ml_tools/data_exploration/_schema_ops.py CHANGED
@@ -9,6 +9,13 @@ from .._core import get_logger
  _LOGGER = get_logger("Data Exploration: Schema Ops")


+ __all__ = [
+ "finalize_feature_schema",
+ "apply_feature_schema",
+ "reconstruct_from_schema",
+ ]
+
+
  def finalize_feature_schema(
  df_features: pd.DataFrame,
  categorical_mappings: Optional[dict[str, dict[str, int]]]
@@ -86,7 +93,7 @@ def apply_feature_schema(
  schema: FeatureSchema,
  targets: Optional[list[str]] = None,
  unknown_value: int = 99999,
- verbose: bool = True
+ verbose: int = 3
  ) -> pd.DataFrame:
  """
  Aligns the input DataFrame with the provided FeatureSchema.
@@ -100,7 +107,7 @@ def apply_feature_schema(
  targets (list[str] | None): Optional list of target column names.
  unknown_value (int): Integer value to assign to unknown categorical levels.
  Defaults to 99999 to avoid collision with existing categories.
- verbose (bool): If True, logs info about dropped extra columns.
+ verbose (int): Verbosity level for logging. Higher values produce more detailed logs.

  Returns:
  pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.
@@ -147,7 +154,8 @@ def apply_feature_schema(
  # Handle Unknown Categories
  if df_processed[col_name].isnull().any():
  n_missing = df_processed[col_name].isnull().sum()
- _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")
+ if verbose >= 1:
+ _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")

  # Fill unknowns with the specified integer
  df_processed[col_name] = df_processed[col_name].fillna(unknown_value)
@@ -159,14 +167,13 @@ def apply_feature_schema(

  extra_cols = set(df_processed.columns) - set(final_column_order)
  if extra_cols:
- _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
- if verbose:
- for extra_column in extra_cols:
- print(f" - Dropping column: '{extra_column}'")
+ if verbose >= 1:
+ _LOGGER.warning(f"Dropping {len(extra_cols)} extra columns not present in schema: {extra_cols}")

  df_final = df_processed[final_column_order]

- _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")
+ if verbose >= 2:
+ _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")

  # df_final should be a dataframe
  if isinstance(df_final, pd.Series):
@@ -174,3 +181,95 @@ def apply_feature_schema(

  return df_final

+
+
+ def reconstruct_from_schema(
+ df: pd.DataFrame,
+ schema: FeatureSchema,
+ targets: Optional[list[str]] = None,
+ verbose: int = 3
+ ) -> pd.DataFrame:
+ """
+ Reverses the schema application to make data human-readable.
+
+ This function decodes categorical features back to their string representations
+ using the schema's mappings. It strictly enforces the schema structure,
+ ignoring extra columns (unless they are specified as targets).
+
+ Args:
+ df (pd.DataFrame): The input DataFrame containing encoded features.
+ schema (FeatureSchema): The schema defining feature names and reverse mappings.
+ targets (list[str] | None): Optional list of target column names to preserve. These are not decoded and kept in the order specified here.
+ verbose (int): Verbosity level for logging info about the process.
+
+ Returns:
+ pd.DataFrame: A new DataFrame with the exact column order (features + targets),
+ with categorical features decoded to strings.
+
+ Raises:
+ ValueError: If any required feature or target column is missing.
+ """
+ # 1. Setup
+ df_decoded = df.copy()
+ targets = targets if targets is not None else []
+
+ # 2. Validation: Strict Column Presence
+ # Check Features
+ missing_features = [col for col in schema.feature_names if col not in df_decoded.columns]
+ if missing_features:
+ _LOGGER.error(f"Schema Reconstruction Mismatch: Missing required features: {missing_features}")
+ raise ValueError()
+
+ # Check Targets
+ if targets:
+ missing_targets = [col for col in targets if col not in df_decoded.columns]
+ if missing_targets:
+ _LOGGER.error(f"Schema Reconstruction Mismatch: Missing required targets: {missing_targets}")
+ raise ValueError()
+
+ # 3. Reorder and Filter (Drop extra columns early)
+ # The valid columns are Features + Targets
+ valid_columns = list(schema.feature_names) + targets
+
+ extra_cols = set(df_decoded.columns) - set(valid_columns)
+ if extra_cols:
+ if verbose >= 1:
+ _LOGGER.warning(f"Dropping extra columns not present in schema or targets: {extra_cols}")
+
+ # Enforce order: Features first, then Targets
+ df_decoded = df_decoded[valid_columns]
+
+ # 4. Reverse Categorical Encoding
+ if schema.categorical_feature_names and schema.categorical_mappings:
+ for col_name in schema.categorical_feature_names:
+ if col_name not in schema.categorical_mappings:
+ continue
+
+ forward_mapping = schema.categorical_mappings[col_name]
+ # Create reverse map: {int: str}
+ reverse_mapping = {v: k for k, v in forward_mapping.items()}
+
+ # --- SAFE TYPE CASTING ---
+ # Ensure values are Integers before mapping (handle 5.0 vs 5).
+ try:
+ if pd.api.types.is_numeric_dtype(df_decoded[col_name]):
+ df_decoded[col_name] = df_decoded[col_name].astype("Int64")
+ except (TypeError, ValueError):
+ # casted to NaN later during mapping
+ pass
+ # -------------------------
+
+ # Check for unknown codes before mapping
+ if verbose >= 1:
+ unique_codes = df_decoded[col_name].dropna().unique()
+ unknown_codes = [code for code in unique_codes if code not in reverse_mapping]
+ if unknown_codes:
+ _LOGGER.warning(f"Feature '{col_name}': Found unknown encoded values {unknown_codes}. These will be mapped to NaN.")
+
+ # Apply reverse mapping
+ df_decoded[col_name] = df_decoded[col_name].map(reverse_mapping)
+
+ if verbose >= 2:
+ _LOGGER.info(f"Schema reconstruction successful. Final shape: {df_decoded.shape}")
+
+ return df_decoded
ml_tools/keys/_keys.py CHANGED
@@ -4,6 +4,7 @@ class MagicWords:
  CURRENT = "current"
  RENAME = "rename"
  UNKNOWN = "unknown"
+ AUTO = "auto"


  class PyTorchLogKeys:
ml_tools/schema/_feature_schema.py CHANGED
@@ -202,13 +202,39 @@ class FeatureSchema(NamedTuple):
  filename=DatasetKeys.CATEGORICAL_NAMES,
  verbose=verbose)

- def save_artifacts(self, directory: Union[str,Path]):
+ def save_description(self, directory: Union[str, Path], verbose: bool = False) -> None:
+ """
+ Saves the schema's description to a .txt file.
+
+ Args:
+ directory: The directory where the file will be saved.
+ verbose: If True, prints a confirmation message upon saving.
+ """
+ dir_path = make_fullpath(directory, make=True, enforce="directory")
+ filename = "FeatureSchema-description.txt"
+ file_path = dir_path / filename
+
+ try:
+ with open(file_path, "w", encoding="utf-8") as f:
+ f.write(str(self))
+
+ if verbose:
+ _LOGGER.info(f"Schema description saved to '{dir_path.name}/{filename}'")
+ except IOError as e:
+ _LOGGER.error(f"Failed to save schema description: {e}")
+ raise e
+
+ def save_artifacts(self, directory: Union[str,Path], verbose: bool=True):
  """
  Saves feature names, categorical feature names, continuous feature names to separate text files.
  """
- self.save_all_features(directory=directory, verbose=True)
- self.save_continuous_features(directory=directory, verbose=True)
- self.save_categorical_features(directory=directory, verbose=True)
+ self.save_all_features(directory=directory, verbose=False)
+ self.save_continuous_features(directory=directory, verbose=False)
+ self.save_categorical_features(directory=directory, verbose=False)
+ self.save_description(directory=directory, verbose=False)
+
+ if verbose:
+ _LOGGER.info(f"All FeatureSchema artifacts saved to directory: '{directory}'")

  def __repr__(self) -> str:
  """Returns a concise representation of the schema's contents."""
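`save_artifacts` now also writes the human-readable description file and emits one summary log line instead of one message per artifact. A hedged usage sketch, assuming you already hold a `FeatureSchema` instance (how it is built is not part of this diff):

from pathlib import Path

def export_schema(schema, out_dir: str = "schema_artifacts") -> Path:
    """Write all FeatureSchema text artifacts, including the new description file."""
    out_path = Path(out_dir)
    # One call now covers the feature-name lists plus 'FeatureSchema-description.txt';
    # verbose=True logs a single confirmation for the whole directory.
    schema.save_artifacts(directory=out_path, verbose=True)
    return out_path

# The description alone can also be written:
# schema.save_description("schema_artifacts", verbose=True)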