dragon-ml-toolbox 20.5.0__tar.gz → 20.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-20.7.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/__init__.py +3 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_clean_tools.py +109 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_dragon_cleaner.py +72 -19
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_metrics.py +16 -8
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_classification.py +76 -30
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/keys/_keys.py +1 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/__init__.py +10 -0
- dragon_ml_toolbox-20.7.0/ml_tools/utilities/_translate.py +292 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/LICENSE +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/README.md +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_basic_clean.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/_dragon_engineering.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/_transforms.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/GUI_tools/_GUI_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/GUI_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_loggers.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_utils.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/_MICE_imputation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/_dragon_mice.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_base.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_checkpoint.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_early_stop.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_scheduler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_chaining_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_dragon_chain.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_update_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_base_model_config.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_finalize.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_models.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_training.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_base_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_feature_importance.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_loss.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_regression.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_sequence.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_vision.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation_captum/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_finalize_handler/_ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_finalize_handler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_base_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_chain_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_dragon_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_multi_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_sequence/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_sequence/_sequence_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_vision/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_vision/_vision_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_base_mlp_attention.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_base_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_autoint.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_gate.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_node.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_tabnet.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_tabular.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_mlp_attention.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_models_advanced_helpers.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_sequence/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_sequence/_sequence_models.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_base_wrapper.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_image_classification.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_image_segmentation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_object_detection.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_multi_dragon.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_single_dragon.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_single_manual.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_scaler/_ML_scaler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_scaler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_base_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_detection_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_sequence_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_artifact_finder.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_inspection.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_train_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/_core_transforms.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/_offline_augmentation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/PSO_optimization/_PSO.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/PSO_optimization/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/SQL/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/SQL/_dragon_SQL.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/VIF/_VIF_factor.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/VIF/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_logger.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_schema_load_ops.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_script_info.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_analysis.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_cleaning.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_features.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_plotting.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_schema_ops.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_evaluation/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_evaluation/_ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_inference/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_inference/_ensemble_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_learning/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_learning/_ensemble_learning.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/excel_handler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/excel_handler/_excel_handler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/keys/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/math_utilities/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/math_utilities/_math_utilities.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/_optimization_bounds.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/_optimization_plots.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/_dragonmanager.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/_path_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/plot_fonts/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/plot_fonts/_plot_fonts.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/_feature_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/_gui_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/serde/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/serde/_serde.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/_utility_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/_utility_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/setup.cfg +0 -0
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/__init__.py
RENAMED
@@ -10,7 +10,8 @@ from ._dragon_cleaner import (
 )
 
 from ._clean_tools import (
-    save_unique_values
+    save_unique_values,
+    save_category_counts,
 )
 
 from .._core import _imprimir_disponibles
@@ -20,6 +21,7 @@ __all__ = [
     "DragonColumnCleaner",
     "DragonDataFrameCleaner",
     "save_unique_values",
+    "save_category_counts",
     "basic_clean",
     "basic_clean_drop",
     "drop_macro_polars",
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_clean_tools.py
RENAMED
@@ -13,6 +13,7 @@ _LOGGER = get_logger("ETL Clean Tools")
 
 __all__ = [
     "save_unique_values",
+    "save_category_counts",
 ]
 
 
@@ -126,3 +127,111 @@ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
             counter += 1
 
     _LOGGER.info(f"{counter} files of unique values created.")
+
+
+################ Category Counts per column #################
+def save_category_counts(csv_path_or_df: Union[str, Path, pl.DataFrame],
+                         output_dir: Union[str, Path],
+                         use_columns: Optional[list[str]] = None,
+                         verbose: bool = False,
+                         keep_column_order: bool = True) -> None:
+    """
+    Calculates the frequency and percentage of each unique value in the specified columns
+    and saves the distribution report to a text file.
+
+    Useful for checking class balance or identifying rare categories.
+
+    Args:
+        csv_path_or_df (str | Path | pl.DataFrame):
+            The file path to the input CSV file or a Polars DataFrame.
+        output_dir (str | Path):
+            The directory where the report files will be saved.
+        use_columns (List[str] | None):
+            Columns to analyze. If None, all columns are processed.
+        verbose (bool):
+            If True, prints progress info.
+        keep_column_order (bool):
+            If True, prepends a numeric prefix to filenames to maintain order.
+    """
+    # 1. Handle Input
+    if isinstance(csv_path_or_df, pl.DataFrame):
+        df = csv_path_or_df
+        if use_columns:
+            valid_cols = [c for c in use_columns if c in df.columns]
+            if not valid_cols:
+                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+                raise ValueError()
+            df = df.select(valid_cols)
+    else:
+        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+    output_path = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+    total_rows = df.height
+
+    if total_rows == 0:
+        _LOGGER.warning("Input DataFrame is empty. No counts to save.")
+        return
+
+    counter = 0
+
+    # 2. Process Each Column
+    for i, col_name in enumerate(df.columns):
+        try:
+            # Group by, count, and calculate percentage
+            # We treat nulls as a category here to see missing data frequency
+            stats = (
+                df.select(pl.col(col_name))
+                .group_by(col_name, maintain_order=False)
+                .len(name="count")
+                .with_columns(
+                    (pl.col("count") / total_rows * 100).alias("pct")
+                )
+                .sort("count", descending=True)
+            )
+
+            # Collect to python list of dicts for writing
+            rows = stats.iter_rows(named=True)
+            unique_count = stats.height
+
+            # Check thresholds for warning
+            is_high_cardinality = (unique_count > 300) or ((unique_count / total_rows) > 0.5)
+
+        except Exception:
+            _LOGGER.error(f"Could not calculate counts for column '{col_name}'.")
+            continue
+
+        # 3. Write to File
+        sanitized_name = sanitize_filename(col_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+
+        prefix = f"{i + 1}_" if keep_column_order else ''
+        file_path = output_path / f"{prefix}{sanitized_name}_counts.txt"
+
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Distribution for column: '{col_name}'\n")
+                f.write(f"# Total Rows: {total_rows} | Unique Values: {unique_count}\n")
+
+                if is_high_cardinality:
+                    f.write(f"# WARNING: High cardinality detected (Unique/Total ratio: {unique_count/total_rows:.2%}).\n")
+
+                f.write("-" * 65 + "\n")
+                f.write(f"{'Count':<10} | {'Percentage':<12} | {'Value'}\n")
+                f.write("-" * 65 + "\n")
+
+                for row in rows:
+                    val = str(row[col_name])
+                    count = row["count"]
+                    pct = row["pct"]
+                    f.write(f"{count:<10} | {pct:>10.2f}% | {val}\n")
+
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            if verbose:
+                print(f"  Saved distribution for '{col_name}'.")
+            counter += 1
+
+    _LOGGER.info(f"{counter} distribution files created.")
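The new save_category_counts helper accepts either a CSV path or an in-memory Polars DataFrame and writes one frequency/percentage report per column. A minimal usage sketch (not taken from the package docs), assuming the top-level package is importable as ml_tools and that the re-export shown in the __init__.py hunk above is in place; the sample data and output directory are made up:

    import polars as pl
    from ml_tools.ETL_cleaning import save_category_counts

    # Small in-memory frame; nulls are grouped and counted as their own category.
    df = pl.DataFrame({"material": ["Al", "Al", "Fe", None],
                       "grade": ["A", "B", "B", "B"]})

    # Writes one "<n>_<column>_counts.txt" report per column into the output directory,
    # flagging high-cardinality columns per the thresholds in the hunk above.
    save_category_counts(csv_path_or_df=df,
                         output_dir="reports/category_counts",  # hypothetical path
                         verbose=True)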
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_dragon_cleaner.py
RENAMED
@@ -1,13 +1,13 @@
 import polars as pl
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 
 from ..utilities import save_dataframe_filename, load_dataframe
 
 from .._core import get_logger
 from ..path_manager import make_fullpath
 
-from ._clean_tools import save_unique_values
+from ._clean_tools import save_unique_values, save_category_counts
 
 
 _LOGGER = get_logger("DragonCleaner")
@@ -33,12 +33,18 @@ class DragonColumnCleaner:
     """
     def __init__(self,
                  column_name: str,
-
+                 exact_matches: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
+                 rules: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
                  case_insensitive: bool = False):
         """
         Args:
             column_name (str):
                 The name of the column to be cleaned.
+            exact_matches (Dict[str, str | None]):
+                A dictionary of EXACT string matches to replacement strings.
+                - Uses a hash map, which is significantly faster than regex.
+                - Used for simple 1-to-1 mappings (e.g., {'Aluminum': 'Al'}).
+                - Runs BEFORE the regex rules.
             rules (Dict[str, str | None]):
                 A dictionary of regex patterns to replacement strings.
                 - Replacement can be None to indicate that matching values should be converted to null.
@@ -61,25 +67,47 @@ class DragonColumnCleaner:
         if not isinstance(column_name, str) or not column_name:
             _LOGGER.error("The 'column_name' must be a non-empty string.")
             raise TypeError()
-
-
-
-
-
-            if not isinstance(pattern, str):
-                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+
+        # Validate Regex Rules
+        if rules is not None:
+            if not isinstance(rules, dict):
+                _LOGGER.error("The 'rules' argument must be a dictionary.")
                 raise TypeError()
-
-
+            for pattern, replacement in rules.items():
+                if not isinstance(pattern, str):
+                    _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+                    raise TypeError()
+                if replacement is not None and not isinstance(replacement, str):
+                    _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+                    raise TypeError()
+
+        # Validate Exact Matches
+        if exact_matches is not None:
+            if not isinstance(exact_matches, dict):
+                _LOGGER.error("The 'exact_matches' argument must be a dictionary.")
                 raise TypeError()
+            for key, val in exact_matches.items():
+                if not isinstance(key, str):
+                    _LOGGER.error("All keys in 'exact_matches' must be strings.")
+                    raise TypeError()
+                if val is not None and not isinstance(val, str):
+                    _LOGGER.error("All values in 'exact_matches' must be strings or None.")
+                    raise TypeError()
+
+        # Raise if both are None or empty
+        if not rules and not exact_matches:
+            _LOGGER.error("At least one of 'rules' or 'exact_matches' must be provided.")
+            raise ValueError()
 
         self.column_name = column_name
-        self.rules = rules
+        self.rules = rules if rules else {}
+        self.exact_matches = exact_matches if exact_matches else {}
         self.case_insensitive = case_insensitive
 
     def preview(self,
                 csv_path: Union[str, Path],
                 report_dir: Union[str, Path],
+                show_distribution: bool = True,
                 add_value_separator: bool=False,
                 rule_batch_size: int = 150):
         """
@@ -90,6 +118,8 @@ class DragonColumnCleaner:
                 The path to the CSV file containing the data to clean.
             report_dir (str | Path):
                 The directory where the preview report will be saved.
+            show_distribution (bool):
+                If True, generates a category count report for the column after cleaning.
             add_value_separator (bool):
                 If True, adds a separator line between each unique value in the report.
             rule_batch_size (int):
@@ -101,13 +131,21 @@ class DragonColumnCleaner:
         preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
         df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)
 
-        # Apply cleaning rules
+        # Apply cleaning rules and save reports
         save_unique_values(csv_path_or_df=df_preview,
                            output_dir=report_dir,
                            use_columns=[self.column_name],
                            verbose=False,
                            keep_column_order=False,
                            add_value_separator=add_value_separator)
+
+        # Optionally save category counts
+        if show_distribution:
+            save_category_counts(csv_path_or_df=df_preview,
+                                 output_dir=report_dir,
+                                 use_columns=[self.column_name],
+                                 verbose=False,
+                                 keep_column_order=False)
 
 
 class DragonDataFrameCleaner:
@@ -181,16 +219,23 @@ class DragonDataFrameCleaner:
         for cleaner in self.cleaners:
             col_name = cleaner.column_name
 
-            #
+            # Start expression for this batch
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # --- PHASE 1: EXACT MATCHES ---
+            # Apply dictionary-based replacement first (faster than regex)
+            if cleaner.exact_matches:
+                # 'replace' handles dictionary mapping safely. If value is mapped to None, it becomes null.
+                col_expr = col_expr.replace(cleaner.exact_matches)
+
+            # --- PHASE 2: REGEX PATTERNS ---
             all_rules = list(cleaner.rules.items())
 
             # Process in batches of 'rule_batch_size'
             for i in range(0, len(all_rules), rule_batch_size):
                 rule_batch = all_rules[i : i + rule_batch_size]
 
-                #
-                col_expr = pl.col(col_name).cast(pl.String)
-
+                # continue chaining operations on the same col_expr
                 for pattern, replacement in rule_batch:
                     final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
 
@@ -202,6 +247,15 @@ class DragonDataFrameCleaner:
                         col_expr = col_expr.str.replace_all(final_pattern, replacement)
 
                 # Apply this batch of rules to the LazyFrame
+                # apply partially here to keep the logical plan size under control
+                final_lf = final_lf.with_columns(col_expr.alias(col_name))
+
+                # Reset col_expr for the next batch, but pointing to the 'new' column
+                # This ensures the next batch works on the result of the previous batch
+                col_expr = pl.col(col_name)
+
+            # If we had exact matches but NO regex rules, we still need to apply the expression once
+            if cleaner.exact_matches and not all_rules:
                 final_lf = final_lf.with_columns(col_expr.alias(col_name))
 
         # 3. Collect Results
@@ -242,4 +296,3 @@ class DragonDataFrameCleaner:
         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
 
         return None
-
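For context, a hypothetical sketch of how the two phases combine after this change: exact_matches is applied first as a dictionary replacement, then the regex rules run in batches. The import path and sample values are assumptions, not taken from the package docs:

    import polars as pl
    from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

    material_cleaner = DragonColumnCleaner(
        column_name="material",
        exact_matches={"Aluminum": "Al", "N/A": None},  # phase 1: exact hash-map lookups; None nullifies
        rules={r"\s+": " "},                            # phase 2: regex patterns, applied in batches
        case_insensitive=True,                          # only affects the regex phase ('(?i)' prefix)
    )

    df = pl.DataFrame({"material": ["Aluminum", "iron   ore", "N/A"]})
    df_clean = DragonDataFrameCleaner(cleaners=[material_cleaner]).clean(df, rule_batch_size=150)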
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_metrics.py
RENAMED
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Literal
 
 
 __all__ = [
@@ -26,7 +26,7 @@ class _BaseClassificationFormat:
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  xtick_size: int=22,
                  ytick_size: int=22,
                  legend_size: int=26,
@@ -46,8 +46,8 @@ class _BaseClassificationFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
-            calibration_bins (int): The number of bins to use when
-
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plot. If 'auto', the number will be dynamically determined based on the number of samples.
+                - Typical int values: 10, 15, 20
 
             font_size (int): The base font size to apply to the plots.
 
@@ -97,6 +97,7 @@ class _BaseMultiLabelFormat:
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -115,6 +116,9 @@ class _BaseMultiLabelFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plots for each label. If 'auto', the number will be dynamically determined based on the number of samples.
+                - Typical int values: 10, 15, 20
+
             font_size (int): The base font size to apply to the plots.
 
             xtick_size (int): Font size for x-axis tick labels.
@@ -133,6 +137,7 @@ class _BaseMultiLabelFormat:
         """
         self.cmap = cmap
         self.ROC_PR_line = ROC_PR_line
+        self.calibration_bins = calibration_bins
        self.font_size = font_size
        self.xtick_size = xtick_size
        self.ytick_size = ytick_size
@@ -142,6 +147,7 @@ class _BaseMultiLabelFormat:
         parts = [
             f"cmap='{self.cmap}'",
             f"ROC_PR_line='{self.ROC_PR_line}'",
+            f"calibration_bins={self.calibration_bins}",
             f"font_size={self.font_size}",
             f"xtick_size={self.xtick_size}",
             f"ytick_size={self.ytick_size}",
@@ -416,7 +422,7 @@ class FormatBinaryClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -440,7 +446,7 @@ class FormatMultiClassClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -464,7 +470,7 @@ class FormatBinaryImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -488,7 +494,7 @@ class FormatMultiClassImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -513,6 +519,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -520,6 +527,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                  ) -> None:
         super().__init__(cmap=cmap,
                          ROC_PR_line=ROC_PR_line,
+                         calibration_bins=calibration_bins,
                          font_size=font_size,
                          xtick_size=xtick_size,
                          ytick_size=ytick_size,
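The practical effect for users of these format classes: calibration_bins now defaults to 'auto', and any non-int or non-positive value falls back to a sample-size based bin count (5, 10, or 15) in the evaluation code shown below. A hedged sketch, assuming the classes are re-exported from ml_tools.ML_configuration and that the remaining constructor arguments keep their defaults:

    from ml_tools.ML_configuration import FormatBinaryClassificationMetrics

    fmt_auto = FormatBinaryClassificationMetrics()                      # calibration_bins='auto', bins picked at plot time
    fmt_fixed = FormatBinaryClassificationMetrics(calibration_bins=20)  # explicit int keeps a fixed bin count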
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_classification.py
RENAMED
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.calibration import CalibrationDisplay
+from sklearn.calibration import calibration_curve
 from sklearn.metrics import (
     classification_report,
     ConfusionMatrixDisplay,
@@ -378,42 +378,42 @@ def classification_metrics(save_dir: Union[str, Path],
 
     # --- Save Calibration Plot ---
     fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+    user_chosen_bins = format_config.calibration_bins
+
+    # --- Automate Bin Selection ---
+    if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+        # Determine bins based on number of samples
+        n_samples = y_true.shape[0]
+        if n_samples < 200:
+            dynamic_bins = 5
+        elif n_samples < 1000:
+            dynamic_bins = 10
+        else:
+            dynamic_bins = 15
+    else:
+        dynamic_bins = user_chosen_bins
+
+    # --- Step 1: Get binned data directly ---
+    # calculates reliability diagram data without needing a temporary plot
+    prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)
 
-    # --- Step
-    with plt.ioff(): # Suppress showing the temporary plot
-        fig_temp, ax_temp = plt.subplots()
-        cal_display_temp = CalibrationDisplay.from_predictions(
-            y_true_binary, # Use binarized labels
-            y_score,
-            n_bins=format_config.calibration_bins,
-            ax=ax_temp,
-            name="temp" # Add a name to suppress potential warnings
-        )
-        # Get the x, y coordinates of the binned data
-        line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
-        plt.close(fig_temp) # Close the temporary plot
-
-    # --- Step 2: Build the plot from scratch ---
+    # --- Step 2: Plot ---
     ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
 
-
-
-
-
-
-
-
-        'color': format_config.ROC_PR_line,
-        'linestyle': '--',
-        'linewidth': 2,
-    }
-)
+    # Plot the actual calibration curve (connect points with a line)
+    ax_cal.plot(prob_pred,
+                prob_true,
+                marker='o',  # Add markers to see bin locations
+                linewidth=2,
+                label="Model calibration",
+                color=format_config.ROC_PR_line)
 
     ax_cal.set_title(f'Reliability Curve{plot_title}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size + 2)
     ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
     ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
 
-    # --- Step 3: Set final limits
+    # --- Step 3: Set final limits ---
     ax_cal.set_ylim(0.0, 1.0)
     ax_cal.set_xlim(0.0, 1.0)
 
@@ -428,7 +428,7 @@ def classification_metrics(save_dir: Union[str, Path],
     cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
     plt.savefig(cal_path)
     plt.close(fig_cal)
-
+
     _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")
 
 
@@ -632,6 +632,52 @@ def multi_label_classification_metrics(
         pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
         plt.savefig(pr_path)
         plt.close(fig_pr)
+
+        # --- Save Calibration Plot (New Feature) ---
+        fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+        user_chosen_bins = format_config.calibration_bins
+
+        # --- Automate Bin Selection ---
+        if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+            # Determine bins based on number of samples
+            n_samples = y_true.shape[0]
+            if n_samples < 200:
+                dynamic_bins = 5
+            elif n_samples < 1000:
+                dynamic_bins = 10
+            else:
+                dynamic_bins = 15
+        else:
+            dynamic_bins = user_chosen_bins
+
+        # Calculate calibration curve for this specific label
+        prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
+
+        ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+        ax_cal.plot(prob_pred,
+                    prob_true,
+                    marker='o',
+                    linewidth=2,
+                    label=f"Calibration for '{name}'",
+                    color=format_config.ROC_PR_line)
+
+        ax_cal.set_title(f'Reliability Curve for "{name}"', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
+        ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+        ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+
+        ax_cal.set_ylim(0.0, 1.0)
+        ax_cal.set_xlim(0.0, 1.0)
+
+        ax_cal.tick_params(axis='x', labelsize=xtick_size)
+        ax_cal.tick_params(axis='y', labelsize=ytick_size)
+        ax_cal.legend(loc='lower right', fontsize=legend_size)
+        ax_cal.grid(True)
+
+        plt.tight_layout()
+        cal_path = save_dir_path / f"calibration_plot_{sanitized_name}.svg"
+        plt.savefig(cal_path)
+        plt.close(fig_cal)
 
     _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
 
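The temporary CalibrationDisplay figure is gone; both functions now call scikit-learn's calibration_curve directly and plot the returned bin data themselves. A standalone sketch of that primitive with made-up arrays:

    import numpy as np
    from sklearn.calibration import calibration_curve

    y_true = np.array([0, 1, 1, 0, 1, 1, 0, 1])
    y_prob = np.array([0.1, 0.8, 0.65, 0.3, 0.9, 0.55, 0.2, 0.7])

    # prob_true: observed fraction of positives per bin; prob_pred: mean predicted probability per bin
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=5)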
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/__init__.py
RENAMED
@@ -15,6 +15,13 @@ from ._utility_tools import (
     train_dataset_yielder
 )
 
+from ._translate import (
+    translate_dataframe_columns,
+    create_translation_template,
+    audit_column_translation
+)
+
+
 from .._core import _imprimir_disponibles
 
 
@@ -27,6 +34,9 @@ __all__ = [
     "save_dataframe",
     "save_dataframe_with_schema",
     "merge_dataframes",
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"