dragon-ml-toolbox 20.5.0__tar.gz → 20.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-20.5.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-20.7.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +1 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/__init__.py +3 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_clean_tools.py +109 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_dragon_cleaner.py +72 -19
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_metrics.py +16 -8
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_classification.py +76 -30
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/keys/_keys.py +1 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/__init__.py +10 -0
- dragon_ml_toolbox-20.7.0/ml_tools/utilities/_translate.py +292 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/LICENSE +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/README.md +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_basic_clean.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/_dragon_engineering.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_engineering/_transforms.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/GUI_tools/_GUI_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/GUI_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_loggers.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/_IO_utils.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/IO_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/_MICE_imputation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/MICE/_dragon_mice.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_base.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_checkpoint.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_early_stop.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_callbacks/_scheduler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_chaining_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_dragon_chain.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_chain/_update_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_base_model_config.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_finalize.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_models.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_training.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_base_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_datasetmaster/_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_feature_importance.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_loss.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_regression.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_sequence.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_vision.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation_captum/_ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation_captum/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_finalize_handler/_ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_finalize_handler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_base_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_chain_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_dragon_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference/_multi_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_sequence/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_sequence/_sequence_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_vision/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_inference_vision/_vision_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_base_mlp_attention.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_base_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_autoint.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_gate.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_node.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_tabnet.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_dragon_tabular.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_mlp_attention.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models/_models_advanced_helpers.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_sequence/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_sequence/_sequence_models.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_base_wrapper.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_image_classification.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_image_segmentation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_models_vision/_object_detection.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_multi_dragon.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_single_dragon.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_optimization/_single_manual.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_scaler/_ML_scaler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_scaler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_base_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_detection_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_sequence_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_trainer/_dragon_trainer.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_artifact_finder.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_inspection.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_utilities/_train_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/_core_transforms.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_vision_transformers/_offline_augmentation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/PSO_optimization/_PSO.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/PSO_optimization/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/SQL/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/SQL/_dragon_SQL.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/VIF/_VIF_factor.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/VIF/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_logger.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_schema_load_ops.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/_core/_script_info.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_analysis.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_cleaning.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_features.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_plotting.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/data_exploration/_schema_ops.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_evaluation/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_evaluation/_ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_inference/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_inference/_ensemble_inference.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_learning/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ensemble_learning/_ensemble_learning.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/excel_handler/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/excel_handler/_excel_handler.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/keys/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/math_utilities/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/math_utilities/_math_utilities.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/_optimization_bounds.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/optimization_tools/_optimization_plots.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/_dragonmanager.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/path_manager/_path_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/plot_fonts/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/plot_fonts/_plot_fonts.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/_feature_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/schema/_gui_schema.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/serde/__init__.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/serde/_serde.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/_utility_save_load.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/_utility_tools.py +0 -0
- {dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/setup.cfg +0 -0
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/__init__.py
RENAMED
@@ -10,7 +10,8 @@ from ._dragon_cleaner import (
 )
 
 from ._clean_tools import (
-    save_unique_values
+    save_unique_values,
+    save_category_counts,
 )
 
 from .._core import _imprimir_disponibles
@@ -20,6 +21,7 @@ __all__ = [
     "DragonColumnCleaner",
     "DragonDataFrameCleaner",
     "save_unique_values",
+    "save_category_counts",
     "basic_clean",
     "basic_clean_drop",
     "drop_macro_polars",
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_clean_tools.py
RENAMED
@@ -13,6 +13,7 @@ _LOGGER = get_logger("ETL Clean Tools")
 
 __all__ = [
     "save_unique_values",
+    "save_category_counts",
 ]
 
 
@@ -126,3 +127,111 @@ def save_unique_values(csv_path_or_df: Union[str, Path, pl.DataFrame],
             counter += 1
 
     _LOGGER.info(f"{counter} files of unique values created.")
+
+
+################ Category Counts per column #################
+def save_category_counts(csv_path_or_df: Union[str, Path, pl.DataFrame],
+                         output_dir: Union[str, Path],
+                         use_columns: Optional[list[str]] = None,
+                         verbose: bool = False,
+                         keep_column_order: bool = True) -> None:
+    """
+    Calculates the frequency and percentage of each unique value in the specified columns
+    and saves the distribution report to a text file.
+
+    Useful for checking class balance or identifying rare categories.
+
+    Args:
+        csv_path_or_df (str | Path | pl.DataFrame):
+            The file path to the input CSV file or a Polars DataFrame.
+        output_dir (str | Path):
+            The directory where the report files will be saved.
+        use_columns (List[str] | None):
+            Columns to analyze. If None, all columns are processed.
+        verbose (bool):
+            If True, prints progress info.
+        keep_column_order (bool):
+            If True, prepends a numeric prefix to filenames to maintain order.
+    """
+    # 1. Handle Input
+    if isinstance(csv_path_or_df, pl.DataFrame):
+        df = csv_path_or_df
+        if use_columns:
+            valid_cols = [c for c in use_columns if c in df.columns]
+            if not valid_cols:
+                _LOGGER.error("None of the specified columns in 'use_columns' exist in the provided DataFrame.")
+                raise ValueError()
+            df = df.select(valid_cols)
+    else:
+        csv_path = make_fullpath(input_path=csv_path_or_df, enforce="file")
+        df = load_dataframe(df_path=csv_path, use_columns=use_columns, kind="polars", all_strings=True)[0]
+
+    output_path = make_fullpath(input_path=output_dir, make=True, enforce='directory')
+    total_rows = df.height
+
+    if total_rows == 0:
+        _LOGGER.warning("Input DataFrame is empty. No counts to save.")
+        return
+
+    counter = 0
+
+    # 2. Process Each Column
+    for i, col_name in enumerate(df.columns):
+        try:
+            # Group by, count, and calculate percentage
+            # We treat nulls as a category here to see missing data frequency
+            stats = (
+                df.select(pl.col(col_name))
+                .group_by(col_name, maintain_order=False)
+                .len(name="count")
+                .with_columns(
+                    (pl.col("count") / total_rows * 100).alias("pct")
+                )
+                .sort("count", descending=True)
+            )
+
+            # Collect to python list of dicts for writing
+            rows = stats.iter_rows(named=True)
+            unique_count = stats.height
+
+            # Check thresholds for warning
+            is_high_cardinality = (unique_count > 300) or ((unique_count / total_rows) > 0.5)
+
+        except Exception:
+            _LOGGER.error(f"Could not calculate counts for column '{col_name}'.")
+            continue
+
+        # 3. Write to File
+        sanitized_name = sanitize_filename(col_name)
+        if not sanitized_name.strip('_'):
+            sanitized_name = f'column_{i}'
+
+        prefix = f"{i + 1}_" if keep_column_order else ''
+        file_path = output_path / f"{prefix}{sanitized_name}_counts.txt"
+
+        try:
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(f"# Distribution for column: '{col_name}'\n")
+                f.write(f"# Total Rows: {total_rows} | Unique Values: {unique_count}\n")
+
+                if is_high_cardinality:
+                    f.write(f"# WARNING: High cardinality detected (Unique/Total ratio: {unique_count/total_rows:.2%}).\n")
+
+                f.write("-" * 65 + "\n")
+                f.write(f"{'Count':<10} | {'Percentage':<12} | {'Value'}\n")
+                f.write("-" * 65 + "\n")
+
+                for row in rows:
+                    val = str(row[col_name])
+                    count = row["count"]
+                    pct = row["pct"]
+                    f.write(f"{count:<10} | {pct:>10.2f}% | {val}\n")
+
+        except IOError:
+            _LOGGER.exception(f"Error writing to file {file_path}.")
+        else:
+            if verbose:
+                print(f"  Saved distribution for '{col_name}'.")
+            counter += 1
+
+    _LOGGER.info(f"{counter} distribution files created.")
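The new save_category_counts helper accepts either a CSV path or an in-memory Polars DataFrame and writes one frequency/percentage report per column. A minimal usage sketch (not taken from the package docs), assuming the top-level package is importable as ml_tools and that the re-export shown in the __init__.py hunk above is in place; the sample data and output directory are made up:

    import polars as pl
    from ml_tools.ETL_cleaning import save_category_counts

    # Small in-memory frame; nulls are grouped and counted as their own category.
    df = pl.DataFrame({"material": ["Al", "Al", "Fe", None],
                       "grade": ["A", "B", "B", "B"]})

    # Writes one "<n>_<column>_counts.txt" report per column into the output directory,
    # flagging high-cardinality columns per the thresholds in the hunk above.
    save_category_counts(csv_path_or_df=df,
                         output_dir="reports/category_counts",  # hypothetical path
                         verbose=True)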
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ETL_cleaning/_dragon_cleaner.py
RENAMED
@@ -1,13 +1,13 @@
 import polars as pl
 from pathlib import Path
-from typing import Union
+from typing import Union, Optional
 
 from ..utilities import save_dataframe_filename, load_dataframe
 
 from .._core import get_logger
 from ..path_manager import make_fullpath
 
-from ._clean_tools import save_unique_values
+from ._clean_tools import save_unique_values, save_category_counts
 
 
 _LOGGER = get_logger("DragonCleaner")
@@ -33,12 +33,18 @@ class DragonColumnCleaner:
     """
     def __init__(self,
                  column_name: str,
-
+                 exact_matches: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
+                 rules: Optional[Union[dict[str, Union[str, None]], dict[str, str]]] = None,
                  case_insensitive: bool = False):
         """
         Args:
             column_name (str):
                 The name of the column to be cleaned.
+            exact_matches (Dict[str, str | None]):
+                A dictionary of EXACT string matches to replacement strings.
+                - Uses a hash map, which is significantly faster than regex.
+                - Used for simple 1-to-1 mappings (e.g., {'Aluminum': 'Al'}).
+                - Runs BEFORE the regex rules.
             rules (Dict[str, str | None]):
                 A dictionary of regex patterns to replacement strings.
                 - Replacement can be None to indicate that matching values should be converted to null.
@@ -61,25 +67,47 @@ class DragonColumnCleaner:
         if not isinstance(column_name, str) or not column_name:
             _LOGGER.error("The 'column_name' must be a non-empty string.")
             raise TypeError()
-
-
-
-
-
-            if not isinstance(pattern, str):
-                _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+
+        # Validate Regex Rules
+        if rules is not None:
+            if not isinstance(rules, dict):
+                _LOGGER.error("The 'rules' argument must be a dictionary.")
                 raise TypeError()
-
-
+            for pattern, replacement in rules.items():
+                if not isinstance(pattern, str):
+                    _LOGGER.error("All keys in 'rules' must be strings representing regex patterns.")
+                    raise TypeError()
+                if replacement is not None and not isinstance(replacement, str):
+                    _LOGGER.error("All values in 'rules' must be strings or None (for nullification).")
+                    raise TypeError()
+
+        # Validate Exact Matches
+        if exact_matches is not None:
+            if not isinstance(exact_matches, dict):
+                _LOGGER.error("The 'exact_matches' argument must be a dictionary.")
                 raise TypeError()
+            for key, val in exact_matches.items():
+                if not isinstance(key, str):
+                    _LOGGER.error("All keys in 'exact_matches' must be strings.")
+                    raise TypeError()
+                if val is not None and not isinstance(val, str):
+                    _LOGGER.error("All values in 'exact_matches' must be strings or None.")
+                    raise TypeError()
+
+        # Raise if both are None or empty
+        if not rules and not exact_matches:
+            _LOGGER.error("At least one of 'rules' or 'exact_matches' must be provided.")
+            raise ValueError()
 
         self.column_name = column_name
-        self.rules = rules
+        self.rules = rules if rules else {}
+        self.exact_matches = exact_matches if exact_matches else {}
         self.case_insensitive = case_insensitive
 
     def preview(self,
                 csv_path: Union[str, Path],
                 report_dir: Union[str, Path],
+                show_distribution: bool = True,
                 add_value_separator: bool=False,
                 rule_batch_size: int = 150):
         """
@@ -90,6 +118,8 @@ class DragonColumnCleaner:
                 The path to the CSV file containing the data to clean.
             report_dir (str | Path):
                 The directory where the preview report will be saved.
+            show_distribution (bool):
+                If True, generates a category count report for the column after cleaning.
             add_value_separator (bool):
                 If True, adds a separator line between each unique value in the report.
             rule_batch_size (int):
@@ -101,13 +131,21 @@ class DragonColumnCleaner:
         preview_cleaner = DragonDataFrameCleaner(cleaners=[self])
         df_preview = preview_cleaner.clean(df, rule_batch_size=rule_batch_size)
 
-        # Apply cleaning rules
+        # Apply cleaning rules and save reports
         save_unique_values(csv_path_or_df=df_preview,
                            output_dir=report_dir,
                            use_columns=[self.column_name],
                            verbose=False,
                            keep_column_order=False,
                            add_value_separator=add_value_separator)
+
+        # Optionally save category counts
+        if show_distribution:
+            save_category_counts(csv_path_or_df=df_preview,
+                                 output_dir=report_dir,
+                                 use_columns=[self.column_name],
+                                 verbose=False,
+                                 keep_column_order=False)
 
 
 class DragonDataFrameCleaner:
@@ -181,16 +219,23 @@ class DragonDataFrameCleaner:
         for cleaner in self.cleaners:
             col_name = cleaner.column_name
 
-            #
+            # Start expression for this batch
+            col_expr = pl.col(col_name).cast(pl.String)
+
+            # --- PHASE 1: EXACT MATCHES ---
+            # Apply dictionary-based replacement first (faster than regex)
+            if cleaner.exact_matches:
+                # 'replace' handles dictionary mapping safely. If value is mapped to None, it becomes null.
+                col_expr = col_expr.replace(cleaner.exact_matches)
+
+            # --- PHASE 2: REGEX PATTERNS ---
             all_rules = list(cleaner.rules.items())
 
             # Process in batches of 'rule_batch_size'
             for i in range(0, len(all_rules), rule_batch_size):
                 rule_batch = all_rules[i : i + rule_batch_size]
 
-                #
-                col_expr = pl.col(col_name).cast(pl.String)
-
+                # continue chaining operations on the same col_expr
                 for pattern, replacement in rule_batch:
                     final_pattern = f"(?i){pattern}" if cleaner.case_insensitive else pattern
 
@@ -202,6 +247,15 @@ class DragonDataFrameCleaner:
                         col_expr = col_expr.str.replace_all(final_pattern, replacement)
 
                 # Apply this batch of rules to the LazyFrame
+                # apply partially here to keep the logical plan size under control
+                final_lf = final_lf.with_columns(col_expr.alias(col_name))
+
+                # Reset col_expr for the next batch, but pointing to the 'new' column
+                # This ensures the next batch works on the result of the previous batch
+                col_expr = pl.col(col_name)
+
+            # If we had exact matches but NO regex rules, we still need to apply the expression once
+            if cleaner.exact_matches and not all_rules:
                 final_lf = final_lf.with_columns(col_expr.alias(col_name))
 
         # 3. Collect Results
@@ -242,4 +296,3 @@ class DragonDataFrameCleaner:
         save_dataframe_filename(df=df_clean, save_dir=output_filepath.parent, filename=output_filepath.name)
 
         return None
-
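For context, a hypothetical sketch of how the two phases combine after this change: exact_matches is applied first as a dictionary replacement, then the regex rules run in batches. The import path and sample values are assumptions, not taken from the package docs:

    import polars as pl
    from ml_tools.ETL_cleaning import DragonColumnCleaner, DragonDataFrameCleaner

    material_cleaner = DragonColumnCleaner(
        column_name="material",
        exact_matches={"Aluminum": "Al", "N/A": None},  # phase 1: exact hash-map lookups; None nullifies
        rules={r"\s+": " "},                            # phase 2: regex patterns, applied in batches
        case_insensitive=True,                          # only affects the regex phase ('(?i)' prefix)
    )

    df = pl.DataFrame({"material": ["Aluminum", "iron   ore", "N/A"]})
    df_clean = DragonDataFrameCleaner(cleaners=[material_cleaner]).clean(df, rule_batch_size=150)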
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_configuration/_metrics.py
RENAMED
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Literal
 
 
 __all__ = [
@@ -26,7 +26,7 @@ class _BaseClassificationFormat:
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  xtick_size: int=22,
                  ytick_size: int=22,
                  legend_size: int=26,
@@ -46,8 +46,8 @@ class _BaseClassificationFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
-            calibration_bins (int): The number of bins to use when
-
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plot. If 'auto', the number will be dynamically determined based on the number of samples.
+                - Typical int values: 10, 15, 20
 
             font_size (int): The base font size to apply to the plots.
 
@@ -97,6 +97,7 @@ class _BaseMultiLabelFormat:
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -115,6 +116,9 @@ class _BaseMultiLabelFormat:
             - Common color names: 'darkorange', 'cornflowerblue', 'crimson', 'forestgreen'
             - Hex codes: '#FF6347', '#4682B4'
 
+            calibration_bins (int | 'auto'): The number of bins to use when creating the calibration (reliability) plots for each label. If 'auto', the number will be dynamically determined based on the number of samples.
+                - Typical int values: 10, 15, 20
+
             font_size (int): The base font size to apply to the plots.
 
             xtick_size (int): Font size for x-axis tick labels.
@@ -133,6 +137,7 @@ class _BaseMultiLabelFormat:
         """
         self.cmap = cmap
         self.ROC_PR_line = ROC_PR_line
+        self.calibration_bins = calibration_bins
        self.font_size = font_size
        self.xtick_size = xtick_size
        self.ytick_size = ytick_size
@@ -142,6 +147,7 @@ class _BaseMultiLabelFormat:
         parts = [
             f"cmap='{self.cmap}'",
             f"ROC_PR_line='{self.ROC_PR_line}'",
+            f"calibration_bins={self.calibration_bins}",
             f"font_size={self.font_size}",
             f"xtick_size={self.xtick_size}",
             f"ytick_size={self.ytick_size}",
@@ -416,7 +422,7 @@ class FormatBinaryClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -440,7 +446,7 @@ class FormatMultiClassClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -464,7 +470,7 @@ class FormatBinaryImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -488,7 +494,7 @@ class FormatMultiClassImageClassificationMetrics(_BaseClassificationFormat):
     def __init__(self,
                  cmap: str="BuGn",
                  ROC_PR_line: str='darkorange',
-                 calibration_bins: int=
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int=26,
                  xtick_size: int=22,
                  ytick_size: int=22,
@@ -513,6 +519,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
     def __init__(self,
                  cmap: str = "BuGn",
                  ROC_PR_line: str='darkorange',
+                 calibration_bins: Union[int, Literal['auto']]='auto',
                  font_size: int = 25,
                  xtick_size: int=20,
                  ytick_size: int=20,
@@ -520,6 +527,7 @@ class FormatMultiLabelBinaryClassificationMetrics(_BaseMultiLabelFormat):
                  ) -> None:
         super().__init__(cmap=cmap,
                          ROC_PR_line=ROC_PR_line,
+                         calibration_bins=calibration_bins,
                          font_size=font_size,
                          xtick_size=xtick_size,
                          ytick_size=ytick_size,
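The practical effect for users of these format classes: calibration_bins now defaults to 'auto', and any non-int or non-positive value falls back to a sample-size based bin count (5, 10, or 15) in the evaluation code shown below. A hedged sketch, assuming the classes are re-exported from ml_tools.ML_configuration and that the remaining constructor arguments keep their defaults:

    from ml_tools.ML_configuration import FormatBinaryClassificationMetrics

    fmt_auto = FormatBinaryClassificationMetrics()                      # calibration_bins='auto', bins picked at plot time
    fmt_fixed = FormatBinaryClassificationMetrics(calibration_bins=20)  # explicit int keeps a fixed bin count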
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/ML_evaluation/_classification.py
RENAMED
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.calibration import CalibrationDisplay
+from sklearn.calibration import calibration_curve
 from sklearn.metrics import (
     classification_report,
     ConfusionMatrixDisplay,
@@ -378,42 +378,42 @@ def classification_metrics(save_dir: Union[str, Path],
 
     # --- Save Calibration Plot ---
     fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+    user_chosen_bins = format_config.calibration_bins
+
+    # --- Automate Bin Selection ---
+    if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+        # Determine bins based on number of samples
+        n_samples = y_true.shape[0]
+        if n_samples < 200:
+            dynamic_bins = 5
+        elif n_samples < 1000:
+            dynamic_bins = 10
+        else:
+            dynamic_bins = 15
+    else:
+        dynamic_bins = user_chosen_bins
+
+    # --- Step 1: Get binned data directly ---
+    # calculates reliability diagram data without needing a temporary plot
+    prob_true, prob_pred = calibration_curve(y_true_binary, y_score, n_bins=dynamic_bins)
 
-    # --- Step
-    with plt.ioff(): # Suppress showing the temporary plot
-        fig_temp, ax_temp = plt.subplots()
-        cal_display_temp = CalibrationDisplay.from_predictions(
-            y_true_binary, # Use binarized labels
-            y_score,
-            n_bins=format_config.calibration_bins,
-            ax=ax_temp,
-            name="temp" # Add a name to suppress potential warnings
-        )
-        # Get the x, y coordinates of the binned data
-        line_x, line_y = cal_display_temp.line_.get_data() # type: ignore
-        plt.close(fig_temp) # Close the temporary plot
-
-    # --- Step 2: Build the plot from scratch ---
+    # --- Step 2: Plot ---
     ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
 
-
-
-
-
-
-
-
-        'color': format_config.ROC_PR_line,
-        'linestyle': '--',
-        'linewidth': 2,
-    }
-)
+    # Plot the actual calibration curve (connect points with a line)
+    ax_cal.plot(prob_pred,
+                prob_true,
+                marker='o',  # Add markers to see bin locations
+                linewidth=2,
+                label="Model calibration",
+                color=format_config.ROC_PR_line)
 
     ax_cal.set_title(f'Reliability Curve{plot_title}', pad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size + 2)
     ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
     ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=format_config.font_size)
 
-    # --- Step 3: Set final limits
+    # --- Step 3: Set final limits ---
     ax_cal.set_ylim(0.0, 1.0)
     ax_cal.set_xlim(0.0, 1.0)
 
@@ -428,7 +428,7 @@ def classification_metrics(save_dir: Union[str, Path],
     cal_path = save_dir_path / f"calibration_plot{save_suffix}.svg"
     plt.savefig(cal_path)
     plt.close(fig_cal)
-
+
     _LOGGER.info(f"📈 Saved {len(class_indices_to_plot)} sets of ROC, Precision-Recall, and Calibration plots.")
 
 
@@ -632,6 +632,52 @@ def multi_label_classification_metrics(
         pr_path = save_dir_path / f"pr_curve_{sanitized_name}.svg"
         plt.savefig(pr_path)
         plt.close(fig_pr)
+
+        # --- Save Calibration Plot (New Feature) ---
+        fig_cal, ax_cal = plt.subplots(figsize=CLASSIFICATION_PLOT_SIZE, dpi=DPI_value)
+
+        user_chosen_bins = format_config.calibration_bins
+
+        # --- Automate Bin Selection ---
+        if not isinstance(user_chosen_bins, int) or user_chosen_bins <= 0:
+            # Determine bins based on number of samples
+            n_samples = y_true.shape[0]
+            if n_samples < 200:
+                dynamic_bins = 5
+            elif n_samples < 1000:
+                dynamic_bins = 10
+            else:
+                dynamic_bins = 15
+        else:
+            dynamic_bins = user_chosen_bins
+
+        # Calculate calibration curve for this specific label
+        prob_true, prob_pred = calibration_curve(true_i, prob_i, n_bins=dynamic_bins)
+
+        ax_cal.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
+        ax_cal.plot(prob_pred,
+                    prob_true,
+                    marker='o',
+                    linewidth=2,
+                    label=f"Calibration for '{name}'",
+                    color=format_config.ROC_PR_line)
+
+        ax_cal.set_title(f'Reliability Curve for "{name}"', pad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size + 2)
+        ax_cal.set_xlabel('Mean Predicted Probability', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+        ax_cal.set_ylabel('Fraction of Positives', labelpad=_EvaluationConfig.LABEL_PADDING, fontsize=base_font_size)
+
+        ax_cal.set_ylim(0.0, 1.0)
+        ax_cal.set_xlim(0.0, 1.0)
+
+        ax_cal.tick_params(axis='x', labelsize=xtick_size)
+        ax_cal.tick_params(axis='y', labelsize=ytick_size)
+        ax_cal.legend(loc='lower right', fontsize=legend_size)
+        ax_cal.grid(True)
+
+        plt.tight_layout()
+        cal_path = save_dir_path / f"calibration_plot_{sanitized_name}.svg"
+        plt.savefig(cal_path)
+        plt.close(fig_cal)
 
     _LOGGER.info(f"All individual label reports and plots saved to '{save_dir_path.name}'")
 
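The temporary CalibrationDisplay figure is gone; both functions now call scikit-learn's calibration_curve directly and plot the returned bin data themselves. A standalone sketch of that primitive with made-up arrays:

    import numpy as np
    from sklearn.calibration import calibration_curve

    y_true = np.array([0, 1, 1, 0, 1, 1, 0, 1])
    y_prob = np.array([0.1, 0.8, 0.65, 0.3, 0.9, 0.55, 0.2, 0.7])

    # prob_true: observed fraction of positives per bin; prob_pred: mean predicted probability per bin
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=5)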
{dragon_ml_toolbox-20.5.0 → dragon_ml_toolbox-20.7.0}/ml_tools/utilities/__init__.py
RENAMED
@@ -15,6 +15,13 @@ from ._utility_tools import (
     train_dataset_yielder
 )
 
+from ._translate import (
+    translate_dataframe_columns,
+    create_translation_template,
+    audit_column_translation
+)
+
+
 from .._core import _imprimir_disponibles
 
 
@@ -27,6 +34,9 @@ __all__ = [
     "save_dataframe",
     "save_dataframe_with_schema",
     "merge_dataframes",
+    "translate_dataframe_columns",
+    "create_translation_template",
+    "audit_column_translation",
     "distribute_dataset_by_target",
     "train_dataset_orchestrator",
     "train_dataset_yielder"