dragon-ml-toolbox 19.13.0__tar.gz → 19.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.13.0/dragon_ml_toolbox.egg-info → dragon_ml_toolbox-19.14.0}/PKG-INFO +1 -1
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0/dragon_ml_toolbox.egg-info}/PKG-INFO +1 -1
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_data_exploration.py +123 -115
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/pyproject.toml +1 -1
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/LICENSE +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/README.md +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/SOURCES.txt +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/dependency_links.txt +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/requires.txt +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/top_level.txt +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ETL_engineering.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/GUI_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/IO_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/MICE_imputation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_callbacks.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_chaining_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_chaining_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_configuration.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_configuration_pytab.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models_advanced.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_models_pytab.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_optimization.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_optimization_pareto.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_scaler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_trainer.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_vision_transformers.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/PSO_optimization.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/SQL.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/VIF_factor.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/__init__.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ETL_cleaning.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ETL_engineering.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_GUI_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_IO_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_MICE_imputation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_callbacks.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_configuration.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_configuration_pytab.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_captum.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_multi.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_finalize_handler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models_advanced.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models_pytab.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_optimization.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_optimization_pareto.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_scaler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_trainer.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_datasetmaster.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_models.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_transformers.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_PSO_optimization.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_SQL.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_VIF_factor.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/__init__.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_learning.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_excel_handler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_keys.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_logger.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_math_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_base.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_helpers.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_optimization_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_path_manager.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_plot_fonts.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_schema.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_script_info.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_serde.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/constants.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/data_exploration.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_evaluation.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_inference.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ensemble_learning.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/excel_handler.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/keys.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/math_utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/optimization_tools.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/path_manager.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/plot_fonts.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/schema.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/serde.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/utilities.py +0 -0
- {dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 19.13.0
|
|
3
|
+
Version: 19.14.0
|
|
4
4
|
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dragon-ml-toolbox
|
|
3
|
-
Version: 19.13.0
|
|
3
|
+
Version: 19.14.0
|
|
4
4
|
Summary: Complete pipelines and helper tools for data science and machine learning projects.
|
|
5
5
|
Author-email: Karl Luigi Loza Vidaurre <luigiloza@gmail.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -26,13 +26,13 @@ __all__ = [
|
|
|
26
26
|
"drop_macro",
|
|
27
27
|
"clean_column_names",
|
|
28
28
|
"plot_value_distributions",
|
|
29
|
-
"plot_continuous_vs_target",
|
|
30
|
-
"plot_categorical_vs_target",
|
|
31
29
|
"split_features_targets",
|
|
32
30
|
"encode_categorical_features",
|
|
33
31
|
"clip_outliers_single",
|
|
34
32
|
"clip_outliers_multi",
|
|
35
33
|
"drop_outlier_samples",
|
|
34
|
+
"plot_continuous_vs_target",
|
|
35
|
+
"plot_categorical_vs_target",
|
|
36
36
|
"plot_correlation_heatmap",
|
|
37
37
|
"finalize_feature_schema",
|
|
38
38
|
"match_and_filter_columns_by_regex",
|
|
@@ -59,16 +59,18 @@ def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
|
|
|
59
59
|
"""
|
|
60
60
|
summary = pd.DataFrame({
|
|
61
61
|
'Data Type': df.dtypes,
|
|
62
|
-
'
|
|
62
|
+
'Completeness %': (df.notnull().mean() * 100).round(2),
|
|
63
63
|
'Unique Values': df.nunique(),
|
|
64
|
-
'Missing %': (df.isnull().mean() * 100).round(
|
|
64
|
+
# 'Missing %': (df.isnull().mean() * 100).round(2)
|
|
65
65
|
})
|
|
66
66
|
|
|
67
67
|
# For numeric columns, add summary statistics
|
|
68
68
|
numeric_cols = df.select_dtypes(include='number').columns
|
|
69
69
|
if not numeric_cols.empty:
|
|
70
|
-
|
|
71
|
-
|
|
70
|
+
stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
|
|
71
|
+
|
|
72
|
+
summary_numeric = stats.T[
|
|
73
|
+
['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
|
|
72
74
|
].round(round_digits)
|
|
73
75
|
summary = summary.join(summary_numeric, how='left')
|
|
74
76
|
|
|
@@ -596,68 +598,55 @@ def plot_value_distributions(
|
|
|
596
598
|
|
|
597
599
|
|
|
598
600
|
def plot_continuous_vs_target(
|
|
599
|
-
|
|
600
|
-
|
|
601
|
+
df_continuous: pd.DataFrame,
|
|
602
|
+
df_targets: pd.DataFrame,
|
|
601
603
|
save_dir: Union[str, Path],
|
|
602
|
-
|
|
604
|
+
verbose: int = 1
|
|
603
605
|
):
|
|
604
606
|
"""
|
|
605
|
-
Plots each continuous feature against each target
|
|
607
|
+
Plots each continuous feature from df_continuous against each target in df_targets.
|
|
606
608
|
|
|
607
|
-
This function
|
|
608
|
-
|
|
609
|
-
regression line, and saves each plot as an individual .svg file.
|
|
609
|
+
This function creates a scatter plot for each feature-target pair, overlays a
|
|
610
|
+
simple linear regression line, and saves each plot as an individual .svg file.
|
|
610
611
|
|
|
611
612
|
Plots are saved in a structured way, with a subdirectory created for
|
|
612
613
|
each target variable.
|
|
613
614
|
|
|
614
615
|
Args:
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
save_dir (str | Path): The base directory where plots will be saved.
|
|
618
|
-
|
|
619
|
-
DataFrame will be used.
|
|
616
|
+
df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
|
|
617
|
+
df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
|
|
618
|
+
save_dir (str | Path): The base directory where plots will be saved.
|
|
619
|
+
verbose (int): Verbosity level for logging warnings.
|
|
620
620
|
|
|
621
621
|
Notes:
|
|
622
|
-
- Only numeric features and numeric targets are processed.
|
|
623
|
-
|
|
624
|
-
-
|
|
625
|
-
pairwise for each plot.
|
|
622
|
+
- Only numeric features and numeric targets are processed.
|
|
623
|
+
- Rows with NaN in either the feature or the target are dropped pairwise.
|
|
624
|
+
- Assumes df_continuous and df_targets share the same index.
|
|
626
625
|
"""
|
|
627
626
|
# 1. Validate the base save directory
|
|
628
627
|
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
629
628
|
|
|
630
|
-
# 2.
|
|
631
|
-
def
|
|
629
|
+
# 2. Validation helper
|
|
630
|
+
def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
|
|
632
631
|
valid_cols = []
|
|
633
|
-
for col in
|
|
634
|
-
if
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
_LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
|
|
632
|
+
for col in df.columns:
|
|
633
|
+
if not is_numeric_dtype(df[col]):
|
|
634
|
+
if verbose > 0:
|
|
635
|
+
_LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
|
|
638
636
|
else:
|
|
639
637
|
valid_cols.append(col)
|
|
640
638
|
return valid_cols
|
|
641
639
|
|
|
642
|
-
# 3. Validate target columns
|
|
643
|
-
valid_targets =
|
|
640
|
+
# 3. Validate target columns
|
|
641
|
+
valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
|
|
644
642
|
if not valid_targets:
|
|
645
|
-
_LOGGER.error("No valid numeric target columns provided
|
|
643
|
+
_LOGGER.error("No valid numeric target columns provided in df_targets.")
|
|
646
644
|
return
|
|
647
645
|
|
|
648
|
-
# 4.
|
|
649
|
-
|
|
650
|
-
_LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
|
|
651
|
-
target_set = set(valid_targets)
|
|
652
|
-
# Get all columns that are not in the valid_targets set
|
|
653
|
-
features_to_validate = [col for col in df.columns if col not in target_set]
|
|
654
|
-
else:
|
|
655
|
-
features_to_validate = features
|
|
656
|
-
|
|
657
|
-
valid_features = _validate_numeric_cols(features_to_validate, "Feature")
|
|
658
|
-
|
|
646
|
+
# 4. Validate feature columns
|
|
647
|
+
valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
|
|
659
648
|
if not valid_features:
|
|
660
|
-
_LOGGER.error("No valid numeric feature columns
|
|
649
|
+
_LOGGER.error("No valid numeric feature columns provided in df_continuous.")
|
|
661
650
|
return
|
|
662
651
|
|
|
663
652
|
# 5. Main plotting loop
|
|
@@ -669,15 +658,20 @@ def plot_continuous_vs_target(
|
|
|
669
658
|
target_save_dir = base_save_path / safe_target_dir_name
|
|
670
659
|
target_save_dir.mkdir(parents=True, exist_ok=True)
|
|
671
660
|
|
|
672
|
-
|
|
661
|
+
if verbose > 0:
|
|
662
|
+
_LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
|
|
673
663
|
|
|
674
664
|
for feature_name in valid_features:
|
|
675
665
|
|
|
676
|
-
#
|
|
677
|
-
temp_df =
|
|
666
|
+
# Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
|
|
667
|
+
temp_df = pd.concat([
|
|
668
|
+
df_continuous[feature_name],
|
|
669
|
+
df_targets[target_name]
|
|
670
|
+
], axis=1).dropna()
|
|
678
671
|
|
|
679
672
|
if temp_df.empty:
|
|
680
|
-
|
|
673
|
+
if verbose > 1:
|
|
674
|
+
_LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
|
|
681
675
|
continue
|
|
682
676
|
|
|
683
677
|
x = temp_df[feature_name]
|
|
@@ -685,11 +679,12 @@ def plot_continuous_vs_target(
|
|
|
685
679
|
|
|
686
680
|
# 6. Perform linear fit
|
|
687
681
|
try:
|
|
688
|
-
# Modern replacement for np.polyfit + np.poly1d
|
|
682
|
+
# Modern replacement for np.polyfit + np.poly1d
|
|
689
683
|
p = np.polynomial.Polynomial.fit(x, y, deg=1)
|
|
690
684
|
plot_regression_line = True
|
|
691
685
|
except (np.linalg.LinAlgError, ValueError):
|
|
692
|
-
|
|
686
|
+
if verbose > 0:
|
|
687
|
+
_LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
|
|
693
688
|
plot_regression_line = False
|
|
694
689
|
|
|
695
690
|
# 7. Create the plot
|
|
@@ -723,77 +718,68 @@ def plot_continuous_vs_target(
|
|
|
723
718
|
|
|
724
719
|
# Close the figure to free up memory
|
|
725
720
|
plt.close()
|
|
726
|
-
|
|
727
|
-
|
|
721
|
+
|
|
722
|
+
if verbose > 0:
|
|
723
|
+
_LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
|
|
728
724
|
|
|
729
725
|
|
|
730
726
|
def plot_categorical_vs_target(
|
|
731
|
-
|
|
732
|
-
|
|
727
|
+
df_categorical: pd.DataFrame,
|
|
728
|
+
df_targets: pd.DataFrame,
|
|
733
729
|
save_dir: Union[str, Path],
|
|
734
|
-
features: Optional[List[str]] = None,
|
|
735
730
|
max_categories: int = 50,
|
|
736
|
-
fill_na_with: str = "MISSING DATA"
|
|
731
|
+
fill_na_with: str = "MISSING DATA",
|
|
732
|
+
drop_empty_targets: bool = True,
|
|
733
|
+
verbose: int = 1
|
|
737
734
|
):
|
|
738
735
|
"""
|
|
739
|
-
Plots each
|
|
736
|
+
Plots each feature in df_categorical against each numeric target in df_targets using box plots.
|
|
740
737
|
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
dependent variable.
|
|
744
|
-
|
|
745
|
-
Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
|
|
738
|
+
Automatically aligns the two DataFrames by index. If a numeric
|
|
739
|
+
column is passed within df_categorical, it will be cast to object type to treat it as a category.
|
|
746
740
|
|
|
747
741
|
Args:
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
save_dir (str | Path):
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
742
|
+
df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
|
|
743
|
+
df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
|
|
744
|
+
save_dir (str | Path): Base directory for saving plots.
|
|
745
|
+
max_categories (int): The maximum number of unique categories a feature can have to be plotted.
|
|
746
|
+
fill_na_with (str): String to replace NaN values in categorical columns.
|
|
747
|
+
drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
|
|
748
|
+
verbose (int): Verbosity level for logging warnings.
|
|
754
749
|
|
|
755
750
|
Notes:
|
|
756
|
-
-
|
|
757
|
-
- Features are automatically identified as categorical if they are 'object' dtype.
|
|
751
|
+
- Assumes df_categorical and df_targets share the same index.
|
|
758
752
|
"""
|
|
759
|
-
# 1. Validate the base save directory
|
|
753
|
+
# 1. Validate the base save directory
|
|
760
754
|
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
761
755
|
|
|
762
756
|
# 2. Validate target columns (must be numeric)
|
|
763
757
|
valid_targets = []
|
|
764
|
-
for col in
|
|
765
|
-
if
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
_LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
|
|
758
|
+
for col in df_targets.columns:
|
|
759
|
+
if not is_numeric_dtype(df_targets[col]):
|
|
760
|
+
if verbose > 0:
|
|
761
|
+
_LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
|
|
769
762
|
else:
|
|
770
763
|
valid_targets.append(col)
|
|
771
764
|
|
|
772
765
|
if not valid_targets:
|
|
773
|
-
_LOGGER.error("No valid numeric target columns provided
|
|
766
|
+
_LOGGER.error("No valid numeric target columns provided in df_targets.")
|
|
774
767
|
return
|
|
775
768
|
|
|
776
|
-
# 3.
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
if
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
else:
|
|
788
|
-
# Validate user-provided list
|
|
789
|
-
for col in features:
|
|
790
|
-
if col not in df.columns:
|
|
791
|
-
_LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
|
|
792
|
-
else:
|
|
793
|
-
features_to_plot.append(col)
|
|
769
|
+
# 3. Validate feature columns (Flexible: Allow numeric but warn)
|
|
770
|
+
valid_features = []
|
|
771
|
+
for col in df_categorical.columns:
|
|
772
|
+
# If numeric, warn but accept it (will be cast to object later)
|
|
773
|
+
if is_numeric_dtype(df_categorical[col]):
|
|
774
|
+
if verbose > 0:
|
|
775
|
+
_LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
|
|
776
|
+
valid_features.append(col)
|
|
777
|
+
else:
|
|
778
|
+
# Assume it is already object/category
|
|
779
|
+
valid_features.append(col)
|
|
794
780
|
|
|
795
|
-
if not
|
|
796
|
-
_LOGGER.error("No valid
|
|
781
|
+
if not valid_features:
|
|
782
|
+
_LOGGER.error("No valid feature columns provided in df_categorical.")
|
|
797
783
|
return
|
|
798
784
|
|
|
799
785
|
# 4. Main plotting loop
|
|
@@ -805,29 +791,47 @@ def plot_categorical_vs_target(
|
|
|
805
791
|
target_save_dir = base_save_path / safe_target_dir_name
|
|
806
792
|
target_save_dir.mkdir(parents=True, exist_ok=True)
|
|
807
793
|
|
|
808
|
-
|
|
809
|
-
|
|
794
|
+
if verbose > 0:
|
|
795
|
+
_LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
|
|
796
|
+
|
|
797
|
+
for feature_name in valid_features:
|
|
798
|
+
|
|
799
|
+
# Align data using concat to respect indices
|
|
800
|
+
feature_series = df_categorical[feature_name]
|
|
801
|
+
target_series = df_targets[target_name]
|
|
802
|
+
|
|
803
|
+
# Create a temporary DataFrame for this pair
|
|
804
|
+
temp_df = pd.concat([feature_series, target_series], axis=1)
|
|
805
|
+
|
|
806
|
+
# Optional: Drop rows where the target is NaN
|
|
807
|
+
if drop_empty_targets:
|
|
808
|
+
temp_df = temp_df.dropna(subset=[target_name])
|
|
809
|
+
if temp_df.empty:
|
|
810
|
+
if verbose > 1:
|
|
811
|
+
_LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
|
|
812
|
+
continue
|
|
813
|
+
|
|
814
|
+
# Force feature to object if it isn't already (handling the numeric flexibility)
|
|
815
|
+
if not is_object_dtype(temp_df[feature_name]):
|
|
816
|
+
temp_df[feature_name] = temp_df[feature_name].astype(object)
|
|
817
|
+
|
|
818
|
+
# Handle NaNs in the feature column (treat as a category)
|
|
819
|
+
if temp_df[feature_name].isnull().any():
|
|
820
|
+
temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
|
|
810
821
|
|
|
811
|
-
#
|
|
812
|
-
temp_df =
|
|
822
|
+
# Convert to string to ensure consistent plotting and cardinality check
|
|
823
|
+
temp_df[feature_name] = temp_df[feature_name].astype(str)
|
|
813
824
|
|
|
814
825
|
# Check cardinality
|
|
815
826
|
n_unique = temp_df[feature_name].nunique()
|
|
816
827
|
if n_unique > max_categories:
|
|
817
|
-
|
|
828
|
+
if verbose > 1:
|
|
829
|
+
_LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
|
|
818
830
|
continue
|
|
819
|
-
|
|
820
|
-
# Handle NaNs by replacing them with the specified string
|
|
821
|
-
if temp_df[feature_name].isnull().any():
|
|
822
|
-
# Convert to object type first to allow string replacement
|
|
823
|
-
temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
|
|
824
|
-
|
|
825
|
-
# Convert feature to string to ensure correct plotting order
|
|
826
|
-
temp_df[feature_name] = temp_df[feature_name].astype(str)
|
|
827
831
|
|
|
828
832
|
# 5. Create the plot
|
|
829
|
-
#
|
|
830
|
-
plt.figure(figsize=(max(10, n_unique *
|
|
833
|
+
# Dynamic figure width based on number of categories
|
|
834
|
+
plt.figure(figsize=(max(10, n_unique * 0.8), 10))
|
|
831
835
|
|
|
832
836
|
sns.boxplot(x=feature_name, y=target_name, data=temp_df)
|
|
833
837
|
|
|
@@ -850,8 +854,9 @@ def plot_categorical_vs_target(
|
|
|
850
854
|
_LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
|
|
851
855
|
|
|
852
856
|
plt.close()
|
|
853
|
-
|
|
854
|
-
|
|
857
|
+
|
|
858
|
+
if verbose > 0:
|
|
859
|
+
_LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
|
|
855
860
|
|
|
856
861
|
|
|
857
862
|
def encode_categorical_features(
|
|
@@ -1078,7 +1083,10 @@ def plot_correlation_heatmap(df: pd.DataFrame,
|
|
|
1078
1083
|
annot=annot_bool,
|
|
1079
1084
|
cmap='coolwarm',
|
|
1080
1085
|
fmt=".2f",
|
|
1081
|
-
cbar_kws={"shrink": 0.8}
|
|
1086
|
+
cbar_kws={"shrink": 0.8},
|
|
1087
|
+
vmin=-1, # Anchors minimum color to -1
|
|
1088
|
+
vmax=1, # Anchors maximum color to 1
|
|
1089
|
+
center=0 # Ensures 0 corresponds to the neutral color (white)
|
|
1082
1090
|
)
|
|
1083
1091
|
|
|
1084
1092
|
# add suffix to title
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "dragon-ml-toolbox"
|
|
3
|
-
version = "19.13.0"
|
|
3
|
+
version = "19.14.0"
|
|
4
4
|
description = "Complete pipelines and helper tools for data science and machine learning projects."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Karl Luigi Loza Vidaurre", email = "luigiloza@gmail.com" }
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/SOURCES.txt
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/requires.txt
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/dragon_ml_toolbox.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/ML_sequence_datasetmaster.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_chaining_utilities.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_configuration_pytab.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_captum.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_evaluation_multi.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_finalize_handler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_models_advanced.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_optimization_pareto.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_datasetmaster.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_sequence_models.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_datasetmaster.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_inference.py
RENAMED
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ML_vision_transformers.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_evaluation.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_inference.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_ensemble_learning.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_base.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_models_advanced_helpers.py
RENAMED
|
File without changes
|
{dragon_ml_toolbox-19.13.0 → dragon_ml_toolbox-19.14.0}/ml_tools/_core/_optimization_tools.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|