dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1901
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration/_plotting.py (new file, matching the `+487 -0` entry above):

@@ -0,0 +1,487 @@
+import pandas as pd
+import numpy as np
+from typing import Optional, Union, Literal
+from pathlib import Path
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pandas.api.types import is_numeric_dtype, is_object_dtype
+
+from ..path_manager import make_fullpath, sanitize_filename
+from .._core import get_logger
+
+
+_LOGGER = get_logger("Data Exploration: Plotting")
+
+
+__all__ = [
+    "plot_value_distributions",
+    "plot_continuous_vs_target",
+    "plot_categorical_vs_target",
+    "plot_correlation_heatmap",
+]
+
+
+def plot_value_distributions(
+    df: pd.DataFrame,
+    save_dir: Union[str, Path],
+    categorical_columns: Optional[list[str]] = None,
+    max_categories: int = 100,
+    fill_na_with: str = "MISSING DATA"
+):
+    """
+    Plots and saves the value distributions for all columns in a DataFrame,
+    using the best plot type for each column (histogram or count plot).
+
+    Plots are saved as SVG files under two subdirectories in `save_dir`:
+    - "Distribution_Continuous" for continuous numeric features (histograms).
+    - "Distribution_Categorical" for categorical features (count plots).
+
+    Args:
+        df (pd.DataFrame): The input DataFrame to analyze.
+        save_dir (str | Path): Directory path to save the plots.
+        categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
+        max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
+        fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
+
+    Notes:
+        - `seaborn.histplot` with KDE is used for continuous features.
+        - `seaborn.countplot` is used for categorical features.
+    """
+    # 1. Setup save directories
+    base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+    numeric_dir = base_save_path / "Distribution_Continuous"
+    categorical_dir = base_save_path / "Distribution_Categorical"
+    numeric_dir.mkdir(parents=True, exist_ok=True)
+    categorical_dir.mkdir(parents=True, exist_ok=True)
+
+    # 2. Filter columns to plot
+    columns_to_plot = df.columns.to_list()
+
+    # Setup for forced categorical logic
+    categorical_set = set(categorical_columns) if categorical_columns is not None else None
+
+    numeric_plots_saved = 0
+    categorical_plots_saved = 0
+
+    for col_name in columns_to_plot:
+        try:
+            is_numeric = is_numeric_dtype(df[col_name])
+            n_unique = df[col_name].nunique()
+
+            # --- 3. Determine Plot Type ---
+            is_continuous = False
+            if categorical_set is not None:
+                # Use the explicit list
+                if col_name not in categorical_set:
+                    is_continuous = True
+            else:
+                # Use auto-detection
+                if is_numeric:
+                    is_continuous = True
+
+            # --- Case 1: Continuous Numeric (Histogram) ---
+            if is_continuous:
+                plt.figure(figsize=(10, 6))
+                # Drop NaNs for histogram, as they can't be plotted on a numeric axis
+                sns.histplot(x=df[col_name].dropna(), kde=True, bins=30)
+                plt.title(f"Distribution of '{col_name}' (Continuous)")
+                plt.xlabel(col_name)
+                plt.ylabel("Count")
+
+                save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
+                numeric_plots_saved += 1
+
+            # --- Case 2: Categorical (Count Plot) ---
+            else:
+                # Check max categories
+                if n_unique > max_categories:
+                    _LOGGER.warning(f"Skipping plot for '{col_name}': {n_unique} unique values > {max_categories} max_categories.")
+                    continue
+
+                # Adaptive figure size
+                fig_width = max(10, n_unique * 0.5)
+                plt.figure(figsize=(fig_width, 8))
+
+                # Make a temporary copy for plotting to handle NaNs
+                temp_series = df[col_name].copy()
+
+                # Handle NaNs by replacing them with the specified string
+                if temp_series.isnull().any():
+                    # Convert to object type first to allow string replacement
+                    temp_series = temp_series.astype(object).fillna(fill_na_with)
+
+                # Convert all to string to be safe (handles low-card numeric)
+                temp_series = temp_series.astype(str)
+
+                # Get category order by frequency
+                order = temp_series.value_counts().index
+                sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
+
+                plt.title(f"Distribution of '{col_name}' (Categorical)")
+                plt.xlabel(col_name)
+                plt.ylabel("Count")
+
+                # Smart tick rotation
+                max_label_len = 0
+                if n_unique > 0:
+                    max_label_len = max(len(str(s)) for s in order)
+
+                # Rotate if labels are long OR there are many categories
+                if max_label_len > 10 or n_unique > 25:
+                    plt.xticks(rotation=45, ha='right')
+
+                save_path = categorical_dir / f"{sanitize_filename(col_name)}.svg"
+                categorical_plots_saved += 1
+
+            # --- 4. Save Plot ---
+            plt.grid(True, linestyle='--', alpha=0.6, axis='y')
+            plt.tight_layout()
+            # Save as .svg
+            plt.savefig(save_path, format='svg', bbox_inches="tight")
+            plt.close()
+
+        except Exception as e:
+            _LOGGER.error(f"Failed to plot distribution for '{col_name}'. Error: {e}")
+            plt.close()
+
+    _LOGGER.info(f"Saved {numeric_plots_saved} continuous distribution plots to '{numeric_dir.name}'.")
+    _LOGGER.info(f"Saved {categorical_plots_saved} categorical distribution plots to '{categorical_dir.name}'.")
+
+
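For orientation, a minimal usage sketch of the new `plot_value_distributions` API as reconstructed above. The toy DataFrame, the output path, and the import from `ml_tools.data_exploration` (the subpackage that re-exports this module per its `__init__.py`) are illustrative assumptions, not part of this diff.

```python
# Hypothetical usage sketch; toy data and paths are illustrative only.
import pandas as pd
from ml_tools.data_exploration import plot_value_distributions

df = pd.DataFrame({
    "age": [23, 35, 41, 29, None, 52],        # continuous -> histogram + KDE
    "city": ["A", "B", "A", None, "C", "B"],  # categorical -> count plot
})

# The None in 'city' is plotted as its own "MISSING DATA" category; one SVG
# per column lands under Distribution_Continuous/ or Distribution_Categorical/.
plot_value_distributions(
    df,
    save_dir="reports/distributions",
    categorical_columns=["city"],  # all other columns are treated as continuous
)
```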
+def plot_continuous_vs_target(
+    df_continuous: pd.DataFrame,
+    df_targets: pd.DataFrame,
+    save_dir: Union[str, Path],
+    verbose: int = 1
+):
+    """
+    Plots each continuous feature from df_continuous against each target in df_targets.
+
+    This function creates a scatter plot for each feature-target pair, overlays a
+    simple linear regression line, and saves each plot as an individual .svg file.
+
+    Plots are saved in a structured way, with a subdirectory created for
+    each target variable.
+
+    Args:
+        df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
+        save_dir (str | Path): The base directory where plots will be saved.
+        verbose (int): Verbosity level for logging warnings.
+
+    Notes:
+        - Only numeric features and numeric targets are processed.
+        - Rows with NaN in either the feature or the target are dropped pairwise.
+        - Assumes df_continuous and df_targets share the same index.
+    """
+    # 1. Validate the base save directory
+    base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+    # 2. Validation helper
+    def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> list[str]:
+        valid_cols = []
+        for col in df.columns:
+            if not is_numeric_dtype(df[col]):
+                if verbose > 0:
+                    _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
+            else:
+                valid_cols.append(col)
+        return valid_cols
+
+    # 3. Validate target columns
+    valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
+    if not valid_targets:
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
+        return
+
+    # 4. Validate feature columns
+    valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
+    if not valid_features:
+        _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
+        return
+
+    # 5. Main plotting loop
+    total_plots_saved = 0
+
+    for target_name in valid_targets:
+        # Create a sanitized subdirectory for this target
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Continuous")
+        target_save_dir = base_save_path / safe_target_dir_name
+        target_save_dir.mkdir(parents=True, exist_ok=True)
+
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+        for feature_name in valid_features:
+
+            # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
+            temp_df = pd.concat([
+                df_continuous[feature_name],
+                df_targets[target_name]
+            ], axis=1).dropna()
+
+            if temp_df.empty:
+                if verbose > 1:
+                    _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
+                continue
+
+            x = temp_df[feature_name]
+            y = temp_df[target_name]
+
+            # 6. Perform linear fit
+            try:
+                # Modern replacement for np.polyfit + np.poly1d
+                p = np.polynomial.Polynomial.fit(x, y, deg=1)
+                plot_regression_line = True
+            except (np.linalg.LinAlgError, ValueError):
+                if verbose > 0:
+                    _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
+                plot_regression_line = False
+
+            # 7. Create the plot
+            plt.figure(figsize=(10, 6))
+            ax = plt.gca()
+
+            # Plot the raw data points
+            ax.plot(x, y, 'o', alpha=0.5, label='Data points', markersize=5)
+
+            # Plot the regression line
+            if plot_regression_line:
+                ax.plot(x, p(x), "r--", label='Linear Fit') # type: ignore
+
+            ax.set_title(f'{feature_name} vs {target_name}')
+            ax.set_xlabel(feature_name)
+            ax.set_ylabel(target_name)
+            ax.legend()
+            plt.grid(True, linestyle='--', alpha=0.6)
+            plt.tight_layout()
+
+            # 8. Save the plot
+            safe_feature_name = sanitize_filename(feature_name)
+            plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
+            plot_path = target_save_dir / plot_filename
+
+            try:
+                plt.savefig(plot_path, bbox_inches="tight", format='svg')
+                total_plots_saved += 1
+            except Exception as e:
+                _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
+
+            # Close the figure to free up memory
+            plt.close()
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
+
+
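A hedged sketch of calling `plot_continuous_vs_target`; per the naming scheme in the reconstructed code, each plot is saved as `<feature>_vs_<target>_vs_Continuous.svg` inside a per-target subdirectory. The data and import path are assumptions.

```python
# Hypothetical sketch: scatter + linear fit per feature/target pair.
import pandas as pd
from ml_tools.data_exploration import plot_continuous_vs_target

features = pd.DataFrame({"temperature": [10.0, 15.0, 20.0, 25.0],
                         "pressure": [1.0, 1.2, 0.9, 1.1]})
targets = pd.DataFrame({"yield": [0.52, 0.61, 0.70, 0.78]})  # same index as features

# Writes e.g. reports/targets/yield_vs_Continuous/temperature_vs_yield_vs_Continuous.svg
plot_continuous_vs_target(features, targets, save_dir="reports/targets", verbose=1)
```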
+def plot_categorical_vs_target(
+    df_categorical: pd.DataFrame,
+    df_targets: pd.DataFrame,
+    save_dir: Union[str, Path],
+    max_categories: int = 50,
+    fill_na_with: str = "MISSING DATA",
+    drop_empty_targets: bool = True,
+    verbose: int = 1
+):
+    """
+    Plots each feature in df_categorical against each numeric target in df_targets using box plots.
+
+    Automatically aligns the two DataFrames by index. If a numeric
+    column is passed within df_categorical, it will be cast to object type to treat it as a category.
+
+    Args:
+        df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
+        df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
+        save_dir (str | Path): Base directory for saving plots.
+        max_categories (int): The maximum number of unique categories a feature can have to be plotted.
+        fill_na_with (str): String to replace NaN values in categorical columns.
+        drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
+        verbose (int): Verbosity level for logging warnings.
+
+    Notes:
+        - Assumes df_categorical and df_targets share the same index.
+    """
+    # 1. Validate the base save directory
+    base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
+
+    # 2. Validate target columns (must be numeric)
+    valid_targets = []
+    for col in df_targets.columns:
+        if not is_numeric_dtype(df_targets[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
+        else:
+            valid_targets.append(col)
+
+    if not valid_targets:
+        _LOGGER.error("No valid numeric target columns provided in df_targets.")
+        return
+
+    # 3. Validate feature columns (Flexible: Allow numeric but warn)
+    valid_features = []
+    for col in df_categorical.columns:
+        # If numeric, warn but accept it (will be cast to object later)
+        if is_numeric_dtype(df_categorical[col]):
+            if verbose > 0:
+                _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
+            valid_features.append(col)
+        else:
+            # Assume it is already object/category
+            valid_features.append(col)
+
+    if not valid_features:
+        _LOGGER.error("No valid feature columns provided in df_categorical.")
+        return
+
+    # 4. Main plotting loop
+    total_plots_saved = 0
+
+    for target_name in valid_targets:
+        # Create a sanitized subdirectory for this target
+        safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
+        target_save_dir = base_save_path / safe_target_dir_name
+        target_save_dir.mkdir(parents=True, exist_ok=True)
+
+        if verbose > 0:
+            _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
+
+        for feature_name in valid_features:
+
+            # Align data using concat to respect indices
+            feature_series = df_categorical[feature_name]
+            target_series = df_targets[target_name]
+
+            # Create a temporary DataFrame for this pair
+            temp_df = pd.concat([feature_series, target_series], axis=1)
+
+            # Optional: Drop rows where the target is NaN
+            if drop_empty_targets:
+                temp_df = temp_df.dropna(subset=[target_name])
+                if temp_df.empty:
+                    if verbose > 1:
+                        _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
+                    continue
+
+            # Force feature to object if it isn't already (handling the numeric flexibility)
+            if not is_object_dtype(temp_df[feature_name]):
+                temp_df[feature_name] = temp_df[feature_name].astype(object)
+
+            # Handle NaNs in the feature column (treat as a category)
+            if temp_df[feature_name].isnull().any():
+                temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
+
+            # Convert to string to ensure consistent plotting and cardinality check
+            temp_df[feature_name] = temp_df[feature_name].astype(str)
+
+            # Check cardinality
+            n_unique = temp_df[feature_name].nunique()
+            if n_unique > max_categories:
+                if verbose > 1:
+                    _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
+                continue
+
+            # 5. Create the plot
+            # Dynamic figure width based on number of categories
+            plt.figure(figsize=(max(10, n_unique * 0.8), 10))
+
+            sns.boxplot(x=feature_name, y=target_name, data=temp_df)
+
+            plt.title(f'{target_name} vs {feature_name}')
+            plt.xlabel(feature_name)
+            plt.ylabel(target_name)
+            plt.xticks(rotation=45, ha='right')
+            plt.grid(True, linestyle='--', alpha=0.6, axis='y')
+            plt.tight_layout()
+
+            # 6. Save the plot
+            safe_feature_name = sanitize_filename(feature_name)
+            plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
+            plot_path = target_save_dir / plot_filename
+
+            try:
+                plt.savefig(plot_path, bbox_inches="tight", format='svg')
+                total_plots_saved += 1
+            except Exception as e:
+                _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
+
+            plt.close()
+
+    if verbose > 0:
+        _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
+
+
+
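A minimal sketch of `plot_categorical_vs_target`, showing the two NaN policies visible in the reconstructed code: NaN targets are dropped (when `drop_empty_targets=True`), while NaN feature values become their own category. Data and import path are assumptions.

```python
# Hypothetical sketch: one box plot per (categorical feature, numeric target) pair.
import pandas as pd
from ml_tools.data_exploration import plot_categorical_vs_target

cats = pd.DataFrame({"alloy": ["steel", "brass", "steel", None, "copper"]})
targets = pd.DataFrame({"hardness": [120.0, 95.0, 130.0, 88.0, None]})

# The last row is dropped (NaN target); the None in 'alloy' is kept and
# plotted as the "MISSING DATA" category.
plot_categorical_vs_target(cats, targets, save_dir="reports/boxplots",
                           max_categories=50, drop_empty_targets=True)
```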
+def plot_correlation_heatmap(df: pd.DataFrame,
+                             plot_title: str,
+                             save_dir: Union[str, Path, None] = None,
+                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
+    """
+    Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
+
+    Args:
+        df (pd.DataFrame): The input dataset.
+        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
+        plot_title: The suffix "`method` Correlation Heatmap" will be automatically appended.
+        method (str): Correlation method to use. Must be one of:
+            - 'pearson' (default): measures linear correlation (assumes normally distributed data),
+            - 'kendall': rank correlation (non-parametric),
+            - 'spearman': monotonic relationship (non-parametric).
+
+    Notes:
+        - Only numeric columns are included.
+        - Annotations are disabled if there are more than 20 features.
+        - Missing values are handled via pairwise complete observations.
+    """
+    numeric_df = df.select_dtypes(include='number')
+    if numeric_df.empty:
+        _LOGGER.warning("No numeric columns found. Heatmap not generated.")
+        return
+    if method not in ["pearson", "kendall", "spearman"]:
+        _LOGGER.error(f"'method' must be pearson, kendall, or spearman.")
+        raise ValueError()
+
+    corr = numeric_df.corr(method=method)
+
+    # Create a mask for the upper triangle
+    mask = np.triu(np.ones_like(corr, dtype=bool))
+
+    # Plot setup
+    size = max(10, numeric_df.shape[1])
+    plt.figure(figsize=(size, size * 0.8))
+
+    annot_bool = numeric_df.shape[1] <= 20
+    sns.heatmap(
+        corr,
+        mask=mask,
+        annot=annot_bool,
+        cmap='coolwarm',
+        fmt=".2f",
+        cbar_kws={"shrink": 0.8},
+        vmin=-1, # Anchors minimum color to -1
+        vmax=1, # Anchors maximum color to 1
+        center=0 # Ensures 0 corresponds to the neutral color (white)
+    )
+
+    # add suffix to title
+    full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"
+
+    plt.title(full_plot_title)
+    plt.xticks(rotation=45, ha='right')
+    plt.yticks(rotation=0)
+
+    plt.tight_layout()
+
+    if save_dir:
+        save_path = make_fullpath(save_dir, make=True)
+        # sanitize the plot title to save the file
+        sanitized_plot_title = sanitize_filename(plot_title)
+        plot_filename = sanitized_plot_title + ".svg"
+
+        full_path = save_path / plot_filename
+
+        plt.savefig(full_path, bbox_inches="tight", format='svg')
+        _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")
+
+    plt.show()
+    plt.close()
+
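A short sketch of `plot_correlation_heatmap`; note that unlike the other plotting helpers it also calls `plt.show()`, and saving only happens when `save_dir` is given. The toy data is an assumption.

```python
# Hypothetical sketch: masked lower-triangle heatmap of Spearman correlations.
import pandas as pd
from ml_tools.data_exploration import plot_correlation_heatmap

df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 6, 9], "z": [4, 3, 2, 1]})

# Title becomes "Sensor Batch - Spearman Correlation Heatmap"; the SVG is
# written to reports/correlation/ and the figure is then shown.
plot_correlation_heatmap(df, plot_title="Sensor Batch",
                         save_dir="reports/correlation", method="spearman")
```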
ml_tools/data_exploration/_schema_ops.py (new file, matching the `+176 -0` entry above):

@@ -0,0 +1,176 @@
+import pandas as pd
+from typing import Optional
+
+from ..schema import FeatureSchema
+
+from .._core import get_logger
+
+
+_LOGGER = get_logger("Data Exploration: Schema Ops")
+
+
+def finalize_feature_schema(
+    df_features: pd.DataFrame,
+    categorical_mappings: Optional[dict[str, dict[str, int]]]
+) -> FeatureSchema:
+    """
+    Analyzes the final features DataFrame to create a definitive schema.
+
+    This function is the "single source of truth" for column order
+    and type (categorical vs. continuous) for the entire ML pipeline.
+
+    It should be called at the end of the feature engineering process.
+
+    Args:
+        df_features (pd.DataFrame):
+            The final, processed DataFrame containing *only* feature columns
+            in the exact order they will be fed to the model.
+        categorical_mappings (Dict[str, Dict[str, int]] | None):
+            The mappings dictionary generated by
+            `encode_categorical_features`. Can be None if no
+            categorical features exist.
+
+    Returns:
+        FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
+    """
+    feature_names: list[str] = df_features.columns.to_list()
+
+    # Intermediate lists for building
+    continuous_feature_names_list: list[str] = []
+    categorical_feature_names_list: list[str] = []
+    categorical_index_map_dict: dict[int, int] = {}
+
+    # _LOGGER.info("Finalizing feature schema...")
+
+    if categorical_mappings:
+        # --- Categorical features are present ---
+        categorical_names_set = set(categorical_mappings.keys())
+
+        for index, name in enumerate(feature_names):
+            if name in categorical_names_set:
+                # This is a categorical feature
+                cardinality = len(categorical_mappings[name])
+                categorical_index_map_dict[index] = cardinality
+                categorical_feature_names_list.append(name)
+            else:
+                # This is a continuous feature
+                continuous_feature_names_list.append(name)
+
+        # Use the populated dict, or None if it's empty
+        final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
+
+    else:
+        # --- No categorical features ---
+        _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
+        continuous_feature_names_list = list(feature_names)
+        # categorical_feature_names_list remains empty
+        # categorical_index_map_dict remains empty
+        final_index_map = None # Explicitly set to None to match Optional type
+
+    _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
+
+    # Create the final immutable instance
+    schema_instance = FeatureSchema(
+        feature_names=tuple(feature_names),
+        continuous_feature_names=tuple(continuous_feature_names_list),
+        categorical_feature_names=tuple(categorical_feature_names_list),
+        categorical_index_map=final_index_map,
+        categorical_mappings=categorical_mappings
+    )
+
+    return schema_instance
+
+
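A hedged sketch of `finalize_feature_schema`. The mappings dict mimics the output of `encode_categorical_features` (not shown in this diff), and the import path assumes the function is re-exported by the `data_exploration` package.

```python
# Hypothetical sketch: building a FeatureSchema from final feature columns.
import pandas as pd
from ml_tools.data_exploration import finalize_feature_schema

df_features = pd.DataFrame({
    "temperature": [10.0, 20.0, 30.0],  # continuous
    "alloy": [0, 1, 0],                 # already integer-encoded categorical
})
mappings = {"alloy": {"steel": 0, "brass": 1}}

schema = finalize_feature_schema(df_features, mappings)
print(schema.feature_names)          # ('temperature', 'alloy')
print(schema.categorical_index_map)  # {1: 2} -> column index 1, cardinality 2
```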
+def apply_feature_schema(
+    df: pd.DataFrame,
+    schema: FeatureSchema,
+    targets: Optional[list[str]] = None,
+    unknown_value: int = 99999,
+    verbose: bool = True
+) -> pd.DataFrame:
+    """
+    Aligns the input DataFrame with the provided FeatureSchema.
+
+    This function aligns data for inference/fine-tuning by enforcing the schema's
+    structure and encoding.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame.
+        schema (FeatureSchema): The schema defining feature names, types, and mappings.
+        targets (list[str] | None): Optional list of target column names.
+        unknown_value (int): Integer value to assign to unknown categorical levels.
+            Defaults to 99999 to avoid collision with existing categories.
+        verbose (bool): If True, logs info about dropped extra columns.
+
+    Returns:
+        pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.
+
+    Raises:
+        ValueError: If any required feature or target column is missing.
+    """
+    # 1. Setup
+    df_processed = df.copy()
+    targets = targets if targets is not None else []
+
+    # 2. Validation: Strict Column Presence
+    missing_features = [col for col in schema.feature_names if col not in df_processed.columns]
+    if missing_features:
+        _LOGGER.error(f"Schema Mismatch: Missing required features: {missing_features}")
+        raise ValueError()
+
+    # target columns should not be part of feature columns
+    if targets:
+        overlapping_columns = set(schema.feature_names).intersection(set(targets))
+        if overlapping_columns:
+            _LOGGER.error(f"Schema Mismatch: Target columns overlap with feature columns: {overlapping_columns}")
+            raise ValueError()
+
+        # targets were provided, check their presence
+        missing_targets = [col for col in targets if col not in df_processed.columns]
+        if missing_targets:
+            _LOGGER.error(f"Target Mismatch: Missing target columns: {missing_targets}")
+            raise ValueError()
+
+    # 3. Apply Categorical Encoding
+    if schema.categorical_feature_names and schema.categorical_mappings:
+        for col_name in schema.categorical_feature_names:
+            # Should never happen due to schema construction, but double-check and raise
+            if col_name not in schema.categorical_mappings:
+                _LOGGER.error(f"Schema Inconsistency: No mapping found for categorical feature '{col_name}'.")
+                raise ValueError()
+
+            mapping = schema.categorical_mappings[col_name]
+
+            # Apply mapping (unknowns become NaN)
+            df_processed[col_name] = df_processed[col_name].astype(str).map(mapping)
+
+            # Handle Unknown Categories
+            if df_processed[col_name].isnull().any():
+                n_missing = df_processed[col_name].isnull().sum()
+                _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")
+
+                # Fill unknowns with the specified integer
+                df_processed[col_name] = df_processed[col_name].fillna(unknown_value)
+
+            df_processed[col_name] = df_processed[col_name].astype(int)
+
+    # 4. Reorder and Filter
+    final_column_order = list(schema.feature_names) + targets
+
+    extra_cols = set(df_processed.columns) - set(final_column_order)
+    if extra_cols:
+        _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
+        if verbose:
+            for extra_column in extra_cols:
+                print(f" - Dropping column: '{extra_column}'")
+
+    df_final = df_processed[final_column_order]
+
+    _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")
+
+    # df_final should be a dataframe
+    if isinstance(df_final, pd.Series):
+        df_final = df_final.to_frame()
+
+    return df_final
+
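A hedged end-to-end sketch pairing the two schema functions above: build a schema from training features, then align raw inference data against it. The data, column names, and import path are assumptions.

```python
# Hypothetical sketch: aligning raw inference data with apply_feature_schema.
import pandas as pd
from ml_tools.data_exploration import apply_feature_schema, finalize_feature_schema

# Rebuild a tiny schema (see the previous sketch).
train_features = pd.DataFrame({"temperature": [10.0, 20.0], "alloy": [0, 1]})
schema = finalize_feature_schema(train_features, {"alloy": {"steel": 0, "brass": 1}})

raw = pd.DataFrame({
    "alloy": ["steel", "brass", "titanium"],  # "titanium" is an unseen level
    "temperature": [12.0, 18.0, 25.0],
    "operator": ["ann", "bob", "cho"],        # not in the schema -> dropped
})

# 'alloy' is mapped with the stored encoding; the unseen "titanium" maps to
# NaN first and is then filled with unknown_value; columns end up in schema order.
aligned = apply_feature_schema(raw, schema, unknown_value=99999)
print(aligned.columns.to_list())  # ['temperature', 'alloy']
```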
ml_tools/ensemble_evaluation/__init__.py (renamed from ml_tools/ensemble_evaluation.py, matching the `+6 -4` entry above; the source module of the removed import is truncated in the published diff):

@@ -1,14 +1,16 @@
-from .
+from ._ensemble_evaluation import (
     evaluate_model_classification,
     plot_roc_curve,
     plot_precision_recall_curve,
     plot_calibration_curve,
     evaluate_model_regression,
     get_shap_values,
-    plot_learning_curves,
-    info
+    plot_learning_curves
 )
 
+from ._imprimir import info
+
+
 __all__ = [
     "evaluate_model_classification",
     "plot_roc_curve",
@@ -17,4 +19,4 @@ __all__ = [
     "evaluate_model_regression",
     "get_shap_values",
     "plot_learning_curves"
-]
+]
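The same pattern repeats across the new subpackages in this release: implementation code moves into private modules and a `_imprimir` module supplies `info`, while the public import surface stays put. A hedged sketch of an unchanged call site (the behavior of `info()` is assumed, since this diff does not show `_imprimir.py`):

```python
# Hypothetical sketch: the public API survives the 20.0.0 package split.
from ml_tools.ensemble_evaluation import evaluate_model_regression, info

info()  # assumed to print an overview of the subpackage's functions
```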