dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1901
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
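
The 19.x flat modules above (e.g. `ml_tools/data_exploration.py`) become packages in 20.0.0, each with an `__init__.py` plus private `_*.py` submodules. A minimal sketch of what this implies for downstream code, assuming each new `__init__.py` re-exports the same public names as the old flat module (the re-exports themselves are not visible in this listing):

```python
# Hypothetical usage sketch -- not taken from the package itself.
# Assumes dragon-ml-toolbox 20.0.0 keeps the import path stable because
# ml_tools/data_exploration.py became ml_tools/data_exploration/__init__.py.
import pandas as pd
from ml_tools.data_exploration import summarize_dataframe  # re-export assumed

df = pd.DataFrame({"a": [1.0, 2.0, None], "b": ["x", "y", "z"]})
print(summarize_dataframe(df))  # dtypes, non-null counts, unique values, missing %
```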
|
@@ -1,1901 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from pandas.api.types import is_numeric_dtype, is_object_dtype
|
|
3
|
-
import numpy as np
|
|
4
|
-
import matplotlib.pyplot as plt
|
|
5
|
-
import seaborn as sns
|
|
6
|
-
from typing import Union, Literal, Dict, Tuple, List, Optional, Any
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
import re
|
|
9
|
-
|
|
10
|
-
from ._path_manager import sanitize_filename, make_fullpath
|
|
11
|
-
from ._script_info import _script_info
|
|
12
|
-
from ._logger import get_logger
|
|
13
|
-
from ._utilities import save_dataframe_filename
|
|
14
|
-
from ._schema import FeatureSchema
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
_LOGGER = get_logger("Data Exploration")
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
__all__ = [
|
|
21
|
-
"summarize_dataframe",
|
|
22
|
-
"drop_constant_columns",
|
|
23
|
-
"drop_rows_with_missing_data",
|
|
24
|
-
"show_null_columns",
|
|
25
|
-
"drop_columns_with_missing_data",
|
|
26
|
-
"drop_macro",
|
|
27
|
-
"clean_column_names",
|
|
28
|
-
"plot_value_distributions",
|
|
29
|
-
"plot_continuous_vs_target",
|
|
30
|
-
"plot_categorical_vs_target",
|
|
31
|
-
"split_features_targets",
|
|
32
|
-
"encode_categorical_features",
|
|
33
|
-
"clip_outliers_single",
|
|
34
|
-
"clip_outliers_multi",
|
|
35
|
-
"drop_outlier_samples",
|
|
36
|
-
"plot_correlation_heatmap",
|
|
37
|
-
"finalize_feature_schema",
|
|
38
|
-
"match_and_filter_columns_by_regex",
|
|
39
|
-
"standardize_percentages",
|
|
40
|
-
"reconstruct_one_hot",
|
|
41
|
-
"reconstruct_binary",
|
|
42
|
-
"reconstruct_multibinary",
|
|
43
|
-
"split_continuous_binary",
|
|
44
|
-
"apply_feature_schema"
|
|
45
|
-
]
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
|
|
49
|
-
"""
|
|
50
|
-
Returns a summary DataFrame with data types, non-null counts, number of unique values,
|
|
51
|
-
missing value percentage, and basic statistics for each column.
|
|
52
|
-
|
|
53
|
-
Parameters:
|
|
54
|
-
df (pd.DataFrame): The input DataFrame.
|
|
55
|
-
round_digits (int): Decimal places to round numerical statistics.
|
|
56
|
-
|
|
57
|
-
Returns:
|
|
58
|
-
pd.DataFrame: Summary table.
|
|
59
|
-
"""
|
|
60
|
-
summary = pd.DataFrame({
|
|
61
|
-
'Data Type': df.dtypes,
|
|
62
|
-
'Non-Null Count': df.notnull().sum(),
|
|
63
|
-
'Unique Values': df.nunique(),
|
|
64
|
-
'Missing %': (df.isnull().mean() * 100).round(round_digits)
|
|
65
|
-
})
|
|
66
|
-
|
|
67
|
-
# For numeric columns, add summary statistics
|
|
68
|
-
numeric_cols = df.select_dtypes(include='number').columns
|
|
69
|
-
if not numeric_cols.empty:
|
|
70
|
-
summary_numeric = df[numeric_cols].describe().T[
|
|
71
|
-
['mean', 'std', 'min', '25%', '50%', '75%', 'max']
|
|
72
|
-
].round(round_digits)
|
|
73
|
-
summary = summary.join(summary_numeric, how='left')
|
|
74
|
-
|
|
75
|
-
print(f"DataFrame Shape: {df.shape}")
|
|
76
|
-
return summary
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
|
|
80
|
-
"""
|
|
81
|
-
Removes columns from a pandas DataFrame that contain only a single unique
|
|
82
|
-
value or are entirely null/NaN.
|
|
83
|
-
|
|
84
|
-
This utility is useful for cleaning data by removing constant features that
|
|
85
|
-
have no predictive value.
|
|
86
|
-
|
|
87
|
-
Args:
|
|
88
|
-
df (pd.DataFrame):
|
|
89
|
-
The pandas DataFrame to clean.
|
|
90
|
-
verbose (bool):
|
|
91
|
-
If True, prints the names of the columns that were dropped.
|
|
92
|
-
Defaults to True.
|
|
93
|
-
|
|
94
|
-
Returns:
|
|
95
|
-
pd.DataFrame:
|
|
96
|
-
A new DataFrame with the constant columns removed.
|
|
97
|
-
"""
|
|
98
|
-
if not isinstance(df, pd.DataFrame):
|
|
99
|
-
_LOGGER.error("Input must be a pandas DataFrame.")
|
|
100
|
-
raise TypeError()
|
|
101
|
-
|
|
102
|
-
# make copy to avoid modifying original
|
|
103
|
-
df_clean = df.copy()
|
|
104
|
-
|
|
105
|
-
original_columns = set(df.columns)
|
|
106
|
-
cols_to_keep = []
|
|
107
|
-
|
|
108
|
-
for col_name in df_clean.columns:
|
|
109
|
-
column = df_clean[col_name]
|
|
110
|
-
|
|
111
|
-
# Keep a column if it has more than one unique value (nunique ignores NaNs by default)
|
|
112
|
-
if column.nunique(dropna=True) > 1:
|
|
113
|
-
cols_to_keep.append(col_name)
|
|
114
|
-
|
|
115
|
-
dropped_columns = original_columns - set(cols_to_keep)
|
|
116
|
-
if verbose:
|
|
117
|
-
if dropped_columns:
|
|
118
|
-
_LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
|
|
119
|
-
else:
|
|
120
|
-
_LOGGER.info("No constant columns found.")
|
|
121
|
-
|
|
122
|
-
# Return a new DataFrame with only the columns to keep
|
|
123
|
-
df_clean = df_clean[cols_to_keep]
|
|
124
|
-
|
|
125
|
-
if isinstance(df_clean, pd.Series):
|
|
126
|
-
df_clean = df_clean.to_frame()
|
|
127
|
-
|
|
128
|
-
return df_clean
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
|
|
132
|
-
"""
|
|
133
|
-
Drops rows from the DataFrame using a two-stage strategy:
|
|
134
|
-
|
|
135
|
-
1. If `targets`, remove any row where all target columns are missing.
|
|
136
|
-
2. Among features, drop those with more than `threshold` fraction of missing values.
|
|
137
|
-
|
|
138
|
-
Parameters:
|
|
139
|
-
df (pd.DataFrame): The input DataFrame.
|
|
140
|
-
targets (list[str] | None): List of target column names.
|
|
141
|
-
threshold (float): Maximum allowed fraction of missing values in feature columns.
|
|
142
|
-
|
|
143
|
-
Returns:
|
|
144
|
-
pd.DataFrame: A cleaned DataFrame with problematic rows removed.
|
|
145
|
-
"""
|
|
146
|
-
df_clean = df.copy()
|
|
147
|
-
|
|
148
|
-
# Stage 1: Drop rows with all target columns missing
|
|
149
|
-
valid_targets = []
|
|
150
|
-
if targets:
|
|
151
|
-
# validate targets
|
|
152
|
-
valid_targets = _validate_columns(df_clean, targets)
|
|
153
|
-
|
|
154
|
-
# Only proceed if we actually have columns to check
|
|
155
|
-
if valid_targets:
|
|
156
|
-
target_na = df_clean[valid_targets].isnull().all(axis=1)
|
|
157
|
-
if target_na.any():
|
|
158
|
-
_LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
|
|
159
|
-
df_clean = df_clean[~target_na]
|
|
160
|
-
else:
|
|
161
|
-
_LOGGER.info("No rows found where all targets are missing.")
|
|
162
|
-
else:
|
|
163
|
-
_LOGGER.error("Targets list provided but no matching columns found in DataFrame.")
|
|
164
|
-
raise ValueError()
|
|
165
|
-
|
|
166
|
-
# Stage 2: Drop rows based on feature column missing values
|
|
167
|
-
feature_cols = [col for col in df_clean.columns if col not in valid_targets]
|
|
168
|
-
if feature_cols:
|
|
169
|
-
feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
|
|
170
|
-
rows_to_drop = feature_na_frac[feature_na_frac > threshold].index # type: ignore
|
|
171
|
-
if len(rows_to_drop) > 0:
|
|
172
|
-
_LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
|
|
173
|
-
df_clean = df_clean.drop(index=rows_to_drop)
|
|
174
|
-
else:
|
|
175
|
-
_LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
|
|
176
|
-
else:
|
|
177
|
-
_LOGGER.warning("No feature columns available to evaluate.")
|
|
178
|
-
|
|
179
|
-
return df_clean
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
def show_null_columns(
|
|
183
|
-
df: pd.DataFrame,
|
|
184
|
-
round_digits: int = 2,
|
|
185
|
-
plot_to_dir: Optional[Union[str, Path]] = None,
|
|
186
|
-
plot_filename: Optional[str] = None,
|
|
187
|
-
use_all_columns: bool = False
|
|
188
|
-
) -> pd.DataFrame:
|
|
189
|
-
"""
|
|
190
|
-
Returns a table of columns with missing values, showing both the count and
|
|
191
|
-
percentage of missing entries per column.
|
|
192
|
-
|
|
193
|
-
Optionally generates a visualization of the missing data profile.
|
|
194
|
-
|
|
195
|
-
Parameters:
|
|
196
|
-
df (pd.DataFrame): The input DataFrame.
|
|
197
|
-
round_digits (int): Number of decimal places for the percentage.
|
|
198
|
-
plot_to_dir (str | Path | None): If provided, saves a visualization of the
|
|
199
|
-
missing data to this directory.
|
|
200
|
-
plot_filename (str): The filename for the saved plot (without extension).
|
|
201
|
-
Used only if `plot_to_dir` is set.
|
|
202
|
-
use_all_columns (bool): If True, includes all columns in the summary and plot,
|
|
203
|
-
even those with no missing values.
|
|
204
|
-
|
|
205
|
-
Returns:
|
|
206
|
-
pd.DataFrame: A DataFrame summarizing missing values in each column.
|
|
207
|
-
"""
|
|
208
|
-
null_counts = df.isnull().sum()
|
|
209
|
-
null_percent = df.isnull().mean() * 100
|
|
210
|
-
|
|
211
|
-
if use_all_columns:
|
|
212
|
-
null_summary = pd.DataFrame({
|
|
213
|
-
'Missing Count': null_counts,
|
|
214
|
-
'Missing %': null_percent.round(round_digits)
|
|
215
|
-
})
|
|
216
|
-
else:
|
|
217
|
-
# Filter only columns with at least one null
|
|
218
|
-
mask = null_counts > 0
|
|
219
|
-
null_summary = pd.DataFrame({
|
|
220
|
-
'Missing Count': null_counts[mask],
|
|
221
|
-
'Missing %': null_percent[mask].round(round_digits)
|
|
222
|
-
})
|
|
223
|
-
|
|
224
|
-
# Sort by descending percentage of missing values
|
|
225
|
-
null_summary = null_summary.sort_values(by='Missing %', ascending=False)
|
|
226
|
-
|
|
227
|
-
# --- Visualization Logic ---
|
|
228
|
-
if plot_to_dir:
|
|
229
|
-
if null_summary.empty:
|
|
230
|
-
_LOGGER.info("No missing data found. Skipping plot generation.")
|
|
231
|
-
else:
|
|
232
|
-
try:
|
|
233
|
-
# Validate and create save directory
|
|
234
|
-
save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")
|
|
235
|
-
|
|
236
|
-
# Prepare data
|
|
237
|
-
features = null_summary.index.tolist()
|
|
238
|
-
missing_pct = np.array(null_summary['Missing %'].values)
|
|
239
|
-
present_pct = 100 - missing_pct
|
|
240
|
-
n_features = len(features)
|
|
241
|
-
|
|
242
|
-
# Dynamic width
|
|
243
|
-
width = max(10, n_features * 0.4)
|
|
244
|
-
plt.figure(figsize=(width, 8))
|
|
245
|
-
|
|
246
|
-
# Stacked Bar Chart Logic
|
|
247
|
-
|
|
248
|
-
# Grid behind bars
|
|
249
|
-
plt.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)
|
|
250
|
-
|
|
251
|
-
# 1. Present Data: Solid Green
|
|
252
|
-
plt.bar(
|
|
253
|
-
features,
|
|
254
|
-
present_pct,
|
|
255
|
-
color='tab:green',
|
|
256
|
-
label='Present',
|
|
257
|
-
width=0.6,
|
|
258
|
-
zorder=3
|
|
259
|
-
)
|
|
260
|
-
|
|
261
|
-
# 2. Missing Data: Transparent Red Fill + Solid Red Hatch
|
|
262
|
-
# define facecolor (fill) with alpha, but edgecolor (lines) without alpha.
|
|
263
|
-
plt.bar(
|
|
264
|
-
features,
|
|
265
|
-
missing_pct,
|
|
266
|
-
bottom=present_pct,
|
|
267
|
-
facecolor=(1.0, 1.0, 1.0, 0.2), # RGBA
|
|
268
|
-
edgecolor='tab:red', # Solid red for the hatch lines
|
|
269
|
-
hatch='///', # hatch pattern
|
|
270
|
-
linewidth=0.4, # Ensure lines are thick enough to see
|
|
271
|
-
label='Missing',
|
|
272
|
-
width=0.6,
|
|
273
|
-
zorder=3
|
|
274
|
-
)
|
|
275
|
-
|
|
276
|
-
# Styling
|
|
277
|
-
plt.ylim(0, 100)
|
|
278
|
-
plt.ylabel("Data Completeness (%)", fontsize=13)
|
|
279
|
-
plt.yticks(np.arange(0, 101, 10))
|
|
280
|
-
plot_title = f"Missing Data - {plot_filename.replace('_', ' ')}" if plot_filename else "Missing Data"
|
|
281
|
-
plt.title(plot_title)
|
|
282
|
-
plt.xticks(rotation=45, ha='right', fontsize=9)
|
|
283
|
-
|
|
284
|
-
# Reference line
|
|
285
|
-
plt.axhline(y=100, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
|
|
286
|
-
|
|
287
|
-
plt.legend(loc='lower right', framealpha=0.95)
|
|
288
|
-
plt.tight_layout()
|
|
289
|
-
|
|
290
|
-
# Save
|
|
291
|
-
if plot_filename is None or plot_filename.strip() == "":
|
|
292
|
-
plot_filename = "Missing_Data_Profile"
|
|
293
|
-
else:
|
|
294
|
-
plot_filename = "Missing_Data_" + sanitize_filename(plot_filename)
|
|
295
|
-
|
|
296
|
-
full_filename = plot_filename + ".svg"
|
|
297
|
-
plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
|
|
298
|
-
plt.close()
|
|
299
|
-
|
|
300
|
-
_LOGGER.info(f"Saved missing data plot as '{full_filename}'")
|
|
301
|
-
|
|
302
|
-
except Exception as e:
|
|
303
|
-
_LOGGER.error(f"Failed to generate missing data plot. Error: {e}")
|
|
304
|
-
plt.close()
|
|
305
|
-
|
|
306
|
-
return null_summary
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
|
|
310
|
-
"""
|
|
311
|
-
Drops columns with more than `threshold` fraction of missing values.
|
|
312
|
-
|
|
313
|
-
Parameters:
|
|
314
|
-
df (pd.DataFrame): The input DataFrame.
|
|
315
|
-
threshold (float): Fraction of missing values above which columns are dropped.
|
|
316
|
-
show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
|
|
317
|
-
skip_columns (list[str] | None): If given, these columns wont be included in the drop process.
|
|
318
|
-
|
|
319
|
-
Returns:
|
|
320
|
-
pd.DataFrame: A new DataFrame without the dropped columns.
|
|
321
|
-
"""
|
|
322
|
-
# If skip_columns is provided, create a list of columns to check.
|
|
323
|
-
# Otherwise, check all columns.
|
|
324
|
-
cols_to_check = df.columns
|
|
325
|
-
if skip_columns:
|
|
326
|
-
# Use set difference for efficient exclusion
|
|
327
|
-
cols_to_check = df.columns.difference(skip_columns)
|
|
328
|
-
|
|
329
|
-
# Calculate the missing fraction only on the columns to be checked
|
|
330
|
-
missing_fraction = df[cols_to_check].isnull().mean()
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
|
|
334
|
-
|
|
335
|
-
if len(cols_to_drop) > 0:
|
|
336
|
-
_LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
|
|
337
|
-
|
|
338
|
-
result_df = df.drop(columns=cols_to_drop)
|
|
339
|
-
if show_nulls_after:
|
|
340
|
-
print(show_null_columns(df=result_df))
|
|
341
|
-
|
|
342
|
-
return result_df
|
|
343
|
-
else:
|
|
344
|
-
_LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
|
|
345
|
-
return df
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
def drop_macro(df: pd.DataFrame,
|
|
349
|
-
log_directory: Union[str,Path],
|
|
350
|
-
targets: list[str],
|
|
351
|
-
skip_targets: bool=False,
|
|
352
|
-
threshold: float=0.7) -> pd.DataFrame:
|
|
353
|
-
"""
|
|
354
|
-
Iteratively removes rows and columns with excessive missing data.
|
|
355
|
-
|
|
356
|
-
This function performs a comprehensive cleaning cycle on a DataFrame. It
|
|
357
|
-
repeatedly drops columns with constant values, followed by rows and columns that exceed
|
|
358
|
-
a specified threshold of missing values. The process continues until the
|
|
359
|
-
DataFrame's dimensions stabilize, ensuring that the interdependency between
|
|
360
|
-
row and column deletions is handled.
|
|
361
|
-
|
|
362
|
-
Initial and final missing data reports are saved to the specified log directory.
|
|
363
|
-
|
|
364
|
-
Args:
|
|
365
|
-
df (pd.DataFrame): The input pandas DataFrame to be cleaned.
|
|
366
|
-
log_directory (Union[str, Path]): Path to the directory where the missing data reports
|
|
367
|
-
and plots will be saved inside a "Missing Report" subdirectory.
|
|
368
|
-
targets (list[str]): A list of column names to be treated as target
|
|
369
|
-
variables. This list guides the row-dropping logic.
|
|
370
|
-
skip_targets (bool, optional): If True, the columns listed in `targets`
|
|
371
|
-
will be exempt from being dropped, even if they exceed the missing
|
|
372
|
-
data threshold.
|
|
373
|
-
threshold (float, optional): The proportion of missing data required to drop
|
|
374
|
-
a row or column. For example, 0.7 means a row/column will be
|
|
375
|
-
dropped if 70% or more of its data is missing.
|
|
376
|
-
|
|
377
|
-
Returns:
|
|
378
|
-
pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
|
|
379
|
-
"""
|
|
380
|
-
# make a deep copy to work with
|
|
381
|
-
df_clean = df.copy()
|
|
382
|
-
|
|
383
|
-
base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
|
|
384
|
-
full_path = base_dir_path / "Missing Report"
|
|
385
|
-
|
|
386
|
-
# Log initial state + Plot
|
|
387
|
-
missing_data_start = show_null_columns(
|
|
388
|
-
df=df_clean,
|
|
389
|
-
plot_to_dir=full_path,
|
|
390
|
-
plot_filename="Original",
|
|
391
|
-
use_all_columns=True
|
|
392
|
-
)
|
|
393
|
-
save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
|
|
394
|
-
save_dir=full_path,
|
|
395
|
-
filename="Missing_Data_Original")
|
|
396
|
-
|
|
397
|
-
# Clean cycles for rows and columns
|
|
398
|
-
master = True
|
|
399
|
-
while master:
|
|
400
|
-
# track rows and columns
|
|
401
|
-
initial_rows, initial_columns = df_clean.shape
|
|
402
|
-
|
|
403
|
-
# drop constant columns
|
|
404
|
-
df_clean = drop_constant_columns(df=df_clean)
|
|
405
|
-
|
|
406
|
-
# clean rows
|
|
407
|
-
df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
|
|
408
|
-
|
|
409
|
-
# clean columns
|
|
410
|
-
if skip_targets:
|
|
411
|
-
df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
|
|
412
|
-
else:
|
|
413
|
-
df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
|
|
414
|
-
|
|
415
|
-
# cleaned?
|
|
416
|
-
remaining_rows, remaining_columns = df_clean.shape
|
|
417
|
-
if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
|
|
418
|
-
master = False
|
|
419
|
-
|
|
420
|
-
# log final state + plot
|
|
421
|
-
missing_data_final = show_null_columns(
|
|
422
|
-
df=df_clean,
|
|
423
|
-
plot_to_dir=full_path,
|
|
424
|
-
plot_filename="Processed",
|
|
425
|
-
use_all_columns=True
|
|
426
|
-
)
|
|
427
|
-
save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
|
|
428
|
-
save_dir=full_path,
|
|
429
|
-
filename="Missing_Data_Processed")
|
|
430
|
-
|
|
431
|
-
# return cleaned dataframe
|
|
432
|
-
return df_clean
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacement_pattern: str = r'[\[\]{}<>,:"]', verbose: bool = True) -> pd.DataFrame:
|
|
436
|
-
"""
|
|
437
|
-
Cleans DataFrame column names by replacing special characters.
|
|
438
|
-
|
|
439
|
-
This function is useful for ensuring compatibility with libraries like LightGBM,
|
|
440
|
-
which do not support special JSON characters such as `[]{}<>,:"` in feature names.
|
|
441
|
-
|
|
442
|
-
Args:
|
|
443
|
-
df (pd.DataFrame): The input DataFrame.
|
|
444
|
-
replacement_char (str): The character to use for replacing characters.
|
|
445
|
-
replacement_pattern (str): Regex pattern to use for the replacement logic.
|
|
446
|
-
verbose (bool): If True, prints the renamed columns.
|
|
447
|
-
|
|
448
|
-
Returns:
|
|
449
|
-
pd.DataFrame: A new DataFrame with cleaned column names.
|
|
450
|
-
"""
|
|
451
|
-
new_df = df.copy()
|
|
452
|
-
|
|
453
|
-
original_columns = new_df.columns
|
|
454
|
-
new_columns = original_columns.str.replace(replacement_pattern, replacement_char, regex=True)
|
|
455
|
-
|
|
456
|
-
# Create a map of changes for logging
|
|
457
|
-
rename_map = {old: new for old, new in zip(original_columns, new_columns) if old != new}
|
|
458
|
-
|
|
459
|
-
if verbose:
|
|
460
|
-
if rename_map:
|
|
461
|
-
_LOGGER.info(f"Cleaned {len(rename_map)} column name(s) containing special characters:")
|
|
462
|
-
for old, new in rename_map.items():
|
|
463
|
-
print(f" '{old}' -> '{new}'")
|
|
464
|
-
else:
|
|
465
|
-
_LOGGER.info("No column names required cleaning.")
|
|
466
|
-
|
|
467
|
-
new_df.columns = new_columns
|
|
468
|
-
return new_df
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
def plot_value_distributions(
|
|
472
|
-
df: pd.DataFrame,
|
|
473
|
-
save_dir: Union[str, Path],
|
|
474
|
-
categorical_columns: Optional[List[str]] = None,
|
|
475
|
-
max_categories: int = 100,
|
|
476
|
-
fill_na_with: str = "MISSING DATA"
|
|
477
|
-
):
|
|
478
|
-
"""
|
|
479
|
-
Plots and saves the value distributions for all columns in a DataFrame,
|
|
480
|
-
using the best plot type for each column (histogram or count plot).
|
|
481
|
-
|
|
482
|
-
Plots are saved as SVG files under two subdirectories in `save_dir`:
|
|
483
|
-
- "Distribution_Continuous" for continuous numeric features (histograms).
|
|
484
|
-
- "Distribution_Categorical" for categorical features (count plots).
|
|
485
|
-
|
|
486
|
-
Args:
|
|
487
|
-
df (pd.DataFrame): The input DataFrame to analyze.
|
|
488
|
-
save_dir (str | Path): Directory path to save the plots.
|
|
489
|
-
categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
|
|
490
|
-
max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
|
|
491
|
-
fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
|
|
492
|
-
|
|
493
|
-
Notes:
|
|
494
|
-
- `seaborn.histplot` with KDE is used for continuous features.
|
|
495
|
-
- `seaborn.countplot` is used for categorical features.
|
|
496
|
-
"""
|
|
497
|
-
# 1. Setup save directories
|
|
498
|
-
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
499
|
-
numeric_dir = base_save_path / "Distribution_Continuous"
|
|
500
|
-
categorical_dir = base_save_path / "Distribution_Categorical"
|
|
501
|
-
numeric_dir.mkdir(parents=True, exist_ok=True)
|
|
502
|
-
categorical_dir.mkdir(parents=True, exist_ok=True)
|
|
503
|
-
|
|
504
|
-
# 2. Filter columns to plot
|
|
505
|
-
columns_to_plot = df.columns.to_list()
|
|
506
|
-
|
|
507
|
-
# Setup for forced categorical logic
|
|
508
|
-
categorical_set = set(categorical_columns) if categorical_columns is not None else None
|
|
509
|
-
|
|
510
|
-
numeric_plots_saved = 0
|
|
511
|
-
categorical_plots_saved = 0
|
|
512
|
-
|
|
513
|
-
for col_name in columns_to_plot:
|
|
514
|
-
try:
|
|
515
|
-
is_numeric = is_numeric_dtype(df[col_name])
|
|
516
|
-
n_unique = df[col_name].nunique()
|
|
517
|
-
|
|
518
|
-
# --- 3. Determine Plot Type ---
|
|
519
|
-
is_continuous = False
|
|
520
|
-
if categorical_set is not None:
|
|
521
|
-
# Use the explicit list
|
|
522
|
-
if col_name not in categorical_set:
|
|
523
|
-
is_continuous = True
|
|
524
|
-
else:
|
|
525
|
-
# Use auto-detection
|
|
526
|
-
if is_numeric:
|
|
527
|
-
is_continuous = True
|
|
528
|
-
|
|
529
|
-
# --- Case 1: Continuous Numeric (Histogram) ---
|
|
530
|
-
if is_continuous:
|
|
531
|
-
plt.figure(figsize=(10, 6))
|
|
532
|
-
# Drop NaNs for histogram, as they can't be plotted on a numeric axis
|
|
533
|
-
sns.histplot(x=df[col_name].dropna(), kde=True, bins=30)
|
|
534
|
-
plt.title(f"Distribution of '{col_name}' (Continuous)")
|
|
535
|
-
plt.xlabel(col_name)
|
|
536
|
-
plt.ylabel("Count")
|
|
537
|
-
|
|
538
|
-
save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
|
|
539
|
-
numeric_plots_saved += 1
|
|
540
|
-
|
|
541
|
-
# --- Case 2: Categorical (Count Plot) ---
|
|
542
|
-
else:
|
|
543
|
-
# Check max categories
|
|
544
|
-
if n_unique > max_categories:
|
|
545
|
-
_LOGGER.warning(f"Skipping plot for '{col_name}': {n_unique} unique values > {max_categories} max_categories.")
|
|
546
|
-
continue
|
|
547
|
-
|
|
548
|
-
# Adaptive figure size
|
|
549
|
-
fig_width = max(10, n_unique * 0.5)
|
|
550
|
-
plt.figure(figsize=(fig_width, 8))
|
|
551
|
-
|
|
552
|
-
# Make a temporary copy for plotting to handle NaNs
|
|
553
|
-
temp_series = df[col_name].copy()
|
|
554
|
-
|
|
555
|
-
# Handle NaNs by replacing them with the specified string
|
|
556
|
-
if temp_series.isnull().any():
|
|
557
|
-
# Convert to object type first to allow string replacement
|
|
558
|
-
temp_series = temp_series.astype(object).fillna(fill_na_with)
|
|
559
|
-
|
|
560
|
-
# Convert all to string to be safe (handles low-card numeric)
|
|
561
|
-
temp_series = temp_series.astype(str)
|
|
562
|
-
|
|
563
|
-
# Get category order by frequency
|
|
564
|
-
order = temp_series.value_counts().index
|
|
565
|
-
sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
|
|
566
|
-
|
|
567
|
-
plt.title(f"Distribution of '{col_name}' (Categorical)")
|
|
568
|
-
plt.xlabel(col_name)
|
|
569
|
-
plt.ylabel("Count")
|
|
570
|
-
|
|
571
|
-
# Smart tick rotation
|
|
572
|
-
max_label_len = 0
|
|
573
|
-
if n_unique > 0:
|
|
574
|
-
max_label_len = max(len(str(s)) for s in order)
|
|
575
|
-
|
|
576
|
-
# Rotate if labels are long OR there are many categories
|
|
577
|
-
if max_label_len > 10 or n_unique > 25:
|
|
578
|
-
plt.xticks(rotation=45, ha='right')
|
|
579
|
-
|
|
580
|
-
save_path = categorical_dir / f"{sanitize_filename(col_name)}.svg"
|
|
581
|
-
categorical_plots_saved += 1
|
|
582
|
-
|
|
583
|
-
# --- 4. Save Plot ---
|
|
584
|
-
plt.grid(True, linestyle='--', alpha=0.6, axis='y')
|
|
585
|
-
plt.tight_layout()
|
|
586
|
-
# Save as .svg
|
|
587
|
-
plt.savefig(save_path, format='svg', bbox_inches="tight")
|
|
588
|
-
plt.close()
|
|
589
|
-
|
|
590
|
-
except Exception as e:
|
|
591
|
-
_LOGGER.error(f"Failed to plot distribution for '{col_name}'. Error: {e}")
|
|
592
|
-
plt.close()
|
|
593
|
-
|
|
594
|
-
_LOGGER.info(f"Saved {numeric_plots_saved} continuous distribution plots to '{numeric_dir.name}'.")
|
|
595
|
-
_LOGGER.info(f"Saved {categorical_plots_saved} categorical distribution plots to '{categorical_dir.name}'.")
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
def plot_continuous_vs_target(
|
|
599
|
-
df: pd.DataFrame,
|
|
600
|
-
targets: List[str],
|
|
601
|
-
save_dir: Union[str, Path],
|
|
602
|
-
features: Optional[List[str]] = None
|
|
603
|
-
):
|
|
604
|
-
"""
|
|
605
|
-
Plots each continuous feature against each target to visualize linear relationships.
|
|
606
|
-
|
|
607
|
-
This function is a common EDA step for regression tasks. It creates a
|
|
608
|
-
scatter plot for each feature-target pair, overlays a simple linear
|
|
609
|
-
regression line, and saves each plot as an individual .svg file.
|
|
610
|
-
|
|
611
|
-
Plots are saved in a structured way, with a subdirectory created for
|
|
612
|
-
each target variable.
|
|
613
|
-
|
|
614
|
-
Args:
|
|
615
|
-
df (pd.DataFrame): The input DataFrame.
|
|
616
|
-
targets (List[str]): A list of target column names to plot (y-axis).
|
|
617
|
-
save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
|
|
618
|
-
features (List[str] | None): A list of feature column names to plot (x-axis). If None, all non-target columns in the
|
|
619
|
-
DataFrame will be used.
|
|
620
|
-
|
|
621
|
-
Notes:
|
|
622
|
-
- Only numeric features and numeric targets are processed. Non-numeric
|
|
623
|
-
columns in the lists will be skipped with a warning.
|
|
624
|
-
- Rows with NaN in either the feature or the target are dropped
|
|
625
|
-
pairwise for each plot.
|
|
626
|
-
"""
|
|
627
|
-
# 1. Validate the base save directory
|
|
628
|
-
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
629
|
-
|
|
630
|
-
# 2. Validate helper
|
|
631
|
-
def _validate_numeric_cols(col_list: List[str], col_type: str) -> List[str]:
|
|
632
|
-
valid_cols = []
|
|
633
|
-
for col in col_list:
|
|
634
|
-
if col not in df.columns:
|
|
635
|
-
_LOGGER.warning(f"{col_type} column '{col}' not found. Skipping.")
|
|
636
|
-
elif not is_numeric_dtype(df[col]):
|
|
637
|
-
_LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
|
|
638
|
-
else:
|
|
639
|
-
valid_cols.append(col)
|
|
640
|
-
return valid_cols
|
|
641
|
-
|
|
642
|
-
# 3. Validate target columns FIRST
|
|
643
|
-
valid_targets = _validate_numeric_cols(targets, "Target")
|
|
644
|
-
if not valid_targets:
|
|
645
|
-
_LOGGER.error("No valid numeric target columns provided to plot.")
|
|
646
|
-
return
|
|
647
|
-
|
|
648
|
-
# 4. Determine and validate feature columns
|
|
649
|
-
if features is None:
|
|
650
|
-
_LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
|
|
651
|
-
target_set = set(valid_targets)
|
|
652
|
-
# Get all columns that are not in the valid_targets set
|
|
653
|
-
features_to_validate = [col for col in df.columns if col not in target_set]
|
|
654
|
-
else:
|
|
655
|
-
features_to_validate = features
|
|
656
|
-
|
|
657
|
-
valid_features = _validate_numeric_cols(features_to_validate, "Feature")
|
|
658
|
-
|
|
659
|
-
if not valid_features:
|
|
660
|
-
_LOGGER.error("No valid numeric feature columns found to plot.")
|
|
661
|
-
return
|
|
662
|
-
|
|
663
|
-
# 5. Main plotting loop
|
|
664
|
-
total_plots_saved = 0
|
|
665
|
-
|
|
666
|
-
for target_name in valid_targets:
|
|
667
|
-
# Create a sanitized subdirectory for this target
|
|
668
|
-
safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Continuous")
|
|
669
|
-
target_save_dir = base_save_path / safe_target_dir_name
|
|
670
|
-
target_save_dir.mkdir(parents=True, exist_ok=True)
|
|
671
|
-
|
|
672
|
-
_LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
|
|
673
|
-
|
|
674
|
-
for feature_name in valid_features:
|
|
675
|
-
|
|
676
|
-
# Drop NaNs pairwise for this specific plot
|
|
677
|
-
temp_df = df[[feature_name, target_name]].dropna()
|
|
678
|
-
|
|
679
|
-
if temp_df.empty:
|
|
680
|
-
_LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
|
|
681
|
-
continue
|
|
682
|
-
|
|
683
|
-
x = temp_df[feature_name]
|
|
684
|
-
y = temp_df[target_name]
|
|
685
|
-
|
|
686
|
-
# 6. Perform linear fit
|
|
687
|
-
try:
|
|
688
|
-
# Modern replacement for np.polyfit + np.poly1d. Compatible with NumPy 1.14+ and NumPy 2.0+
|
|
689
|
-
p = np.polynomial.Polynomial.fit(x, y, deg=1)
|
|
690
|
-
plot_regression_line = True
|
|
691
|
-
except (np.linalg.LinAlgError, ValueError):
|
|
692
|
-
_LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
|
|
693
|
-
plot_regression_line = False
|
|
694
|
-
|
|
695
|
-
# 7. Create the plot
|
|
696
|
-
plt.figure(figsize=(10, 6))
|
|
697
|
-
ax = plt.gca()
|
|
698
|
-
|
|
699
|
-
# Plot the raw data points
|
|
700
|
-
ax.plot(x, y, 'o', alpha=0.5, label='Data points', markersize=5)
|
|
701
|
-
|
|
702
|
-
# Plot the regression line
|
|
703
|
-
if plot_regression_line:
|
|
704
|
-
ax.plot(x, p(x), "r--", label='Linear Fit') # type: ignore
|
|
705
|
-
|
|
706
|
-
ax.set_title(f'{feature_name} vs {target_name}')
|
|
707
|
-
ax.set_xlabel(feature_name)
|
|
708
|
-
ax.set_ylabel(target_name)
|
|
709
|
-
ax.legend()
|
|
710
|
-
plt.grid(True, linestyle='--', alpha=0.6)
|
|
711
|
-
plt.tight_layout()
|
|
712
|
-
|
|
713
|
-
# 8. Save the plot
|
|
714
|
-
safe_feature_name = sanitize_filename(feature_name)
|
|
715
|
-
plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
|
|
716
|
-
plot_path = target_save_dir / plot_filename
|
|
717
|
-
|
|
718
|
-
try:
|
|
719
|
-
plt.savefig(plot_path, bbox_inches="tight", format='svg')
|
|
720
|
-
total_plots_saved += 1
|
|
721
|
-
except Exception as e:
|
|
722
|
-
_LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
|
|
723
|
-
|
|
724
|
-
# Close the figure to free up memory
|
|
725
|
-
plt.close()
|
|
726
|
-
|
|
727
|
-
_LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
def plot_categorical_vs_target(
|
|
731
|
-
df: pd.DataFrame,
|
|
732
|
-
targets: List[str],
|
|
733
|
-
save_dir: Union[str, Path],
|
|
734
|
-
features: Optional[List[str]] = None,
|
|
735
|
-
max_categories: int = 50,
|
|
736
|
-
fill_na_with: str = "MISSING DATA"
|
|
737
|
-
):
|
|
738
|
-
"""
|
|
739
|
-
Plots each categorical feature against each numeric target using box plots.
|
|
740
|
-
|
|
741
|
-
This function is a core EDA step for regression tasks to understand the
|
|
742
|
-
relationship between a categorical independent variable and a continuous
|
|
743
|
-
dependent variable.
|
|
744
|
-
|
|
745
|
-
Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
|
|
746
|
-
|
|
747
|
-
Args:
|
|
748
|
-
df (pd.DataFrame): The input DataFrame.
|
|
749
|
-
targets (List[str]): A list of numeric target column names (y-axis).
|
|
750
|
-
save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
|
|
751
|
-
features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
|
|
752
|
-
max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
|
|
753
|
-
fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
|
|
754
|
-
|
|
755
|
-
Notes:
|
|
756
|
-
- Only numeric targets are processed.
|
|
757
|
-
- Features are automatically identified as categorical if they are 'object' dtype.
|
|
758
|
-
"""
|
|
759
|
-
# 1. Validate the base save directory and inputs
|
|
760
|
-
base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
|
|
761
|
-
|
|
762
|
-
# 2. Validate target columns (must be numeric)
|
|
763
|
-
valid_targets = []
|
|
764
|
-
for col in targets:
|
|
765
|
-
if col not in df.columns:
|
|
766
|
-
_LOGGER.warning(f"Target column '{col}' not found. Skipping.")
|
|
767
|
-
elif not is_numeric_dtype(df[col]):
|
|
768
|
-
_LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
|
|
769
|
-
else:
|
|
770
|
-
valid_targets.append(col)
|
|
771
|
-
|
|
772
|
-
if not valid_targets:
|
|
773
|
-
_LOGGER.error("No valid numeric target columns provided to plot.")
|
|
774
|
-
return
|
|
775
|
-
|
|
776
|
-
# 3. Determine and validate feature columns
|
|
777
|
-
features_to_plot = []
|
|
778
|
-
if features is None:
|
|
779
|
-
_LOGGER.info("No 'features' list provided. Auto-detecting categorical features.")
|
|
780
|
-
for col in df.columns:
|
|
781
|
-
if col in valid_targets:
|
|
782
|
-
continue
|
|
783
|
-
# Auto-include object dtypes
|
|
784
|
-
if is_object_dtype(df[col]):
|
|
785
|
-
features_to_plot.append(col)
|
|
786
|
-
|
|
787
|
-
else:
|
|
788
|
-
# Validate user-provided list
|
|
789
|
-
for col in features:
|
|
790
|
-
if col not in df.columns:
|
|
791
|
-
_LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
|
|
792
|
-
else:
|
|
793
|
-
features_to_plot.append(col)
|
|
794
|
-
|
|
795
|
-
if not features_to_plot:
|
|
796
|
-
_LOGGER.error("No valid categorical feature columns found to plot.")
|
|
797
|
-
return
|
|
798
|
-
|
|
799
|
-
# 4. Main plotting loop
|
|
800
|
-
total_plots_saved = 0
|
|
801
|
-
|
|
802
|
-
for target_name in valid_targets:
|
|
803
|
-
# Create a sanitized subdirectory for this target
|
|
804
|
-
safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
|
|
805
|
-
target_save_dir = base_save_path / safe_target_dir_name
|
|
806
|
-
target_save_dir.mkdir(parents=True, exist_ok=True)
|
|
807
|
-
|
|
808
|
-
_LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
|
|
809
|
-
for feature_name in features_to_plot:
|
|
810
|
-
|
|
811
|
-
# Make a temporary copy for plotting to handle NaNs and dtypes
|
|
812
|
-
temp_df = df[[feature_name, target_name]].copy()
|
|
813
|
-
|
|
814
|
-
# Check cardinality
|
|
815
|
-
n_unique = temp_df[feature_name].nunique()
|
|
816
|
-
if n_unique > max_categories:
|
|
817
|
-
_LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique values > {max_categories} max_categories.")
|
|
818
|
-
continue
|
|
819
|
-
|
|
820
|
-
# Handle NaNs by replacing them with the specified string
|
|
821
|
-
if temp_df[feature_name].isnull().any():
|
|
822
|
-
# Convert to object type first to allow string replacement
|
|
823
|
-
temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
|
|
824
|
-
|
|
825
|
-
# Convert feature to string to ensure correct plotting order
|
|
826
|
-
temp_df[feature_name] = temp_df[feature_name].astype(str)
|
|
827
|
-
|
|
828
|
-
# 5. Create the plot
|
|
829
|
-
# Increase figure width for categories
|
|
830
|
-
plt.figure(figsize=(max(10, n_unique * 1.2), 10))
|
|
831
|
-
|
|
832
|
-
sns.boxplot(x=feature_name, y=target_name, data=temp_df)
|
|
833
|
-
|
|
834
|
-
plt.title(f'{target_name} vs {feature_name}')
|
|
835
|
-
plt.xlabel(feature_name)
|
|
836
|
-
plt.ylabel(target_name)
|
|
837
|
-
plt.xticks(rotation=45, ha='right')
|
|
838
|
-
plt.grid(True, linestyle='--', alpha=0.6, axis='y')
|
|
839
|
-
plt.tight_layout()
|
|
840
|
-
|
|
841
|
-
# 6. Save the plot
|
|
842
|
-
safe_feature_name = sanitize_filename(feature_name)
|
|
843
|
-
plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
|
|
844
|
-
plot_path = target_save_dir / plot_filename
|
|
845
|
-
|
|
846
|
-
try:
|
|
847
|
-
plt.savefig(plot_path, bbox_inches="tight", format='svg')
|
|
848
|
-
total_plots_saved += 1
|
|
849
|
-
except Exception as e:
|
|
850
|
-
_LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
|
|
851
|
-
|
|
852
|
-
plt.close()
|
|
853
|
-
|
|
854
|
-
_LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
def encode_categorical_features(
|
|
858
|
-
df: pd.DataFrame,
|
|
859
|
-
columns_to_encode: List[str],
|
|
860
|
-
encode_nulls: bool,
|
|
861
|
-
null_label: str = "Other",
|
|
862
|
-
split_resulting_dataset: bool = True,
|
|
863
|
-
verbose: bool = True
|
|
864
|
-
) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
|
|
865
|
-
"""
|
|
866
|
-
Finds unique values in specified categorical columns, encodes them into integers,
|
|
867
|
-
and returns a dictionary containing the mappings for each column.
|
|
868
|
-
|
|
869
|
-
This function automates the label encoding process and generates a simple,
|
|
870
|
-
human-readable dictionary of the mappings.
|
|
871
|
-
|
|
872
|
-
Args:
|
|
873
|
-
df (pd.DataFrame): The input DataFrame.
|
|
874
|
-
columns_to_encode (List[str]): A list of column names to be encoded.
|
|
875
|
-
encode_nulls (bool):
|
|
876
|
-
- If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
|
|
877
|
-
- If False, Nulls are ignored and categories start from 0.
|
|
878
|
-
|
|
879
|
-
null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
|
|
880
|
-
split_resulting_dataset (bool):
|
|
881
|
-
- If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
|
|
882
|
-
- If False, returns a single DataFrame with all columns.
|
|
883
|
-
verbose (bool): If True, prints encoding progress.
|
|
884
|
-
|
|
885
|
-
Returns:
|
|
886
|
-
Tuple:
|
|
887
|
-
|
|
888
|
-
- Dict[str, Dict[str, int]]: A dictionary where each key is a column name and the value is its category-to-integer mapping.
|
|
889
|
-
|
|
890
|
-
- pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
|
|
891
|
-
|
|
892
|
-
- pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
|
|
893
|
-
|
|
894
|
-
## **Important:**
|
|
895
|
-
1. Do not encode 'Ordinal Features' (e.g., Low=1, Med=2, High=3), these must be treated as numerical (continuous).
|
|
896
|
-
2. Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
|
|
897
|
-
"""
|
|
898
|
-
df_encoded = df.copy()
|
|
899
|
-
|
|
900
|
-
# Validate columns
|
|
901
|
-
valid_columns = [col for col in columns_to_encode if col in df_encoded.columns]
|
|
902
|
-
missing_columns = set(columns_to_encode) - set(valid_columns)
|
|
903
|
-
if missing_columns:
|
|
904
|
-
_LOGGER.warning(f"Columns not found and will be skipped: {list(missing_columns)}")
|
|
905
|
-
|
|
906
|
-
mappings: Dict[str, Dict[str, int]] = {}
|
|
907
|
-
|
|
908
|
-
_LOGGER.info(f"Encoding {len(valid_columns)} categorical column(s).")
|
|
909
|
-
for col_name in valid_columns:
|
|
910
|
-
has_nulls = df_encoded[col_name].isnull().any()
|
|
911
|
-
|
|
912
|
-
# Get unique values once to check cardinality and generate categories
|
|
913
|
-
raw_unique_values = df_encoded[col_name].dropna().unique()
|
|
914
|
-
|
|
915
|
-
# --- Check for constant columns ---
|
|
916
|
-
if len(raw_unique_values) <= 1:
|
|
917
|
-
# Exception: If we are encoding nulls and nulls exist, this is effectively a binary feature (Null vs Value)
|
|
918
|
-
is_effectively_binary = encode_nulls and has_nulls
|
|
919
|
-
|
|
920
|
-
if not is_effectively_binary:
|
|
921
|
-
_LOGGER.warning(f"Column '{col_name}' has only {len(raw_unique_values)} unique value(s). Consider dropping it before encoding as it offers no predictive variance.")
|
|
922
|
-
|
|
923
|
-
# Prepare categories (sorted string representation)
|
|
924
|
-
categories = sorted([str(cat) for cat in raw_unique_values])
|
|
925
|
-
|
|
926
|
-
if encode_nulls and has_nulls:
|
|
927
|
-
# Handle nulls: "Other" -> 0, other categories -> 1, 2, 3...
|
|
928
|
-
# Start mapping from 1 for non-null values
|
|
929
|
-
mapping = {category: i + 1 for i, category in enumerate(categories)}
|
|
930
|
-
|
|
931
|
-
# Apply mapping and fill remaining NaNs with 0
|
|
932
|
-
mapped_series = df_encoded[col_name].astype(str).map(mapping)
|
|
933
|
-
df_encoded[col_name] = mapped_series.fillna(0).astype(int)
|
|
934
|
-
|
|
935
|
-
# --- Validate nulls category---
|
|
936
|
-
# Ensure the key for 0 doesn't collide with a real category.
|
|
937
|
-
if null_label in mapping.keys():
|
|
938
|
-
# COLLISION! null_label is a real category
|
|
939
|
-
original_label = null_label
|
|
940
|
-
null_label = "__NULL__" # fallback
|
|
941
|
-
_LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
|
|
942
|
-
|
|
943
|
-
# Create the complete user-facing map including "Other"
|
|
944
|
-
user_mapping = {**mapping, null_label: 0}
|
|
945
|
-
mappings[col_name] = user_mapping
|
|
946
|
-
else:
|
|
947
|
-
# ignore nulls: categories start from 0
|
|
948
|
-
mapping = {category: i for i, category in enumerate(categories)}
|
|
949
|
-
|
|
950
|
-
df_encoded[col_name] = df_encoded[col_name].astype(str).map(mapping)
|
|
951
|
-
|
|
952
|
-
mappings[col_name] = mapping
|
|
953
|
-
|
|
954
|
-
if verbose:
|
|
955
|
-
cardinality = len(mappings[col_name])
|
|
956
|
-
print(f" - Encoded '{col_name}' with {cardinality} unique values.")
|
|
957
|
-
|
|
958
|
-
# Handle the dataset splitting logic
|
|
959
|
-
if split_resulting_dataset:
|
|
960
|
-
df_categorical = df_encoded[valid_columns]
|
|
961
|
-
df_non_categorical = df.drop(columns=valid_columns)
|
|
962
|
-
return mappings, df_non_categorical, df_categorical
|
|
963
|
-
else:
|
|
964
|
-
return mappings, df_encoded, None
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
def split_features_targets(df: pd.DataFrame, targets: list[str]):
|
|
968
|
-
"""
|
|
969
|
-
Splits a DataFrame's columns into features and targets.
|
|
970
|
-
|
|
971
|
-
Args:
|
|
972
|
-
df (pd.DataFrame): Pandas DataFrame containing the dataset.
|
|
973
|
-
targets (list[str]): List of column names to be treated as target variables.
|
|
974
|
-
|
|
975
|
-
Returns:
|
|
976
|
-
tuple: A tuple containing:
|
|
977
|
-
- pd.DataFrame: Features dataframe.
|
|
978
|
-
- pd.DataFrame: Targets dataframe.
|
|
979
|
-
|
|
980
|
-
Prints:
|
|
981
|
-
- Shape of the original dataframe.
|
|
982
|
-
- Shape of the features dataframe.
|
|
983
|
-
- Shape of the targets dataframe.
|
|
984
|
-
"""
|
|
985
|
-
valid_targets = _validate_columns(df, targets)
|
|
986
|
-
df_targets = df[valid_targets]
|
|
987
|
-
df_features = df.drop(columns=valid_targets)
|
|
988
|
-
print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
|
|
989
|
-
return df_features, df_targets
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
    Normalize binary values like 0.0/1.0 to 0/1 if detected.

    Parameters:
        df (pd.DataFrame): Input DataFrame with only numeric columns.

    Returns:
        Tuple(pd.DataFrame, pd.DataFrame): (continuous_columns_df, binary_columns_df)

    Raises:
        TypeError: If any column is not numeric.
    """
    if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
        _LOGGER.error("All columns must be numeric (int or float).")
        raise TypeError()

    binary_cols = []
    continuous_cols = []

    for col in df.columns:
        series = df[col]
        unique_values = set(series[~series.isna()].unique())

        if unique_values.issubset({0, 1}):
            binary_cols.append(col)
        elif unique_values.issubset({0.0, 1.0}):
            df[col] = df[col].apply(lambda x: 0 if x == 0.0 else (1 if x == 1.0 else x))
            binary_cols.append(col)
        else:
            continuous_cols.append(col)

    binary_cols.sort()

    df_cont = df[continuous_cols]
    df_bin = df[binary_cols]

    print(f"Continuous columns shape: {df_cont.shape}")
    print(f"Binary columns shape: {df_bin.shape}")

    return df_cont, df_bin  # type: ignore

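# A minimal usage sketch (toy numeric data; column names are placeholders):
import pandas as pd

_demo = pd.DataFrame({"dose": [0.1, 2.5, 7.0], "treated": [0.0, 1.0, 0.0]})
cont_df, bin_df = split_continuous_binary(_demo)  # 'treated' is detected as binary
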
def plot_correlation_heatmap(df: pd.DataFrame,
                             plot_title: str,
                             save_dir: Union[str, Path, None] = None,
                             method: Literal["pearson", "kendall", "spearman"]="pearson"):
    """
    Plots a heatmap of pairwise correlations between numeric features in a DataFrame.

    Args:
        df (pd.DataFrame): The input dataset.
        plot_title (str): Title of the plot; the suffix "`method` Correlation Heatmap" is appended automatically.
        save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as an SVG file.
        method (str): Correlation method to use. Must be one of:
            - 'pearson' (default): measures linear correlation (assumes normally distributed data),
            - 'kendall': rank correlation (non-parametric),
            - 'spearman': monotonic relationship (non-parametric).

    Notes:
        - Only numeric columns are included.
        - Annotations are disabled if there are more than 20 features.
        - Missing values are handled via pairwise complete observations.
    """
    numeric_df = df.select_dtypes(include='number')
    if numeric_df.empty:
        _LOGGER.warning("No numeric columns found. Heatmap not generated.")
        return
    if method not in ["pearson", "kendall", "spearman"]:
        _LOGGER.error("'method' must be pearson, kendall, or spearman.")
        raise ValueError()

    corr = numeric_df.corr(method=method)

    # Create a mask for the upper triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Plot setup
    size = max(10, numeric_df.shape[1])
    plt.figure(figsize=(size, size * 0.8))

    annot_bool = numeric_df.shape[1] <= 20
    sns.heatmap(
        corr,
        mask=mask,
        annot=annot_bool,
        cmap='coolwarm',
        fmt=".2f",
        cbar_kws={"shrink": 0.8}
    )

    # add suffix to title
    full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"

    plt.title(full_plot_title)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    plt.tight_layout()

    if save_dir:
        save_path = make_fullpath(save_dir, make=True)
        # sanitize the plot title to save the file
        sanitized_plot_title = sanitize_filename(plot_title)
        plot_filename = sanitized_plot_title + ".svg"

        full_path = save_path / plot_filename

        plt.savefig(full_path, bbox_inches="tight", format='svg')
        _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")

    plt.show()
    plt.close()

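# A minimal usage sketch (toy numeric DataFrame; shows the plot without saving it):
import pandas as pd

_demo = pd.DataFrame({"x": [1, 2, 3, 4], "y": [2, 4, 5, 9], "z": [9, 7, 4, 1]})
plot_correlation_heatmap(_demo, plot_title="Demo Features", method="spearman")
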
def clip_outliers_single(
    df: pd.DataFrame,
    column: str,
    min_val: float,
    max_val: float
) -> Union[pd.DataFrame, None]:
    """
    Clips values in the specified numeric column to the range [min_val, max_val],
    and returns a new DataFrame where the original column is replaced by the clipped version.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column to clip.
        min_val (float): Minimum allowable value; values below are clipped to this.
        max_val (float): Maximum allowable value; values above are clipped to this.

    Returns:
        pd.DataFrame: A new DataFrame with the specified column clipped.

        None: if a problem with the dataframe column occurred.
    """
    if column not in df.columns:
        _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
        return None

    if not pd.api.types.is_numeric_dtype(df[column]):
        _LOGGER.warning(f"Column '{column}' must be numeric.")
        return None

    new_df = df.copy(deep=True)
    new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)

    _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
    return new_df

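# A minimal usage sketch (toy data; returns None if the column is missing or non-numeric):
import pandas as pd

_demo = pd.DataFrame({"age": [25, -3, 140]})
clipped = clip_outliers_single(_demo, column="age", min_val=0, max_val=120)
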
def clip_outliers_multi(
    df: pd.DataFrame,
    clip_dict: Union[Dict[str, Tuple[int, int]], Dict[str, Tuple[float, float]]],
    verbose: bool=False
) -> pd.DataFrame:
    """
    Clips values in multiple specified numeric columns to given [min, max] ranges,
    updating values (deep copy) and skipping invalid entries.

    Args:
        df (pd.DataFrame): The input DataFrame.
        clip_dict (dict): A dictionary where keys are column names and values are (min_val, max_val) tuples.
        verbose (bool): If True, prints the clipped range for each column.

    Returns:
        pd.DataFrame: A new DataFrame with specified columns clipped.

    Notes:
        - Invalid specifications (missing column, non-numeric type, wrong tuple length)
          will be reported but skipped.
    """
    new_df = df.copy()
    skipped_columns = []
    clipped_columns = 0

    for col, bounds in clip_dict.items():
        try:
            if col not in df.columns:
                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
                raise ValueError()

            if not pd.api.types.is_numeric_dtype(df[col]):
                _LOGGER.error(f"Column '{col}' is not numeric.")
                raise TypeError()

            if not (isinstance(bounds, tuple) and len(bounds) == 2):
                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
                raise ValueError()

            min_val, max_val = bounds
            new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
            if verbose:
                print(f"Clipped '{col}' to range [{min_val}, {max_val}].")
            clipped_columns += 1

        except Exception as e:
            skipped_columns.append((col, str(e)))
            continue

    _LOGGER.info(f"Clipped {clipped_columns} columns.")

    if skipped_columns:
        _LOGGER.warning("Skipped columns:")
        for col, msg in skipped_columns:
            print(f" - {col}")

    return new_df

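# A minimal usage sketch (toy data; the non-numeric 'name' column is reported and skipped):
import pandas as pd

_demo = pd.DataFrame({"age": [25, -3, 140], "name": ["a", "b", "c"]})
clipped = clip_outliers_multi(_demo, {"age": (0, 120), "name": (0, 1)}, verbose=True)
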
def drop_outlier_samples(
    df: pd.DataFrame,
    bounds_dict: Dict[str, Tuple[Union[int, float], Union[int, float]]],
    drop_on_nulls: bool = False,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Drops entire rows where values in specified numeric columns fall outside
    a given [min, max] range.

    This function processes a copy of the DataFrame, ensuring the original is
    not modified. It skips columns with invalid specifications.

    Args:
        df (pd.DataFrame): The input DataFrame.
        bounds_dict (dict): A dictionary where keys are column names and values
            are (min_val, max_val) tuples defining the valid range.
        drop_on_nulls (bool): If True, rows with NaN/None in a checked column
            will also be dropped. If False, NaN/None are ignored.
        verbose (bool): If True, prints the number of rows dropped for each column.

    Returns:
        pd.DataFrame: A new DataFrame with the outlier rows removed.

    Notes:
        - Invalid specifications (e.g., missing column, non-numeric type,
          incorrectly formatted bounds) will be reported and skipped.
    """
    new_df = df.copy()
    skipped_columns: List[Tuple[str, str]] = []
    initial_rows = len(new_df)

    for col, bounds in bounds_dict.items():
        try:
            # --- Validation Checks ---
            if col not in df.columns:
                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
                raise ValueError()

            if not pd.api.types.is_numeric_dtype(df[col]):
                _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
                raise TypeError()

            if not (isinstance(bounds, tuple) and len(bounds) == 2):
                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
                raise ValueError()

            # --- Filtering Logic ---
            min_val, max_val = bounds
            rows_before_drop = len(new_df)

            # Create the base mask for values within the specified range
            # .between() is inclusive and evaluates to False for NaN
            mask_in_bounds = new_df[col].between(min_val, max_val)

            if drop_on_nulls:
                # Keep only rows that are within bounds.
                # Since mask_in_bounds is False for NaN, nulls are dropped.
                final_mask = mask_in_bounds
            else:
                # Keep rows that are within bounds OR are null.
                mask_is_null = new_df[col].isnull()
                final_mask = mask_in_bounds | mask_is_null

            # Apply the final mask
            new_df = new_df[final_mask]

            rows_after_drop = len(new_df)

            if verbose:
                dropped_count = rows_before_drop - rows_after_drop
                if dropped_count > 0:
                    print(
                        f" - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
                    )

        except (ValueError, TypeError) as e:
            skipped_columns.append((col, str(e)))
            continue

    total_dropped = initial_rows - len(new_df)
    _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")

    if skipped_columns:
        _LOGGER.warning("Skipped the following columns due to errors:")
        for col, msg in skipped_columns:
            # Only print the column name for cleaner output as the error was already logged
            print(f" - {col}")

    # if new_df is a series, convert to dataframe
    if isinstance(new_df, pd.Series):
        new_df = new_df.to_frame()

    return new_df

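# A minimal usage sketch (toy data; the NaN row is kept because drop_on_nulls=False):
import pandas as pd

_demo = pd.DataFrame({"temp": [21.5, 400.0, None, 19.0]})
filtered = drop_outlier_samples(_demo, bounds_dict={"temp": (-50, 60)})
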
def match_and_filter_columns_by_regex(
    df: pd.DataFrame,
    pattern: str,
    case_sensitive: bool = False,
    escape_pattern: bool = False
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.

    Parameters:
        df (pd.DataFrame): The DataFrame to search.
        pattern (str): The regex pattern to match column names (use a raw string).
        case_sensitive (bool): Whether matching is case-sensitive.
        escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.

    Returns:
        (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
    """
    if escape_pattern:
        pattern = re.escape(pattern)

    mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
    matched_columns = df.columns[mask].to_list()
    filtered_df = df.loc[:, mask]

    _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")

    # if filtered df is a series, convert to dataframe
    if isinstance(filtered_df, pd.Series):
        filtered_df = filtered_df.to_frame()

    return filtered_df, matched_columns

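# A minimal usage sketch (toy data; case-insensitive match on a column-name prefix):
import pandas as pd

_demo = pd.DataFrame({"Sensor_1": [1], "Sensor_2": [2], "label": [0]})
sensors_df, sensor_cols = match_and_filter_columns_by_regex(_demo, pattern=r"^sensor_")
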
def standardize_percentages(
    df: pd.DataFrame,
    columns: list[str],
    treat_one_as_proportion: bool = True,
    round_digits: int = 2,
    verbose: bool=True
) -> pd.DataFrame:
    """
    Standardizes numeric columns containing mixed-format percentages.

    This function cleans columns where percentages might be entered both as whole
    numbers (e.g., 55) and as proportions (e.g., 0.55). It assumes values
    between 0 and 1 are proportions and multiplies them by 100.

    Args:
        df (pd.DataFrame): The input pandas DataFrame.
        columns (list[str]): A list of column names to standardize.
        treat_one_as_proportion (bool):
            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.
        verbose (bool): If True, lists the columns that were standardized.

    Returns:
        (pd.DataFrame):
            A new DataFrame with the specified columns cleaned and standardized.
    """
    df_copy = df.copy()

    if df_copy.empty:
        return df_copy

    # This helper function contains the core cleaning logic
    def _clean_value(x: float) -> float:
        """Applies the standardization rule to a single value."""
        if pd.isna(x):
            return x

        # If treat_one_as_proportion is True, the range for proportions is [0, 1]
        if treat_one_as_proportion and 0 <= x <= 1:
            return x * 100
        # If False, the range for proportions is [0, 1) (1 is excluded)
        elif not treat_one_as_proportion and 0 <= x < 1:
            return x * 100

        # Otherwise, the value is assumed to be a correctly formatted percentage
        return x

    fixed_columns: list[str] = list()

    for col in columns:
        # --- Robustness Checks ---
        if col not in df_copy.columns:
            _LOGGER.warning(f"Column '{col}' not found. Skipping.")
            continue

        if not is_numeric_dtype(df_copy[col]):
            _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
            continue

        # --- Applying the Logic ---
        # Apply the cleaning function to every value in the column
        df_copy[col] = df_copy[col].apply(_clean_value)

        # Round the result
        df_copy[col] = df_copy[col].round(round_digits)

        fixed_columns.append(col)

    if verbose:
        _LOGGER.info("Columns standardized:")
        for fixed_col in fixed_columns:
            print(f" '{fixed_col}'")

    return df_copy

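# A minimal usage sketch (toy data: 0.5 becomes 50.0, 55 stays 55, 1.0 becomes 100.0):
import pandas as pd

_demo = pd.DataFrame({"humidity_pct": [0.5, 55, 1.0]})
standardized = standardize_percentages(_demo, columns=["humidity_pct"])
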
def reconstruct_one_hot(
    df: pd.DataFrame,
    features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
    separator: str = '_',
    baseline_category_name: Optional[str] = "Other",
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs original categorical columns from a one-hot encoded DataFrame.

    This function identifies groups of one-hot encoded columns based on a common
    prefix (base feature name) and a separator. It then collapses each group
    into a single column containing the categorical value.

    Args:
        df (pd.DataFrame):
            The input DataFrame with one-hot encoded columns.
        features_to_reconstruct (List[str | Tuple[str, str | None]]):
            A list defining the features to reconstruct. This list can contain:

            - A string: (e.g., "Color")
              This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
            - A tuple: (e.g., ("Pet", "Dog"))
              This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
            - A tuple with None: (e.g., ("Size", None))
              This reconstructs 'Size' and maps all-zero rows to the NaN value.

            Example:
                [
                    "Mood",            # All-zeros -> "Other"
                    ("Color", "Red"),  # All-zeros -> "Red"
                    ("Size", None)     # All-zeros -> NaN
                ]
        separator (str):
            The character separating the base name from the categorical value in
            the column names (e.g., '_' in 'B_a').
        baseline_category_name (str | None):
            The baseline category name to use by default if it is not explicitly provided.
        drop_original (bool):
            If True, the original one-hot encoded columns will be dropped from
            the returned DataFrame.

    Returns:
        pd.DataFrame:
            A new DataFrame with the specified one-hot encoded features
            reconstructed into single categorical columns.

    <br>

    ## Note:

    This function is designed to be robust, but users should be aware of two key edge cases:

    1. **Ambiguous Base Feature Prefixes**: If `features_to_reconstruct` contains base names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. The function will match columns greedily. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).

    2. **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    if not (baseline_category_name is None or isinstance(baseline_category_name, str)):
        _LOGGER.error("The baseline_category must be None or a string.")
        raise TypeError()

    new_df = df.copy()
    all_ohe_cols_to_drop = []
    reconstructed_count = 0

    # --- 1. Parse and validate the reconstruction config ---
    # This normalizes the input into a clean {base_name: baseline_val} dict
    reconstruction_config: Dict[str, Optional[str]] = {}
    try:
        for item in features_to_reconstruct:
            if isinstance(item, str):
                # Case 1: "Color"
                base_name = item
                baseline_val = baseline_category_name
            elif isinstance(item, tuple) and len(item) == 2:
                # Case 2: ("Pet", "dog") or ("Size", None)
                base_name, baseline_val = item
                if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
                    _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
                    raise ValueError()
            else:
                _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
                raise ValueError()

            if base_name in reconstruction_config and verbose:
                _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")

            reconstruction_config[base_name] = baseline_val

    except Exception as e:
        _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
        raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e

    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")

    # Main logic
    for base_name, baseline_category in reconstruction_config.items():
        # Regex to find all columns belonging to this base feature.
        pattern = f"^{re.escape(base_name)}{re.escape(separator)}"

        # Find matching columns
        ohe_cols = [col for col in df.columns if re.match(pattern, col)]

        if not ohe_cols:
            _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
            continue

        # For each row, find the column name with the maximum value (which is 1)
        reconstructed_series = new_df[ohe_cols].idxmax(axis=1)  # type: ignore

        # Extract the categorical value (the suffix) from the column name
        # Use n=1 in split to handle cases where the category itself might contain the separator
        new_column_values = reconstructed_series.str.split(separator, n=1).str[1]  # type: ignore

        # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
        all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0  # type: ignore

        if baseline_category is not None:
            # A baseline category was provided
            new_column_values.loc[all_zero_mask] = baseline_category
        else:
            # No baseline provided: assign NaN
            new_column_values.loc[all_zero_mask] = np.nan  # type: ignore

        if verbose:
            print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")

        # Assign the new reconstructed column to the DataFrame
        new_df[base_name] = new_column_values

        all_ohe_cols_to_drop.extend(ohe_cols)
        reconstructed_count += 1
        if verbose:
            print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")

    # Cleanup
    if drop_original and all_ohe_cols_to_drop:
        # Drop the original OHE columns, ensuring no duplicates in the drop list
        unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df

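# A minimal usage sketch (toy one-hot columns; all-zero rows for 'Size' become NaN):
import pandas as pd

_demo = pd.DataFrame({"Size_S": [1, 0, 0], "Size_M": [0, 1, 0], "price": [5, 7, 6]})
restored = reconstruct_one_hot(_demo, features_to_reconstruct=[("Size", None)])
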
def reconstruct_binary(
    df: pd.DataFrame,
    reconstruction_map: Dict[str, Tuple[str, Any, Any]],
    drop_original: bool = True,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Reconstructs new categorical columns from existing binary (0/1) columns.

    Used to reverse a binary encoding by mapping 0 and 1 back to
    descriptive categorical labels.

    Args:
        df (pd.DataFrame):
            The input DataFrame.
        reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
            A dictionary defining the reconstructions.
            Format:
                { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
            Example:
                {
                    "Sex": ("Sex_male", "Female", "Male"),
                    "Smoker": ("Is_Smoker", "No", "Yes")
                }
        drop_original (bool):
            If True, the original binary source columns (e.g., "Sex_male")
            will be dropped from the returned DataFrame.
        verbose (bool):
            If True, prints the details of each reconstruction.

    Returns:
        pd.DataFrame:
            A new DataFrame with the reconstructed categorical columns.

    Raises:
        TypeError: If `df` is not a pandas DataFrame.
        ValueError: If `reconstruction_map` is not a dictionary or a
            configuration is invalid (e.g., column name collision).

    Notes:
        - The function operates on a copy of the DataFrame.
        - Rows with `NaN` in the source column will have `NaN` in the
          new column.
        - Values in the source column other than 0 or 1 (e.g., 2) will
          result in `NaN` in the new column.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    if not isinstance(reconstruction_map, dict):
        _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
        raise ValueError()

    new_df = df.copy()
    source_cols_to_drop: List[str] = []
    reconstructed_count = 0

    _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")

    for new_col_name, config in reconstruction_map.items():

        # --- 1. Validation ---
        if not (isinstance(config, tuple) and len(config) == 3):
            _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple.")
            raise ValueError()

        source_col, label_for_0, label_for_1 = config

        if source_col not in new_df.columns:
            _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found.")
            raise ValueError()

        if new_col_name in new_df.columns and new_col_name != source_col and verbose:
            _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")

        # --- 2. Reconstruction ---
        mapping_dict = {0: label_for_0, 1: label_for_1}
        new_df[new_col_name] = new_df[source_col].map(mapping_dict)

        # --- 3. Logging/Tracking ---
        # Only mark source for dropping if it's NOT the same as the new column
        if source_col != new_col_name:
            source_cols_to_drop.append(source_col)

        reconstructed_count += 1
        if verbose:
            print(f" - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")

    # --- 4. Cleanup ---
    if drop_original and source_cols_to_drop:
        unique_cols_to_drop = list(set(source_cols_to_drop))
        new_df.drop(columns=unique_cols_to_drop, inplace=True)
        _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")

    _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")

    return new_df

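# A minimal usage sketch (toy data; 'Sex_male' is replaced by a labeled 'Sex' column):
import pandas as pd

_demo = pd.DataFrame({"Sex_male": [1, 0, 1]})
labeled = reconstruct_binary(_demo, {"Sex": ("Sex_male", "Female", "Male")})
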
def reconstruct_multibinary(
    df: pd.DataFrame,
    pattern: str,
    pos_label: str = "Yes",
    neg_label: str = "No",
    case_sensitive: bool = False,
    verbose: bool = True
) -> Tuple[pd.DataFrame, List[str]]:
    """
    Identifies binary columns matching a regex pattern and converts their numeric
    values (0/1) into categorical string labels (e.g., "No"/"Yes").

    This allows mass-labeling of binary features so they are treated as proper
    categorical variables with meaningful keys during subsequent encoding steps.

    Args:
        df (pd.DataFrame): The input DataFrame.
        pattern (str): Regex pattern to identify the group of binary columns.
        pos_label (str): The label to assign to 1 or True (default "Yes").
        neg_label (str): The label to assign to 0 or False (default "No").
        case_sensitive (bool): If True, regex matching is case-sensitive.
        verbose (bool): If True, prints a summary of the operation.

    Returns:
        Tuple(pd.DataFrame, List[str]):
            - A new DataFrame with the matched columns converted to strings.
            - A list of the column names that were modified.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    new_df = df.copy()

    # 1. Find columns matching the regex
    mask = new_df.columns.str.contains(pattern, case=case_sensitive, regex=True)
    target_columns = new_df.columns[mask].to_list()

    if not target_columns:
        _LOGGER.warning(f"No columns found matching pattern '{pattern}'. Returning original DataFrame.")
        return new_df, list()

    # 2. Define robust mapping (handles ints, floats, and booleans)
    # Note: Any value not in this map will become NaN
    mapping_dict = {
        0: neg_label,
        0.0: neg_label,
        False: neg_label,
        1: pos_label,
        1.0: pos_label,
        True: pos_label
    }

    converted_count = 0

    # 3. Apply mapping
    for col in target_columns:
        # Only map numeric/boolean or object columns; values outside the mapping become NaN
        if is_numeric_dtype(new_df[col]) or is_object_dtype(new_df[col]):
            # We cast to object implicitly by mapping to strings
            new_df[col] = new_df[col].map(mapping_dict)
            converted_count += 1

    if verbose:
        _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")

    return new_df, target_columns

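# A minimal usage sketch (toy data; every column whose name matches 'flag' is relabeled):
import pandas as pd

_demo = pd.DataFrame({"flag_a": [0, 1], "flag_b": [1, 1], "value": [3.2, 4.1]})
labeled, flag_cols = reconstruct_multibinary(_demo, pattern=r"flag")
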
def finalize_feature_schema(
    df_features: pd.DataFrame,
    categorical_mappings: Optional[Dict[str, Dict[str, int]]]
) -> FeatureSchema:
    """
    Analyzes the final features DataFrame to create a definitive schema.

    This function is the "single source of truth" for column order
    and type (categorical vs. continuous) for the entire ML pipeline.

    It should be called at the end of the feature engineering process.

    Args:
        df_features (pd.DataFrame):
            The final, processed DataFrame containing *only* feature columns
            in the exact order they will be fed to the model.
        categorical_mappings (Dict[str, Dict[str, int]] | None):
            The mappings dictionary generated by
            `encode_categorical_features`. Can be None if no
            categorical features exist.

    Returns:
        FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
    """
    feature_names: List[str] = df_features.columns.to_list()

    # Intermediate lists for building
    continuous_feature_names_list: List[str] = []
    categorical_feature_names_list: List[str] = []
    categorical_index_map_dict: Dict[int, int] = {}

    # _LOGGER.info("Finalizing feature schema...")

    if categorical_mappings:
        # --- Categorical features are present ---
        categorical_names_set = set(categorical_mappings.keys())

        for index, name in enumerate(feature_names):
            if name in categorical_names_set:
                # This is a categorical feature
                cardinality = len(categorical_mappings[name])
                categorical_index_map_dict[index] = cardinality
                categorical_feature_names_list.append(name)
            else:
                # This is a continuous feature
                continuous_feature_names_list.append(name)

        # Use the populated dict, or None if it's empty
        final_index_map = categorical_index_map_dict if categorical_index_map_dict else None

    else:
        # --- No categorical features ---
        _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
        continuous_feature_names_list = list(feature_names)
        # categorical_feature_names_list remains empty
        # categorical_index_map_dict remains empty
        final_index_map = None  # Explicitly set to None to match Optional type

    _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")

    # Create the final immutable instance
    schema_instance = FeatureSchema(
        feature_names=tuple(feature_names),
        continuous_feature_names=tuple(continuous_feature_names_list),
        categorical_feature_names=tuple(categorical_feature_names_list),
        categorical_index_map=final_index_map,
        categorical_mappings=categorical_mappings
    )

    return schema_instance

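# A minimal usage sketch (toy features; 'color' uses a hand-written mapping dict in the
# same {label: code} shape that encode_categorical_features produces):
import pandas as pd

_demo_features = pd.DataFrame({"length": [1.2, 3.4], "color": [1, 2]})
_demo_mappings = {"color": {"Other": 0, "blue": 1, "red": 2}}
schema = finalize_feature_schema(_demo_features, categorical_mappings=_demo_mappings)
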
def apply_feature_schema(
    df: pd.DataFrame,
    schema: FeatureSchema,
    targets: Optional[List[str]] = None,
    unknown_value: int = 99999,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Aligns the input DataFrame with the provided FeatureSchema.

    This function aligns data for inference/fine-tuning by enforcing the schema's
    structure and encoding.

    Args:
        df (pd.DataFrame): The input DataFrame.
        schema (FeatureSchema): The schema defining feature names, types, and mappings.
        targets (list[str] | None): Optional list of target column names.
        unknown_value (int): Integer value to assign to unknown categorical levels.
            Defaults to 99999 to avoid collision with existing categories.
        verbose (bool): If True, logs info about dropped extra columns.

    Returns:
        pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.

    Raises:
        ValueError: If any required feature or target column is missing.
    """
    # 1. Setup
    df_processed = df.copy()
    targets = targets if targets is not None else []

    # 2. Validation: Strict Column Presence
    missing_features = [col for col in schema.feature_names if col not in df_processed.columns]
    if missing_features:
        _LOGGER.error(f"Schema Mismatch: Missing required features: {missing_features}")
        raise ValueError()

    # target columns should not be part of feature columns
    if targets:
        overlapping_columns = set(schema.feature_names).intersection(set(targets))
        if overlapping_columns:
            _LOGGER.error(f"Schema Mismatch: Target columns overlap with feature columns: {overlapping_columns}")
            raise ValueError()

        # targets were provided, check their presence
        missing_targets = [col for col in targets if col not in df_processed.columns]
        if missing_targets:
            _LOGGER.error(f"Target Mismatch: Missing target columns: {missing_targets}")
            raise ValueError()

    # 3. Apply Categorical Encoding
    if schema.categorical_feature_names and schema.categorical_mappings:
        for col_name in schema.categorical_feature_names:
            # Should never happen due to schema construction, but double-check and raise
            if col_name not in schema.categorical_mappings:
                _LOGGER.error(f"Schema Inconsistency: No mapping found for categorical feature '{col_name}'.")
                raise ValueError()

            mapping = schema.categorical_mappings[col_name]

            # Apply mapping (unknowns become NaN)
            df_processed[col_name] = df_processed[col_name].astype(str).map(mapping)

            # Handle Unknown Categories
            if df_processed[col_name].isnull().any():
                n_missing = df_processed[col_name].isnull().sum()
                _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")

                # Fill unknowns with the specified integer
                df_processed[col_name] = df_processed[col_name].fillna(unknown_value)

            df_processed[col_name] = df_processed[col_name].astype(int)

    # 4. Reorder and Filter
    final_column_order = list(schema.feature_names) + targets

    extra_cols = set(df_processed.columns) - set(final_column_order)
    if extra_cols:
        _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
        if verbose:
            for extra_column in extra_cols:
                print(f" - Dropping column: '{extra_column}'")

    df_final = df_processed[final_column_order]

    _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")

    # df_final should be a dataframe
    if isinstance(df_final, pd.Series):
        df_final = df_final.to_frame()

    return df_final

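# A minimal usage sketch (toy schema built with finalize_feature_schema above;
# the unseen category 'green' is mapped to unknown_value and 'extra' is dropped):
import pandas as pd

_train_features = pd.DataFrame({"length": [1.2, 3.4], "color": [1, 2]})
_schema = finalize_feature_schema(_train_features, {"color": {"Other": 0, "blue": 1, "red": 2}})
_new_data = pd.DataFrame({"length": [2.0], "color": ["green"], "extra": [0]})
aligned = apply_feature_schema(_new_data, schema=_schema)
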
def _validate_columns(df: pd.DataFrame, columns: list[str]):
    """Returns the subset of `columns` that exist in `df`."""
    valid_columns = [column for column in columns if column in df.columns]
    return valid_columns


def info():
    _script_info(__all__)