dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
- dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
- ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
- ml_tools/ETL_cleaning/_basic_clean.py +351 -0
- ml_tools/ETL_cleaning/_clean_tools.py +128 -0
- ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
- ml_tools/ETL_cleaning/_imprimir.py +13 -0
- ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
- ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
- ml_tools/ETL_engineering/_imprimir.py +24 -0
- ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
- ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
- ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
- ml_tools/GUI_tools/_imprimir.py +12 -0
- ml_tools/IO_tools/_IO_loggers.py +235 -0
- ml_tools/IO_tools/_IO_save_load.py +151 -0
- ml_tools/IO_tools/_IO_utils.py +140 -0
- ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
- ml_tools/IO_tools/_imprimir.py +14 -0
- ml_tools/MICE/_MICE_imputation.py +132 -0
- ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
- ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
- ml_tools/MICE/_imprimir.py +11 -0
- ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
- ml_tools/ML_callbacks/_base.py +101 -0
- ml_tools/ML_callbacks/_checkpoint.py +232 -0
- ml_tools/ML_callbacks/_early_stop.py +208 -0
- ml_tools/ML_callbacks/_imprimir.py +12 -0
- ml_tools/ML_callbacks/_scheduler.py +197 -0
- ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
- ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
- ml_tools/ML_chain/_dragon_chain.py +140 -0
- ml_tools/ML_chain/_imprimir.py +11 -0
- ml_tools/ML_configuration/__init__.py +90 -0
- ml_tools/ML_configuration/_base_model_config.py +69 -0
- ml_tools/ML_configuration/_finalize.py +366 -0
- ml_tools/ML_configuration/_imprimir.py +47 -0
- ml_tools/ML_configuration/_metrics.py +593 -0
- ml_tools/ML_configuration/_models.py +206 -0
- ml_tools/ML_configuration/_training.py +124 -0
- ml_tools/ML_datasetmaster/__init__.py +28 -0
- ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
- ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
- ml_tools/ML_datasetmaster/_imprimir.py +15 -0
- ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
- ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
- ml_tools/ML_evaluation/__init__.py +53 -0
- ml_tools/ML_evaluation/_classification.py +629 -0
- ml_tools/ML_evaluation/_feature_importance.py +409 -0
- ml_tools/ML_evaluation/_imprimir.py +25 -0
- ml_tools/ML_evaluation/_loss.py +92 -0
- ml_tools/ML_evaluation/_regression.py +273 -0
- ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
- ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
- ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
- ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
- ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
- ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
- ml_tools/ML_finalize_handler/__init__.py +10 -0
- ml_tools/ML_finalize_handler/_imprimir.py +8 -0
- ml_tools/ML_inference/__init__.py +22 -0
- ml_tools/ML_inference/_base_inference.py +166 -0
- ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
- ml_tools/ML_inference/_dragon_inference.py +332 -0
- ml_tools/ML_inference/_imprimir.py +11 -0
- ml_tools/ML_inference/_multi_inference.py +180 -0
- ml_tools/ML_inference_sequence/__init__.py +10 -0
- ml_tools/ML_inference_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
- ml_tools/ML_inference_vision/__init__.py +10 -0
- ml_tools/ML_inference_vision/_imprimir.py +8 -0
- ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
- ml_tools/ML_models/__init__.py +32 -0
- ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
- ml_tools/ML_models/_base_mlp_attention.py +198 -0
- ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
- ml_tools/ML_models/_dragon_tabular.py +248 -0
- ml_tools/ML_models/_imprimir.py +18 -0
- ml_tools/ML_models/_mlp_attention.py +134 -0
- ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
- ml_tools/ML_models_sequence/__init__.py +10 -0
- ml_tools/ML_models_sequence/_imprimir.py +8 -0
- ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
- ml_tools/ML_models_vision/__init__.py +29 -0
- ml_tools/ML_models_vision/_base_wrapper.py +254 -0
- ml_tools/ML_models_vision/_image_classification.py +182 -0
- ml_tools/ML_models_vision/_image_segmentation.py +108 -0
- ml_tools/ML_models_vision/_imprimir.py +16 -0
- ml_tools/ML_models_vision/_object_detection.py +135 -0
- ml_tools/ML_optimization/__init__.py +21 -0
- ml_tools/ML_optimization/_imprimir.py +13 -0
- ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
- ml_tools/ML_optimization/_single_dragon.py +203 -0
- ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
- ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
- ml_tools/ML_scaler/__init__.py +10 -0
- ml_tools/ML_scaler/_imprimir.py +8 -0
- ml_tools/ML_trainer/__init__.py +20 -0
- ml_tools/ML_trainer/_base_trainer.py +297 -0
- ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
- ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
- ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
- ml_tools/ML_trainer/_imprimir.py +10 -0
- ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
- ml_tools/ML_utilities/_artifact_finder.py +382 -0
- ml_tools/ML_utilities/_imprimir.py +16 -0
- ml_tools/ML_utilities/_inspection.py +325 -0
- ml_tools/ML_utilities/_train_tools.py +205 -0
- ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
- ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
- ml_tools/ML_vision_transformers/_imprimir.py +14 -0
- ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
- ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
- ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
- ml_tools/PSO_optimization/_imprimir.py +10 -0
- ml_tools/SQL/__init__.py +7 -0
- ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
- ml_tools/SQL/_imprimir.py +8 -0
- ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
- ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
- ml_tools/VIF/_imprimir.py +10 -0
- ml_tools/_core/__init__.py +7 -1
- ml_tools/_core/_logger.py +8 -18
- ml_tools/_core/_schema_load_ops.py +43 -0
- ml_tools/_core/_script_info.py +2 -2
- ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
- ml_tools/data_exploration/_analysis.py +214 -0
- ml_tools/data_exploration/_cleaning.py +566 -0
- ml_tools/data_exploration/_features.py +583 -0
- ml_tools/data_exploration/_imprimir.py +32 -0
- ml_tools/data_exploration/_plotting.py +487 -0
- ml_tools/data_exploration/_schema_ops.py +176 -0
- ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
- ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
- ml_tools/ensemble_evaluation/_imprimir.py +14 -0
- ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
- ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
- ml_tools/ensemble_inference/_imprimir.py +9 -0
- ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
- ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
- ml_tools/ensemble_learning/_imprimir.py +10 -0
- ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
- ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
- ml_tools/excel_handler/_imprimir.py +13 -0
- ml_tools/{keys.py → keys/__init__.py} +4 -1
- ml_tools/keys/_imprimir.py +11 -0
- ml_tools/{_core → keys}/_keys.py +2 -0
- ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
- ml_tools/math_utilities/_imprimir.py +11 -0
- ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
- ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
- ml_tools/optimization_tools/_imprimir.py +13 -0
- ml_tools/optimization_tools/_optimization_bounds.py +236 -0
- ml_tools/optimization_tools/_optimization_plots.py +218 -0
- ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
- ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
- ml_tools/path_manager/_imprimir.py +15 -0
- ml_tools/path_manager/_path_tools.py +346 -0
- ml_tools/plot_fonts/__init__.py +8 -0
- ml_tools/plot_fonts/_imprimir.py +8 -0
- ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
- ml_tools/schema/__init__.py +15 -0
- ml_tools/schema/_feature_schema.py +223 -0
- ml_tools/schema/_gui_schema.py +191 -0
- ml_tools/schema/_imprimir.py +10 -0
- ml_tools/{serde.py → serde/__init__.py} +4 -2
- ml_tools/serde/_imprimir.py +10 -0
- ml_tools/{_core → serde}/_serde.py +3 -8
- ml_tools/{utilities.py → utilities/__init__.py} +11 -6
- ml_tools/utilities/_imprimir.py +18 -0
- ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
- ml_tools/utilities/_utility_tools.py +192 -0
- dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
- ml_tools/ML_chaining_inference.py +0 -8
- ml_tools/ML_configuration.py +0 -86
- ml_tools/ML_configuration_pytab.py +0 -14
- ml_tools/ML_datasetmaster.py +0 -10
- ml_tools/ML_evaluation.py +0 -16
- ml_tools/ML_evaluation_multi.py +0 -12
- ml_tools/ML_finalize_handler.py +0 -8
- ml_tools/ML_inference.py +0 -12
- ml_tools/ML_models.py +0 -14
- ml_tools/ML_models_advanced.py +0 -14
- ml_tools/ML_models_pytab.py +0 -14
- ml_tools/ML_optimization.py +0 -14
- ml_tools/ML_optimization_pareto.py +0 -8
- ml_tools/ML_scaler.py +0 -8
- ml_tools/ML_sequence_datasetmaster.py +0 -8
- ml_tools/ML_sequence_evaluation.py +0 -10
- ml_tools/ML_sequence_inference.py +0 -8
- ml_tools/ML_sequence_models.py +0 -8
- ml_tools/ML_trainer.py +0 -12
- ml_tools/ML_vision_datasetmaster.py +0 -12
- ml_tools/ML_vision_evaluation.py +0 -10
- ml_tools/ML_vision_inference.py +0 -8
- ml_tools/ML_vision_models.py +0 -18
- ml_tools/SQL.py +0 -8
- ml_tools/_core/_ETL_cleaning.py +0 -694
- ml_tools/_core/_IO_tools.py +0 -498
- ml_tools/_core/_ML_callbacks.py +0 -702
- ml_tools/_core/_ML_configuration.py +0 -1332
- ml_tools/_core/_ML_configuration_pytab.py +0 -102
- ml_tools/_core/_ML_evaluation.py +0 -867
- ml_tools/_core/_ML_evaluation_multi.py +0 -544
- ml_tools/_core/_ML_inference.py +0 -646
- ml_tools/_core/_ML_models.py +0 -668
- ml_tools/_core/_ML_models_pytab.py +0 -693
- ml_tools/_core/_ML_trainer.py +0 -2323
- ml_tools/_core/_ML_utilities.py +0 -886
- ml_tools/_core/_ML_vision_models.py +0 -644
- ml_tools/_core/_data_exploration.py +0 -1909
- ml_tools/_core/_optimization_tools.py +0 -493
- ml_tools/_core/_schema.py +0 -359
- ml_tools/plot_fonts.py +0 -8
- ml_tools/schema.py +0 -12
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
- {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration/_cleaning.py (new file)

@@ -0,0 +1,566 @@
import pandas as pd
from pandas.api.types import is_numeric_dtype
from typing import Optional, Union
from pathlib import Path

from ..utilities import save_dataframe_filename

from ..path_manager import make_fullpath
from .._core import get_logger

from ._analysis import show_null_columns


_LOGGER = get_logger("Data Exploration: Cleaning")


__all__ = [
    "drop_constant_columns",
    "drop_rows_with_missing_data",
    "drop_columns_with_missing_data",
    "drop_macro",
    "clean_column_names",
    "clip_outliers_single",
    "clip_outliers_multi",
    "drop_outlier_samples",
    "standardize_percentages",
]

def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """
    Removes columns from a pandas DataFrame that contain only a single unique
    value or are entirely null/NaN.

    This utility is useful for cleaning data by removing constant features that
    have no predictive value.

    Args:
        df (pd.DataFrame):
            The pandas DataFrame to clean.
        verbose (bool):
            If True, prints the names of the columns that were dropped.
            Defaults to True.

    Returns:
        pd.DataFrame:
            A new DataFrame with the constant columns removed.
    """
    if not isinstance(df, pd.DataFrame):
        _LOGGER.error("Input must be a pandas DataFrame.")
        raise TypeError()

    # make copy to avoid modifying original
    df_clean = df.copy()

    original_columns = set(df.columns)
    cols_to_keep = []

    for col_name in df_clean.columns:
        column = df_clean[col_name]

        # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
        if column.nunique(dropna=True) > 1:
            cols_to_keep.append(col_name)

    dropped_columns = original_columns - set(cols_to_keep)
    if verbose:
        if dropped_columns:
            _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
        else:
            _LOGGER.info("No constant columns found.")

    # Return a new DataFrame with only the columns to keep
    df_clean = df_clean[cols_to_keep]

    if isinstance(df_clean, pd.Series):
        df_clean = df_clean.to_frame()

    return df_clean

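Usage sketch for drop_constant_columns (illustrative, not part of the diff; it assumes the subpackage re-exports the helper, which the changed ml_tools/data_exploration/__init__.py entry above suggests):

import pandas as pd
from ml_tools.data_exploration import drop_constant_columns

df = pd.DataFrame({
    "pressure": [1.0, 1.0, 1.0],   # single unique value -> dropped
    "empty": [None, None, None],   # entirely null -> dropped
    "temp": [20.1, 21.5, 19.8],    # varies -> kept
})
cleaned = drop_constant_columns(df)  # logs the two dropped columns
assert list(cleaned.columns) == ["temp"]
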
def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
    """
    Drops rows from the DataFrame using a two-stage strategy:

    1. If `targets` is provided, remove any row where all target columns are missing.
    2. Among the remaining rows, drop any row whose feature columns exceed the
       `threshold` fraction of missing values.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        targets (list[str] | None): List of target column names.
        threshold (float): Maximum allowed fraction of missing values in feature columns.

    Returns:
        pd.DataFrame: A cleaned DataFrame with problematic rows removed.
    """
    df_clean = df.copy()

    # Stage 1: Drop rows with all target columns missing
    valid_targets = []
    if targets:
        # validate targets
        missing_targets = [t for t in targets if t not in df_clean.columns]
        if missing_targets:
            _LOGGER.error(f"Target columns not found in DataFrame: {missing_targets}")
            raise ValueError()
        else:
            valid_targets = targets

        # Only proceed if we actually have columns to check
        if valid_targets:
            target_na = df_clean[valid_targets].isnull().all(axis=1)
            if target_na.any():
                _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
                df_clean = df_clean[~target_na]
            else:
                _LOGGER.info("No rows found where all targets are missing.")
        else:
            _LOGGER.error("Targets list provided but no matching columns found in DataFrame.")
            raise ValueError()

    # Stage 2: Drop rows based on feature column missing values
    feature_cols = [col for col in df_clean.columns if col not in valid_targets]
    if feature_cols:
        feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
        rows_to_drop = feature_na_frac[feature_na_frac > threshold].index  # type: ignore
        if len(rows_to_drop) > 0:
            _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
            df_clean = df_clean.drop(index=rows_to_drop)
        else:
            _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
    else:
        _LOGGER.warning("No feature columns available to evaluate.")

    return df_clean

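A sketch of the two-stage row drop (illustrative data, same import assumption as above):

import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_rows_with_missing_data

df = pd.DataFrame({
    "x1": [1.0, np.nan, 3.0, np.nan],
    "x2": [4.0, np.nan, 6.0, 5.0],
    "y":  [0.0, 1.0, np.nan, 2.0],
})
# Stage 1 drops row 2 (its only target is missing); stage 2 drops row 1
# (2/2 = 100% of its feature values missing, which exceeds the 70% threshold).
cleaned = drop_rows_with_missing_data(df, targets=["y"], threshold=0.7)
assert list(cleaned.index) == [0, 3]
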
def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[list[str]]=None) -> pd.DataFrame:
    """
    Drops columns with more than `threshold` fraction of missing values.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        threshold (float): Fraction of missing values above which columns are dropped.
        show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
        skip_columns (list[str] | None): If given, these columns won't be included in the drop process.

    Returns:
        pd.DataFrame: A new DataFrame without the dropped columns.
    """
    # If skip_columns is provided, create a list of columns to check.
    # Otherwise, check all columns.
    cols_to_check = df.columns
    if skip_columns:
        # Use set difference for efficient exclusion
        cols_to_check = df.columns.difference(skip_columns)

    # Calculate the missing fraction only on the columns to be checked
    missing_fraction = df[cols_to_check].isnull().mean()

    cols_to_drop = missing_fraction[missing_fraction > threshold].index  # type: ignore

    if len(cols_to_drop) > 0:
        _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")

        result_df = df.drop(columns=cols_to_drop)
        if show_nulls_after:
            print(show_null_columns(df=result_df))

        return result_df
    else:
        _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
        return df

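The column-wise counterpart, sketched with show_nulls_after=False so the example does not depend on the plotting helper (illustrative only):

import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_columns_with_missing_data

df = pd.DataFrame({
    "mostly_missing": [np.nan, np.nan, np.nan, 1.0],  # 75% missing -> dropped
    "ok": [1.0, 2.0, np.nan, 4.0],                    # 25% missing -> kept
})
cleaned = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=False)
assert list(cleaned.columns) == ["ok"]
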
def drop_macro(df: pd.DataFrame,
               log_directory: Union[str,Path],
               targets: list[str],
               skip_targets: bool=False,
               threshold: float=0.7) -> pd.DataFrame:
    """
    Iteratively removes rows and columns with excessive missing data.

    This function performs a comprehensive cleaning cycle on a DataFrame. It
    repeatedly drops columns with constant values, followed by rows and columns that exceed
    a specified threshold of missing values. The process continues until the
    DataFrame's dimensions stabilize, ensuring that the interdependency between
    row and column deletions is handled.

    Initial and final missing data reports are saved to the specified log directory.

    Args:
        df (pd.DataFrame): The input pandas DataFrame to be cleaned.
        log_directory (Union[str, Path]): Path to the directory where the missing data reports
            and plots will be saved inside a "Missing Report" subdirectory.
        targets (list[str]): A list of column names to be treated as target
            variables. This list guides the row-dropping logic.
        skip_targets (bool, optional): If True, the columns listed in `targets`
            will be exempt from being dropped, even if they exceed the missing
            data threshold.
        threshold (float, optional): The proportion of missing data required to drop
            a row or column. For example, 0.7 means a row/column will be
            dropped if more than 70% of its data is missing.

    Returns:
        pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
    """
    # make a deep copy to work with
    df_clean = df.copy()

    base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
    full_path = base_dir_path / "Missing Report"

    # Log initial state + Plot
    missing_data_start = show_null_columns(
        df=df_clean,
        plot_to_dir=full_path,
        plot_filename="Original",
        use_all_columns=True
    )
    save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
                            save_dir=full_path,
                            filename="Missing_Data_Original")

    # Clean cycles for rows and columns
    master = True
    while master:
        # track rows and columns
        initial_rows, initial_columns = df_clean.shape

        # drop constant columns
        df_clean = drop_constant_columns(df=df_clean)

        # clean rows
        df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)

        # clean columns
        if skip_targets:
            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
        else:
            df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)

        # stop once a full pass drops nothing
        remaining_rows, remaining_columns = df_clean.shape
        if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
            master = False

    # log final state + plot
    missing_data_final = show_null_columns(
        df=df_clean,
        plot_to_dir=full_path,
        plot_filename="Processed",
        use_all_columns=True
    )
    save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
                            save_dir=full_path,
                            filename="Missing_Data_Processed")

    # return cleaned dataframe
    return df_clean

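A call sketch for the full cycle (hypothetical file and column names; drop_macro also writes reports and plots under log_directory/"Missing Report", so it needs a writable directory):

import pandas as pd
from ml_tools.data_exploration import drop_macro

df = pd.read_csv("raw_data.csv")  # hypothetical input
cleaned = drop_macro(
    df,
    log_directory="cleaning_logs",
    targets=["target_a", "target_b"],   # guides the row-dropping stage
    skip_targets=True,                  # exempt targets from column drops
    threshold=0.7,
)
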
def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacement_pattern: str = r'[\[\]{}<>,:"]', verbose: bool = True) -> pd.DataFrame:
    """
    Cleans DataFrame column names by replacing special characters.

    This function is useful for ensuring compatibility with libraries like LightGBM,
    which do not support special JSON characters such as `[]{}<>,:"` in feature names.

    Args:
        df (pd.DataFrame): The input DataFrame.
        replacement_char (str): The character that replaces each matched character.
        replacement_pattern (str): Regex pattern matching the characters to replace.
        verbose (bool): If True, prints the renamed columns.

    Returns:
        pd.DataFrame: A new DataFrame with cleaned column names.
    """
    new_df = df.copy()

    original_columns = new_df.columns
    new_columns = original_columns.str.replace(replacement_pattern, replacement_char, regex=True)

    # Create a map of changes for logging
    rename_map = {old: new for old, new in zip(original_columns, new_columns) if old != new}

    if verbose:
        if rename_map:
            _LOGGER.info(f"Cleaned {len(rename_map)} column name(s) containing special characters:")
            for old, new in rename_map.items():
                print(f"  '{old}' -> '{new}'")
        else:
            _LOGGER.info("No column names required cleaning.")

    new_df.columns = new_columns
    return new_df

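Usage sketch (illustrative):

import pandas as pd
from ml_tools.data_exploration import clean_column_names

df = pd.DataFrame(columns=['conc [mg/L]', 'ratio{a:b}', 'temp'])
renamed = clean_column_names(df)
# 'conc [mg/L]' -> 'conc -mg/L-', 'ratio{a:b}' -> 'ratio-a-b-', 'temp' unchanged
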
def clip_outliers_single(
    df: pd.DataFrame,
    column: str,
    min_val: float,
    max_val: float
) -> Union[pd.DataFrame, None]:
    """
    Clips values in the specified numeric column to the range [min_val, max_val],
    and returns a new DataFrame where the original column is replaced by the clipped version.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column to clip.
        min_val (float): Minimum allowable value; values below are clipped to this.
        max_val (float): Maximum allowable value; values above are clipped to this.

    Returns:
        pd.DataFrame: A new DataFrame with the specified column clipped in place.

        None: if a problem with the DataFrame column occurred.
    """
    if column not in df.columns:
        _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
        return None

    if not pd.api.types.is_numeric_dtype(df[column]):
        _LOGGER.warning(f"Column '{column}' must be numeric.")
        return None

    new_df = df.copy(deep=True)
    new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)

    _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
    return new_df

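Usage sketch (illustrative; note the Optional return):

import pandas as pd
from ml_tools.data_exploration import clip_outliers_single

df = pd.DataFrame({"age": [-3, 25, 142]})
clipped = clip_outliers_single(df, column="age", min_val=0, max_val=120)
if clipped is not None:              # None signals a missing or non-numeric column
    print(clipped["age"].tolist())   # [0, 25, 120]
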
def clip_outliers_multi(
    df: pd.DataFrame,
    clip_dict: Union[dict[str, tuple[int, int]], dict[str, tuple[float, float]]],
    verbose: bool=False
) -> pd.DataFrame:
    """
    Clips values in multiple specified numeric columns to given [min, max] ranges,
    updating values (deep copy) and skipping invalid entries.

    Args:
        df (pd.DataFrame): The input DataFrame.
        clip_dict (dict): A dictionary where keys are column names and values are (min_val, max_val) tuples.
        verbose (bool): Prints the clipped range for each column.

    Returns:
        pd.DataFrame: A new DataFrame with specified columns clipped.

    Notes:
        - Invalid specifications (missing column, non-numeric type, wrong tuple length)
          will be reported but skipped.
    """
    new_df = df.copy()
    skipped_columns = []
    clipped_columns = 0

    for col, bounds in clip_dict.items():
        try:
            if col not in df.columns:
                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
                raise ValueError()

            if not pd.api.types.is_numeric_dtype(df[col]):
                _LOGGER.error(f"Column '{col}' is not numeric.")
                raise TypeError()

            if not (isinstance(bounds, tuple) and len(bounds) == 2):
                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
                raise ValueError()

            min_val, max_val = bounds
            new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
            if verbose:
                print(f"Clipped '{col}' to range [{min_val}, {max_val}].")
            clipped_columns += 1

        except Exception as e:
            skipped_columns.append((col, str(e)))
            continue

    _LOGGER.info(f"Clipped {clipped_columns} columns.")

    if skipped_columns:
        _LOGGER.warning("Skipped columns:")
        for col, _msg in skipped_columns:
            print(f"  - {col}")

    return new_df

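The multi-column variant, sketched with one valid and one deliberately bad entry (illustrative):

import pandas as pd
from ml_tools.data_exploration import clip_outliers_multi

df = pd.DataFrame({"ph": [-1.0, 7.2, 15.3]})
clipped = clip_outliers_multi(
    df,
    clip_dict={"ph": (0.0, 14.0), "missing_col": (0, 1)},  # second entry is skipped
    verbose=True,
)
# ph -> [0.0, 7.2, 14.0]; 'missing_col' is reported and skipped, not fatal
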
def drop_outlier_samples(
    df: pd.DataFrame,
    bounds_dict: dict[str, tuple[Union[int, float], Union[int, float]]],
    drop_on_nulls: bool = False,
    verbose: bool = True
) -> pd.DataFrame:
    """
    Drops entire rows where values in specified numeric columns fall outside
    a given [min, max] range.

    This function processes a copy of the DataFrame, ensuring the original is
    not modified. It skips columns with invalid specifications.

    Args:
        df (pd.DataFrame): The input DataFrame.
        bounds_dict (dict): A dictionary where keys are column names and values
            are (min_val, max_val) tuples defining the valid range.
        drop_on_nulls (bool): If True, rows with NaN/None in a checked column
            will also be dropped. If False, NaN/None are ignored.
        verbose (bool): If True, prints the number of rows dropped for each column.

    Returns:
        pd.DataFrame: A new DataFrame with the outlier rows removed.

    Notes:
        - Invalid specifications (e.g., missing column, non-numeric type,
          incorrectly formatted bounds) will be reported and skipped.
    """
    new_df = df.copy()
    skipped_columns: list[tuple[str, str]] = []
    initial_rows = len(new_df)

    for col, bounds in bounds_dict.items():
        try:
            # --- Validation Checks ---
            if col not in df.columns:
                _LOGGER.error(f"Column '{col}' not found in DataFrame.")
                raise ValueError()

            if not pd.api.types.is_numeric_dtype(df[col]):
                _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
                raise TypeError()

            if not (isinstance(bounds, tuple) and len(bounds) == 2):
                _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
                raise ValueError()

            # --- Filtering Logic ---
            min_val, max_val = bounds
            rows_before_drop = len(new_df)

            # Create the base mask for values within the specified range
            # .between() is inclusive and evaluates to False for NaN
            mask_in_bounds = new_df[col].between(min_val, max_val)

            if drop_on_nulls:
                # Keep only rows that are within bounds.
                # Since mask_in_bounds is False for NaN, nulls are dropped.
                final_mask = mask_in_bounds
            else:
                # Keep rows that are within bounds OR are null.
                mask_is_null = new_df[col].isnull()
                final_mask = mask_in_bounds | mask_is_null

            # Apply the final mask
            new_df = new_df[final_mask]

            rows_after_drop = len(new_df)

            if verbose:
                dropped_count = rows_before_drop - rows_after_drop
                if dropped_count > 0:
                    print(
                        f"  - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
                    )

        except (ValueError, TypeError) as e:
            skipped_columns.append((col, str(e)))
            continue

    total_dropped = initial_rows - len(new_df)
    _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")

    if skipped_columns:
        _LOGGER.warning("Skipped the following columns due to errors:")
        for col, _msg in skipped_columns:
            # Only print the column name for cleaner output as the error was already logged
            print(f"  - {col}")

    # if new_df is a series, convert to dataframe
    if isinstance(new_df, pd.Series):
        new_df = new_df.to_frame()

    return new_df

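Usage sketch showing the drop_on_nulls distinction (illustrative):

import numpy as np
import pandas as pd
from ml_tools.data_exploration import drop_outlier_samples

df = pd.DataFrame({"temp": [21.0, 400.0, np.nan, 23.5]})
# Default drop_on_nulls=False keeps the NaN row; only 400.0 is out of [-50, 60].
cleaned = drop_outlier_samples(df, bounds_dict={"temp": (-50, 60)})
assert list(cleaned.index) == [0, 2, 3]
# With drop_on_nulls=True the NaN row would be removed as well.
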
def standardize_percentages(
    df: pd.DataFrame,
    columns: list[str],
    treat_one_as_proportion: bool = True,
    round_digits: int = 2,
    verbose: bool=True
) -> pd.DataFrame:
    """
    Standardizes numeric columns containing mixed-format percentages.

    This function cleans columns where percentages might be entered both as whole
    numbers (55) and as proportions (0.55). It assumes values
    between 0 and 1 are proportions and multiplies them by 100.

    Args:
        df (pd.DataFrame): The input pandas DataFrame.
        columns (list[str]): A list of column names to standardize.
        treat_one_as_proportion (bool):
            - If True (default): The value `1` is treated as a proportion and converted to `100%`.
            - If False: The value `1` is treated as `1%`.
        round_digits (int): The number of decimal places to round the final result to.
        verbose (bool): If True, prints the columns that were standardized.

    Returns:
        pd.DataFrame:
            A new DataFrame with the specified columns cleaned and standardized.
    """
    df_copy = df.copy()

    if df_copy.empty:
        return df_copy

    # This helper function contains the core cleaning logic
    def _clean_value(x: float) -> float:
        """Applies the standardization rule to a single value."""
        if pd.isna(x):
            return x

        # If treat_one_as_proportion is True, the range for proportions is [0, 1]
        if treat_one_as_proportion and 0 <= x <= 1:
            return x * 100
        # If False, the range for proportions is [0, 1) (1 is excluded)
        elif not treat_one_as_proportion and 0 <= x < 1:
            return x * 100

        # Otherwise, the value is assumed to be a correctly formatted percentage
        return x

    fixed_columns: list[str] = list()

    for col in columns:
        # --- Robustness Checks ---
        if col not in df_copy.columns:
            _LOGGER.warning(f"Column '{col}' not found. Skipping.")
            continue

        if not is_numeric_dtype(df_copy[col]):
            _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
            continue

        # --- Applying the Logic ---
        # Apply the cleaning function to every value in the column
        df_copy[col] = df_copy[col].apply(_clean_value)

        # Round the result
        df_copy[col] = df_copy[col].round(round_digits)

        fixed_columns.append(col)

    if verbose:
        _LOGGER.info("Columns standardized:")
        for fixed_col in fixed_columns:
            print(f"  '{fixed_col}'")

    return df_copy

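Finally, a sketch of the percentage standardizer (illustrative, same import assumption as the earlier examples):

import pandas as pd
from ml_tools.data_exploration import standardize_percentages

df = pd.DataFrame({"yield_pct": [0.55, 55.0, 1.0, 0.07]})
fixed = standardize_percentages(df, columns=["yield_pct"])
# 0.55 -> 55.0, 1.0 -> 100.0 (treat_one_as_proportion=True), 0.07 -> 7.0; 55.0 unchanged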