dragon-ml-toolbox 19.13.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.13.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1901
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.13.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
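The file listing shows that 20.0.0 reorganizes the flat `ml_tools` modules (and their `_core` implementations) into per-module packages, each with an `__init__.py` and an `_imprimir.py`. As a rough usage sketch of what the layout change means for callers, assuming (not confirmed by this diff) that the new `__init__.py` files re-export the same public names the old wrapper modules exposed, imports through the public package path would keep working unchanged:

# Hypothetical sketch - assumes ml_tools/data_exploration/__init__.py re-exports
# the public functions listed in the old module's __all__ (e.g. summarize_dataframe).
import pandas as pd
from ml_tools.data_exploration import summarize_dataframe, drop_constant_columns

df = pd.DataFrame({"a": [1, 2, 2], "b": [None, None, None]})
print(summarize_dataframe(df))        # per-column dtypes, null counts, basic stats
df_clean = drop_constant_columns(df)  # drops 'b' (all-NaN / constant column)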
ml_tools/_core/_data_exploration.py
@@ -1,1901 +0,0 @@
- import pandas as pd
- from pandas.api.types import is_numeric_dtype, is_object_dtype
- import numpy as np
- import matplotlib.pyplot as plt
- import seaborn as sns
- from typing import Union, Literal, Dict, Tuple, List, Optional, Any
- from pathlib import Path
- import re
-
- from ._path_manager import sanitize_filename, make_fullpath
- from ._script_info import _script_info
- from ._logger import get_logger
- from ._utilities import save_dataframe_filename
- from ._schema import FeatureSchema
-
-
- _LOGGER = get_logger("Data Exploration")
-
-
- __all__ = [
-     "summarize_dataframe",
-     "drop_constant_columns",
-     "drop_rows_with_missing_data",
-     "show_null_columns",
-     "drop_columns_with_missing_data",
-     "drop_macro",
-     "clean_column_names",
-     "plot_value_distributions",
-     "plot_continuous_vs_target",
-     "plot_categorical_vs_target",
-     "split_features_targets",
-     "encode_categorical_features",
-     "clip_outliers_single",
-     "clip_outliers_multi",
-     "drop_outlier_samples",
-     "plot_correlation_heatmap",
-     "finalize_feature_schema",
-     "match_and_filter_columns_by_regex",
-     "standardize_percentages",
-     "reconstruct_one_hot",
-     "reconstruct_binary",
-     "reconstruct_multibinary",
-     "split_continuous_binary",
-     "apply_feature_schema"
- ]
-
-
- def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
-     """
-     Returns a summary DataFrame with data types, non-null counts, number of unique values,
-     missing value percentage, and basic statistics for each column.
-
-     Parameters:
-         df (pd.DataFrame): The input DataFrame.
-         round_digits (int): Decimal places to round numerical statistics.
-
-     Returns:
-         pd.DataFrame: Summary table.
-     """
-     summary = pd.DataFrame({
-         'Data Type': df.dtypes,
-         'Non-Null Count': df.notnull().sum(),
-         'Unique Values': df.nunique(),
-         'Missing %': (df.isnull().mean() * 100).round(round_digits)
-     })
-
-     # For numeric columns, add summary statistics
-     numeric_cols = df.select_dtypes(include='number').columns
-     if not numeric_cols.empty:
-         summary_numeric = df[numeric_cols].describe().T[
-             ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
-         ].round(round_digits)
-         summary = summary.join(summary_numeric, how='left')
-
-     print(f"DataFrame Shape: {df.shape}")
-     return summary
-
-
- def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
-     """
-     Removes columns from a pandas DataFrame that contain only a single unique
-     value or are entirely null/NaN.
-
-     This utility is useful for cleaning data by removing constant features that
-     have no predictive value.
-
-     Args:
-         df (pd.DataFrame):
-             The pandas DataFrame to clean.
-         verbose (bool):
-             If True, prints the names of the columns that were dropped.
-             Defaults to True.
-
-     Returns:
-         pd.DataFrame:
-             A new DataFrame with the constant columns removed.
-     """
-     if not isinstance(df, pd.DataFrame):
-         _LOGGER.error("Input must be a pandas DataFrame.")
-         raise TypeError()
-
-     # make copy to avoid modifying original
-     df_clean = df.copy()
-
-     original_columns = set(df.columns)
-     cols_to_keep = []
-
-     for col_name in df_clean.columns:
-         column = df_clean[col_name]
-
-         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
-         if column.nunique(dropna=True) > 1:
-             cols_to_keep.append(col_name)
-
-     dropped_columns = original_columns - set(cols_to_keep)
-     if verbose:
-         if dropped_columns:
-             _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
-         else:
-             _LOGGER.info("No constant columns found.")
-
-     # Return a new DataFrame with only the columns to keep
-     df_clean = df_clean[cols_to_keep]
-
-     if isinstance(df_clean, pd.Series):
-         df_clean = df_clean.to_frame()
-
-     return df_clean
-
-
- def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
-     """
-     Drops rows from the DataFrame using a two-stage strategy:
-
-     1. If `targets`, remove any row where all target columns are missing.
-     2. Among features, drop those with more than `threshold` fraction of missing values.
-
-     Parameters:
-         df (pd.DataFrame): The input DataFrame.
-         targets (list[str] | None): List of target column names.
-         threshold (float): Maximum allowed fraction of missing values in feature columns.
-
-     Returns:
-         pd.DataFrame: A cleaned DataFrame with problematic rows removed.
-     """
-     df_clean = df.copy()
-
-     # Stage 1: Drop rows with all target columns missing
-     valid_targets = []
-     if targets:
-         # validate targets
-         valid_targets = _validate_columns(df_clean, targets)
-
-         # Only proceed if we actually have columns to check
-         if valid_targets:
-             target_na = df_clean[valid_targets].isnull().all(axis=1)
-             if target_na.any():
-                 _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
-                 df_clean = df_clean[~target_na]
-             else:
-                 _LOGGER.info("No rows found where all targets are missing.")
-         else:
-             _LOGGER.error("Targets list provided but no matching columns found in DataFrame.")
-             raise ValueError()
-
-     # Stage 2: Drop rows based on feature column missing values
-     feature_cols = [col for col in df_clean.columns if col not in valid_targets]
-     if feature_cols:
-         feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
-         rows_to_drop = feature_na_frac[feature_na_frac > threshold].index # type: ignore
-         if len(rows_to_drop) > 0:
-             _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
-             df_clean = df_clean.drop(index=rows_to_drop)
-         else:
-             _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
-     else:
-         _LOGGER.warning("No feature columns available to evaluate.")
-
-     return df_clean
-
-
- def show_null_columns(
-     df: pd.DataFrame,
-     round_digits: int = 2,
-     plot_to_dir: Optional[Union[str, Path]] = None,
-     plot_filename: Optional[str] = None,
-     use_all_columns: bool = False
- ) -> pd.DataFrame:
-     """
-     Returns a table of columns with missing values, showing both the count and
-     percentage of missing entries per column.
-
-     Optionally generates a visualization of the missing data profile.
-
-     Parameters:
-         df (pd.DataFrame): The input DataFrame.
-         round_digits (int): Number of decimal places for the percentage.
-         plot_to_dir (str | Path | None): If provided, saves a visualization of the
-             missing data to this directory.
-         plot_filename (str): The filename for the saved plot (without extension).
-             Used only if `plot_to_dir` is set.
-         use_all_columns (bool): If True, includes all columns in the summary and plot,
-             even those with no missing values.
-
-     Returns:
-         pd.DataFrame: A DataFrame summarizing missing values in each column.
-     """
-     null_counts = df.isnull().sum()
-     null_percent = df.isnull().mean() * 100
-
-     if use_all_columns:
-         null_summary = pd.DataFrame({
-             'Missing Count': null_counts,
-             'Missing %': null_percent.round(round_digits)
-         })
-     else:
-         # Filter only columns with at least one null
-         mask = null_counts > 0
-         null_summary = pd.DataFrame({
-             'Missing Count': null_counts[mask],
-             'Missing %': null_percent[mask].round(round_digits)
-         })
-
-     # Sort by descending percentage of missing values
-     null_summary = null_summary.sort_values(by='Missing %', ascending=False)
-
-     # --- Visualization Logic ---
-     if plot_to_dir:
-         if null_summary.empty:
-             _LOGGER.info("No missing data found. Skipping plot generation.")
-         else:
-             try:
-                 # Validate and create save directory
-                 save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")
-
-                 # Prepare data
-                 features = null_summary.index.tolist()
-                 missing_pct = np.array(null_summary['Missing %'].values)
-                 present_pct = 100 - missing_pct
-                 n_features = len(features)
-
-                 # Dynamic width
-                 width = max(10, n_features * 0.4)
-                 plt.figure(figsize=(width, 8))
-
-                 # Stacked Bar Chart Logic
-
-                 # Grid behind bars
-                 plt.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)
-
-                 # 1. Present Data: Solid Green
-                 plt.bar(
-                     features,
-                     present_pct,
-                     color='tab:green',
-                     label='Present',
-                     width=0.6,
-                     zorder=3
-                 )
-
-                 # 2. Missing Data: Transparent Red Fill + Solid Red Hatch
-                 # define facecolor (fill) with alpha, but edgecolor (lines) without alpha.
-                 plt.bar(
-                     features,
-                     missing_pct,
-                     bottom=present_pct,
-                     facecolor=(1.0, 1.0, 1.0, 0.2), # RGBA
-                     edgecolor='tab:red', # Solid red for the hatch lines
-                     hatch='///', # hatch pattern
-                     linewidth=0.4, # Ensure lines are thick enough to see
-                     label='Missing',
-                     width=0.6,
-                     zorder=3
-                 )
-
-                 # Styling
-                 plt.ylim(0, 100)
-                 plt.ylabel("Data Completeness (%)", fontsize=13)
-                 plt.yticks(np.arange(0, 101, 10))
-                 plot_title = f"Missing Data - {plot_filename.replace('_', ' ')}" if plot_filename else "Missing Data"
-                 plt.title(plot_title)
-                 plt.xticks(rotation=45, ha='right', fontsize=9)
-
-                 # Reference line
-                 plt.axhline(y=100, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
-
-                 plt.legend(loc='lower right', framealpha=0.95)
-                 plt.tight_layout()
-
-                 # Save
-                 if plot_filename is None or plot_filename.strip() == "":
-                     plot_filename = "Missing_Data_Profile"
-                 else:
-                     plot_filename = "Missing_Data_" + sanitize_filename(plot_filename)
-
-                 full_filename = plot_filename + ".svg"
-                 plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
-                 plt.close()
-
-                 _LOGGER.info(f"Saved missing data plot as '{full_filename}'")
-
-             except Exception as e:
-                 _LOGGER.error(f"Failed to generate missing data plot. Error: {e}")
-                 plt.close()
-
-     return null_summary
-
-
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
-     """
-     Drops columns with more than `threshold` fraction of missing values.
-
-     Parameters:
-         df (pd.DataFrame): The input DataFrame.
-         threshold (float): Fraction of missing values above which columns are dropped.
-         show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
-         skip_columns (list[str] | None): If given, these columns wont be included in the drop process.
-
-     Returns:
-         pd.DataFrame: A new DataFrame without the dropped columns.
-     """
-     # If skip_columns is provided, create a list of columns to check.
-     # Otherwise, check all columns.
-     cols_to_check = df.columns
-     if skip_columns:
-         # Use set difference for efficient exclusion
-         cols_to_check = df.columns.difference(skip_columns)
-
-     # Calculate the missing fraction only on the columns to be checked
-     missing_fraction = df[cols_to_check].isnull().mean()
-
-
-     cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
-
-     if len(cols_to_drop) > 0:
-         _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
-
-         result_df = df.drop(columns=cols_to_drop)
-         if show_nulls_after:
-             print(show_null_columns(df=result_df))
-
-         return result_df
-     else:
-         _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
-         return df
-
-
- def drop_macro(df: pd.DataFrame,
-                log_directory: Union[str,Path],
-                targets: list[str],
-                skip_targets: bool=False,
-                threshold: float=0.7) -> pd.DataFrame:
-     """
-     Iteratively removes rows and columns with excessive missing data.
-
-     This function performs a comprehensive cleaning cycle on a DataFrame. It
-     repeatedly drops columns with constant values, followed by rows and columns that exceed
-     a specified threshold of missing values. The process continues until the
-     DataFrame's dimensions stabilize, ensuring that the interdependency between
-     row and column deletions is handled.
-
-     Initial and final missing data reports are saved to the specified log directory.
-
-     Args:
-         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
-         log_directory (Union[str, Path]): Path to the directory where the missing data reports
-             and plots will be saved inside a "Missing Report" subdirectory.
-         targets (list[str]): A list of column names to be treated as target
-             variables. This list guides the row-dropping logic.
-         skip_targets (bool, optional): If True, the columns listed in `targets`
-             will be exempt from being dropped, even if they exceed the missing
-             data threshold.
-         threshold (float, optional): The proportion of missing data required to drop
-             a row or column. For example, 0.7 means a row/column will be
-             dropped if 70% or more of its data is missing.
-
-     Returns:
-         pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
-     """
-     # make a deep copy to work with
-     df_clean = df.copy()
-
-     base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
-     full_path = base_dir_path / "Missing Report"
-
-     # Log initial state + Plot
-     missing_data_start = show_null_columns(
-         df=df_clean,
-         plot_to_dir=full_path,
-         plot_filename="Original",
-         use_all_columns=True
-     )
-     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
-                             save_dir=full_path,
-                             filename="Missing_Data_Original")
-
-     # Clean cycles for rows and columns
-     master = True
-     while master:
-         # track rows and columns
-         initial_rows, initial_columns = df_clean.shape
-
-         # drop constant columns
-         df_clean = drop_constant_columns(df=df_clean)
-
-         # clean rows
-         df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
-
-         # clean columns
-         if skip_targets:
-             df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
-         else:
-             df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
-
-         # cleaned?
-         remaining_rows, remaining_columns = df_clean.shape
-         if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
-             master = False
-
-     # log final state + plot
-     missing_data_final = show_null_columns(
-         df=df_clean,
-         plot_to_dir=full_path,
-         plot_filename="Processed",
-         use_all_columns=True
-     )
-     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
-                             save_dir=full_path,
-                             filename="Missing_Data_Processed")
-
-     # return cleaned dataframe
-     return df_clean
-
-
- def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacement_pattern: str = r'[\[\]{}<>,:"]', verbose: bool = True) -> pd.DataFrame:
-     """
-     Cleans DataFrame column names by replacing special characters.
-
-     This function is useful for ensuring compatibility with libraries like LightGBM,
-     which do not support special JSON characters such as `[]{}<>,:"` in feature names.
-
-     Args:
-         df (pd.DataFrame): The input DataFrame.
-         replacement_char (str): The character to use for replacing characters.
-         replacement_pattern (str): Regex pattern to use for the replacement logic.
-         verbose (bool): If True, prints the renamed columns.
-
-     Returns:
-         pd.DataFrame: A new DataFrame with cleaned column names.
-     """
-     new_df = df.copy()
-
-     original_columns = new_df.columns
-     new_columns = original_columns.str.replace(replacement_pattern, replacement_char, regex=True)
-
-     # Create a map of changes for logging
-     rename_map = {old: new for old, new in zip(original_columns, new_columns) if old != new}
-
-     if verbose:
-         if rename_map:
-             _LOGGER.info(f"Cleaned {len(rename_map)} column name(s) containing special characters:")
-             for old, new in rename_map.items():
-                 print(f" '{old}' -> '{new}'")
-         else:
-             _LOGGER.info("No column names required cleaning.")
-
-     new_df.columns = new_columns
-     return new_df
-
-
- def plot_value_distributions(
-     df: pd.DataFrame,
-     save_dir: Union[str, Path],
-     categorical_columns: Optional[List[str]] = None,
-     max_categories: int = 100,
-     fill_na_with: str = "MISSING DATA"
- ):
-     """
-     Plots and saves the value distributions for all columns in a DataFrame,
-     using the best plot type for each column (histogram or count plot).
-
-     Plots are saved as SVG files under two subdirectories in `save_dir`:
-     - "Distribution_Continuous" for continuous numeric features (histograms).
-     - "Distribution_Categorical" for categorical features (count plots).
-
-     Args:
-         df (pd.DataFrame): The input DataFrame to analyze.
-         save_dir (str | Path): Directory path to save the plots.
-         categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
-         max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
-         fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
-
-     Notes:
-         - `seaborn.histplot` with KDE is used for continuous features.
-         - `seaborn.countplot` is used for categorical features.
-     """
-     # 1. Setup save directories
-     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-     numeric_dir = base_save_path / "Distribution_Continuous"
-     categorical_dir = base_save_path / "Distribution_Categorical"
-     numeric_dir.mkdir(parents=True, exist_ok=True)
-     categorical_dir.mkdir(parents=True, exist_ok=True)
-
-     # 2. Filter columns to plot
-     columns_to_plot = df.columns.to_list()
-
-     # Setup for forced categorical logic
-     categorical_set = set(categorical_columns) if categorical_columns is not None else None
-
-     numeric_plots_saved = 0
-     categorical_plots_saved = 0
-
-     for col_name in columns_to_plot:
-         try:
-             is_numeric = is_numeric_dtype(df[col_name])
-             n_unique = df[col_name].nunique()
-
-             # --- 3. Determine Plot Type ---
-             is_continuous = False
-             if categorical_set is not None:
-                 # Use the explicit list
-                 if col_name not in categorical_set:
-                     is_continuous = True
-             else:
-                 # Use auto-detection
-                 if is_numeric:
-                     is_continuous = True
-
-             # --- Case 1: Continuous Numeric (Histogram) ---
-             if is_continuous:
-                 plt.figure(figsize=(10, 6))
-                 # Drop NaNs for histogram, as they can't be plotted on a numeric axis
-                 sns.histplot(x=df[col_name].dropna(), kde=True, bins=30)
-                 plt.title(f"Distribution of '{col_name}' (Continuous)")
-                 plt.xlabel(col_name)
-                 plt.ylabel("Count")
-
-                 save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
-                 numeric_plots_saved += 1
-
-             # --- Case 2: Categorical (Count Plot) ---
-             else:
-                 # Check max categories
-                 if n_unique > max_categories:
-                     _LOGGER.warning(f"Skipping plot for '{col_name}': {n_unique} unique values > {max_categories} max_categories.")
-                     continue
-
-                 # Adaptive figure size
-                 fig_width = max(10, n_unique * 0.5)
-                 plt.figure(figsize=(fig_width, 8))
-
-                 # Make a temporary copy for plotting to handle NaNs
-                 temp_series = df[col_name].copy()
-
-                 # Handle NaNs by replacing them with the specified string
-                 if temp_series.isnull().any():
-                     # Convert to object type first to allow string replacement
-                     temp_series = temp_series.astype(object).fillna(fill_na_with)
-
-                 # Convert all to string to be safe (handles low-card numeric)
-                 temp_series = temp_series.astype(str)
-
-                 # Get category order by frequency
-                 order = temp_series.value_counts().index
-                 sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
-
-                 plt.title(f"Distribution of '{col_name}' (Categorical)")
-                 plt.xlabel(col_name)
-                 plt.ylabel("Count")
-
-                 # Smart tick rotation
-                 max_label_len = 0
-                 if n_unique > 0:
-                     max_label_len = max(len(str(s)) for s in order)
-
-                 # Rotate if labels are long OR there are many categories
-                 if max_label_len > 10 or n_unique > 25:
-                     plt.xticks(rotation=45, ha='right')
-
-                 save_path = categorical_dir / f"{sanitize_filename(col_name)}.svg"
-                 categorical_plots_saved += 1
-
-             # --- 4. Save Plot ---
-             plt.grid(True, linestyle='--', alpha=0.6, axis='y')
-             plt.tight_layout()
-             # Save as .svg
-             plt.savefig(save_path, format='svg', bbox_inches="tight")
-             plt.close()
-
-         except Exception as e:
-             _LOGGER.error(f"Failed to plot distribution for '{col_name}'. Error: {e}")
-             plt.close()
-
-     _LOGGER.info(f"Saved {numeric_plots_saved} continuous distribution plots to '{numeric_dir.name}'.")
-     _LOGGER.info(f"Saved {categorical_plots_saved} categorical distribution plots to '{categorical_dir.name}'.")
-
-
- def plot_continuous_vs_target(
-     df: pd.DataFrame,
-     targets: List[str],
-     save_dir: Union[str, Path],
-     features: Optional[List[str]] = None
- ):
-     """
-     Plots each continuous feature against each target to visualize linear relationships.
-
-     This function is a common EDA step for regression tasks. It creates a
-     scatter plot for each feature-target pair, overlays a simple linear
-     regression line, and saves each plot as an individual .svg file.
-
-     Plots are saved in a structured way, with a subdirectory created for
-     each target variable.
-
-     Args:
-         df (pd.DataFrame): The input DataFrame.
-         targets (List[str]): A list of target column names to plot (y-axis).
-         save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
-         features (List[str] | None): A list of feature column names to plot (x-axis). If None, all non-target columns in the
-             DataFrame will be used.
-
-     Notes:
-         - Only numeric features and numeric targets are processed. Non-numeric
-           columns in the lists will be skipped with a warning.
-         - Rows with NaN in either the feature or the target are dropped
-           pairwise for each plot.
-     """
-     # 1. Validate the base save directory
-     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-     # 2. Validate helper
-     def _validate_numeric_cols(col_list: List[str], col_type: str) -> List[str]:
-         valid_cols = []
-         for col in col_list:
-             if col not in df.columns:
-                 _LOGGER.warning(f"{col_type} column '{col}' not found. Skipping.")
-             elif not is_numeric_dtype(df[col]):
-                 _LOGGER.warning(f"{col_type} column '{col}' is not numeric. Skipping.")
-             else:
-                 valid_cols.append(col)
-         return valid_cols
-
-     # 3. Validate target columns FIRST
-     valid_targets = _validate_numeric_cols(targets, "Target")
-     if not valid_targets:
-         _LOGGER.error("No valid numeric target columns provided to plot.")
-         return
-
-     # 4. Determine and validate feature columns
-     if features is None:
-         _LOGGER.info("No 'features' list provided. Using all non-target columns as features.")
-         target_set = set(valid_targets)
-         # Get all columns that are not in the valid_targets set
-         features_to_validate = [col for col in df.columns if col not in target_set]
-     else:
-         features_to_validate = features
-
-     valid_features = _validate_numeric_cols(features_to_validate, "Feature")
-
-     if not valid_features:
-         _LOGGER.error("No valid numeric feature columns found to plot.")
-         return
-
-     # 5. Main plotting loop
-     total_plots_saved = 0
-
-     for target_name in valid_targets:
-         # Create a sanitized subdirectory for this target
-         safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Continuous")
-         target_save_dir = base_save_path / safe_target_dir_name
-         target_save_dir.mkdir(parents=True, exist_ok=True)
-
-         _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
-
-         for feature_name in valid_features:
-
-             # Drop NaNs pairwise for this specific plot
-             temp_df = df[[feature_name, target_name]].dropna()
-
-             if temp_df.empty:
-                 _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
-                 continue
-
-             x = temp_df[feature_name]
-             y = temp_df[target_name]
-
-             # 6. Perform linear fit
-             try:
-                 # Modern replacement for np.polyfit + np.poly1d. Compatible with NumPy 1.14+ and NumPy 2.0+
-                 p = np.polynomial.Polynomial.fit(x, y, deg=1)
-                 plot_regression_line = True
-             except (np.linalg.LinAlgError, ValueError):
-                 _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
-                 plot_regression_line = False
-
-             # 7. Create the plot
-             plt.figure(figsize=(10, 6))
-             ax = plt.gca()
-
-             # Plot the raw data points
-             ax.plot(x, y, 'o', alpha=0.5, label='Data points', markersize=5)
-
-             # Plot the regression line
-             if plot_regression_line:
-                 ax.plot(x, p(x), "r--", label='Linear Fit') # type: ignore
-
-             ax.set_title(f'{feature_name} vs {target_name}')
-             ax.set_xlabel(feature_name)
-             ax.set_ylabel(target_name)
-             ax.legend()
-             plt.grid(True, linestyle='--', alpha=0.6)
-             plt.tight_layout()
-
-             # 8. Save the plot
-             safe_feature_name = sanitize_filename(feature_name)
-             plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
-             plot_path = target_save_dir / plot_filename
-
-             try:
-                 plt.savefig(plot_path, bbox_inches="tight", format='svg')
-                 total_plots_saved += 1
-             except Exception as e:
-                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
-
-             # Close the figure to free up memory
-             plt.close()
-
-     _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
-
-
- def plot_categorical_vs_target(
-     df: pd.DataFrame,
-     targets: List[str],
-     save_dir: Union[str, Path],
-     features: Optional[List[str]] = None,
-     max_categories: int = 50,
-     fill_na_with: str = "MISSING DATA"
- ):
-     """
-     Plots each categorical feature against each numeric target using box plots.
-
-     This function is a core EDA step for regression tasks to understand the
-     relationship between a categorical independent variable and a continuous
-     dependent variable.
-
-     Plots are saved as individual .svg files in a structured way, with a subdirectory created for each target.
-
-     Args:
-         df (pd.DataFrame): The input DataFrame.
-         targets (List[str]): A list of numeric target column names (y-axis).
-         save_dir (str | Path): The base directory where plots will be saved. A subdirectory will be created here for each target.
-         features (List[str] | None): A list of categorical feature column names (x-axis). If None, all non-numeric (object) columns will be used.
-         max_categories (int): The maximum number of unique categories a feature can have to be plotted. Features exceeding this limit will be skipped.
-         fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category. Defaults to "Missing".
-
-     Notes:
-         - Only numeric targets are processed.
-         - Features are automatically identified as categorical if they are 'object' dtype.
-     """
-     # 1. Validate the base save directory and inputs
-     base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
-
-     # 2. Validate target columns (must be numeric)
-     valid_targets = []
-     for col in targets:
-         if col not in df.columns:
-             _LOGGER.warning(f"Target column '{col}' not found. Skipping.")
-         elif not is_numeric_dtype(df[col]):
-             _LOGGER.warning(f"Target column '{col}' is not numeric. Skipping.")
-         else:
-             valid_targets.append(col)
-
-     if not valid_targets:
-         _LOGGER.error("No valid numeric target columns provided to plot.")
-         return
-
-     # 3. Determine and validate feature columns
-     features_to_plot = []
-     if features is None:
-         _LOGGER.info("No 'features' list provided. Auto-detecting categorical features.")
-         for col in df.columns:
-             if col in valid_targets:
-                 continue
-             # Auto-include object dtypes
-             if is_object_dtype(df[col]):
-                 features_to_plot.append(col)
-
-     else:
-         # Validate user-provided list
-         for col in features:
-             if col not in df.columns:
-                 _LOGGER.warning(f"Feature column '{col}' not found in DataFrame. Skipping.")
-             else:
-                 features_to_plot.append(col)
-
-     if not features_to_plot:
-         _LOGGER.error("No valid categorical feature columns found to plot.")
-         return
-
-     # 4. Main plotting loop
-     total_plots_saved = 0
-
-     for target_name in valid_targets:
-         # Create a sanitized subdirectory for this target
-         safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
-         target_save_dir = base_save_path / safe_target_dir_name
-         target_save_dir.mkdir(parents=True, exist_ok=True)
-
-         _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
-         for feature_name in features_to_plot:
-
-             # Make a temporary copy for plotting to handle NaNs and dtypes
-             temp_df = df[[feature_name, target_name]].copy()
-
-             # Check cardinality
-             n_unique = temp_df[feature_name].nunique()
-             if n_unique > max_categories:
-                 _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique values > {max_categories} max_categories.")
-                 continue
-
-             # Handle NaNs by replacing them with the specified string
-             if temp_df[feature_name].isnull().any():
-                 # Convert to object type first to allow string replacement
-                 temp_df[feature_name] = temp_df[feature_name].astype(object).fillna(fill_na_with)
-
-             # Convert feature to string to ensure correct plotting order
-             temp_df[feature_name] = temp_df[feature_name].astype(str)
-
-             # 5. Create the plot
-             # Increase figure width for categories
-             plt.figure(figsize=(max(10, n_unique * 1.2), 10))
-
-             sns.boxplot(x=feature_name, y=target_name, data=temp_df)
-
-             plt.title(f'{target_name} vs {feature_name}')
-             plt.xlabel(feature_name)
-             plt.ylabel(target_name)
-             plt.xticks(rotation=45, ha='right')
-             plt.grid(True, linestyle='--', alpha=0.6, axis='y')
-             plt.tight_layout()
-
-             # 6. Save the plot
-             safe_feature_name = sanitize_filename(feature_name)
-             plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
-             plot_path = target_save_dir / plot_filename
-
-             try:
-                 plt.savefig(plot_path, bbox_inches="tight", format='svg')
-                 total_plots_saved += 1
-             except Exception as e:
-                 _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
-
-             plt.close()
-
-     _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
-
-
- def encode_categorical_features(
-     df: pd.DataFrame,
-     columns_to_encode: List[str],
-     encode_nulls: bool,
-     null_label: str = "Other",
-     split_resulting_dataset: bool = True,
-     verbose: bool = True
- ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
-     """
-     Finds unique values in specified categorical columns, encodes them into integers,
-     and returns a dictionary containing the mappings for each column.
-
-     This function automates the label encoding process and generates a simple,
-     human-readable dictionary of the mappings.
-
-     Args:
-         df (pd.DataFrame): The input DataFrame.
-         columns_to_encode (List[str]): A list of column names to be encoded.
-         encode_nulls (bool):
-             - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
-             - If False, Nulls are ignored and categories start from 0.
-
-         null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
-         split_resulting_dataset (bool):
-             - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
-             - If False, returns a single DataFrame with all columns.
-         verbose (bool): If True, prints encoding progress.
-
-     Returns:
-         Tuple:
-
-         - Dict[str, Dict[str, int]]: A dictionary where each key is a column name and the value is its category-to-integer mapping.
-
-         - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
-
-         - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
-
-     ## **Important:**
-     1. Do not encode 'Ordinal Features' (e.g., Low=1, Med=2, High=3), these must be treated as numerical (continuous).
-     2. Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
-     """
-     df_encoded = df.copy()
-
-     # Validate columns
-     valid_columns = [col for col in columns_to_encode if col in df_encoded.columns]
-     missing_columns = set(columns_to_encode) - set(valid_columns)
-     if missing_columns:
-         _LOGGER.warning(f"Columns not found and will be skipped: {list(missing_columns)}")
-
-     mappings: Dict[str, Dict[str, int]] = {}
-
-     _LOGGER.info(f"Encoding {len(valid_columns)} categorical column(s).")
-     for col_name in valid_columns:
-         has_nulls = df_encoded[col_name].isnull().any()
-
-         # Get unique values once to check cardinality and generate categories
-         raw_unique_values = df_encoded[col_name].dropna().unique()
-
-         # --- Check for constant columns ---
-         if len(raw_unique_values) <= 1:
-             # Exception: If we are encoding nulls and nulls exist, this is effectively a binary feature (Null vs Value)
-             is_effectively_binary = encode_nulls and has_nulls
-
-             if not is_effectively_binary:
-                 _LOGGER.warning(f"Column '{col_name}' has only {len(raw_unique_values)} unique value(s). Consider dropping it before encoding as it offers no predictive variance.")
-
-         # Prepare categories (sorted string representation)
-         categories = sorted([str(cat) for cat in raw_unique_values])
-
-         if encode_nulls and has_nulls:
-             # Handle nulls: "Other" -> 0, other categories -> 1, 2, 3...
-             # Start mapping from 1 for non-null values
-             mapping = {category: i + 1 for i, category in enumerate(categories)}
-
-             # Apply mapping and fill remaining NaNs with 0
-             mapped_series = df_encoded[col_name].astype(str).map(mapping)
-             df_encoded[col_name] = mapped_series.fillna(0).astype(int)
-
-             # --- Validate nulls category---
-             # Ensure the key for 0 doesn't collide with a real category.
-             if null_label in mapping.keys():
-                 # COLLISION! null_label is a real category
-                 original_label = null_label
-                 null_label = "__NULL__" # fallback
-                 _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
-
-             # Create the complete user-facing map including "Other"
-             user_mapping = {**mapping, null_label: 0}
-             mappings[col_name] = user_mapping
-         else:
-             # ignore nulls: categories start from 0
-             mapping = {category: i for i, category in enumerate(categories)}
-
-             df_encoded[col_name] = df_encoded[col_name].astype(str).map(mapping)
-
-             mappings[col_name] = mapping
-
-         if verbose:
-             cardinality = len(mappings[col_name])
-             print(f" - Encoded '{col_name}' with {cardinality} unique values.")
-
-     # Handle the dataset splitting logic
-     if split_resulting_dataset:
-         df_categorical = df_encoded[valid_columns]
-         df_non_categorical = df.drop(columns=valid_columns)
-         return mappings, df_non_categorical, df_categorical
-     else:
-         return mappings, df_encoded, None
-
-
- def split_features_targets(df: pd.DataFrame, targets: list[str]):
-     """
-     Splits a DataFrame's columns into features and targets.
-
-     Args:
-         df (pd.DataFrame): Pandas DataFrame containing the dataset.
-         targets (list[str]): List of column names to be treated as target variables.
-
-     Returns:
-         tuple: A tuple containing:
-             - pd.DataFrame: Features dataframe.
-             - pd.DataFrame: Targets dataframe.
-
-     Prints:
-         - Shape of the original dataframe.
-         - Shape of the features dataframe.
-         - Shape of the targets dataframe.
-     """
-     valid_targets = _validate_columns(df, targets)
-     df_targets = df[valid_targets]
-     df_features = df.drop(columns=valid_targets)
-     print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
-     return df_features, df_targets
-
-
- def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
-     """
-     Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
-     Normalize binary values like 0.0/1.0 to 0/1 if detected.
-
-     Parameters:
-         df (pd.DataFrame): Input DataFrame with only numeric columns.
-
-     Returns:
-         Tuple(pd.DataFrame, pd.DataFrame): (continuous_columns_df, binary_columns_df)
-
-     Raises:
-         TypeError: If any column is not numeric.
-     """
-     if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
-         _LOGGER.error("All columns must be numeric (int or float).")
-         raise TypeError()
-
-     binary_cols = []
-     continuous_cols = []
-
-     for col in df.columns:
-         series = df[col]
-         unique_values = set(series[~series.isna()].unique())
-
-         if unique_values.issubset({0, 1}):
-             binary_cols.append(col)
-         elif unique_values.issubset({0.0, 1.0}):
-             df[col] = df[col].apply(lambda x: 0 if x == 0.0 else (1 if x == 1.0 else x))
-             binary_cols.append(col)
-         else:
-             continuous_cols.append(col)
-
-     binary_cols.sort()
-
-     df_cont = df[continuous_cols]
-     df_bin = df[binary_cols]
-
-     print(f"Continuous columns shape: {df_cont.shape}")
-     print(f"Binary columns shape: {df_bin.shape}")
-
-     return df_cont, df_bin # type: ignore
-
-
- def plot_correlation_heatmap(df: pd.DataFrame,
-                              plot_title: str,
-                              save_dir: Union[str, Path, None] = None,
-                              method: Literal["pearson", "kendall", "spearman"]="pearson"):
-     """
-     Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
-
-     Args:
-         df (pd.DataFrame): The input dataset.
-         save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
-         plot_title: The suffix "`method` Correlation Heatmap" will be automatically appended.
-         method (str): Correlation method to use. Must be one of:
-             - 'pearson' (default): measures linear correlation (assumes normally distributed data),
-             - 'kendall': rank correlation (non-parametric),
-             - 'spearman': monotonic relationship (non-parametric).
-
-     Notes:
-         - Only numeric columns are included.
-         - Annotations are disabled if there are more than 20 features.
-         - Missing values are handled via pairwise complete observations.
-     """
-     numeric_df = df.select_dtypes(include='number')
-     if numeric_df.empty:
-         _LOGGER.warning("No numeric columns found. Heatmap not generated.")
-         return
-     if method not in ["pearson", "kendall", "spearman"]:
-         _LOGGER.error(f"'method' must be pearson, kendall, or spearman.")
-         raise ValueError()
-
-     corr = numeric_df.corr(method=method)
-
-     # Create a mask for the upper triangle
-     mask = np.triu(np.ones_like(corr, dtype=bool))
-
-     # Plot setup
-     size = max(10, numeric_df.shape[1])
-     plt.figure(figsize=(size, size * 0.8))
-
-     annot_bool = numeric_df.shape[1] <= 20
-     sns.heatmap(
-         corr,
-         mask=mask,
-         annot=annot_bool,
-         cmap='coolwarm',
-         fmt=".2f",
-         cbar_kws={"shrink": 0.8}
-     )
-
-     # add suffix to title
-     full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"
-
-     plt.title(full_plot_title)
-     plt.xticks(rotation=45, ha='right')
-     plt.yticks(rotation=0)
-
-     plt.tight_layout()
-
-     if save_dir:
-         save_path = make_fullpath(save_dir, make=True)
-         # sanitize the plot title to save the file
-         sanitized_plot_title = sanitize_filename(plot_title)
-         plot_filename = sanitized_plot_title + ".svg"
-
-         full_path = save_path / plot_filename
-
-         plt.savefig(full_path, bbox_inches="tight", format='svg')
-         _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")
-
-     plt.show()
-     plt.close()
-
-
- def clip_outliers_single(
1109
- df: pd.DataFrame,
1110
- column: str,
1111
- min_val: float,
1112
- max_val: float
1113
- ) -> Union[pd.DataFrame, None]:
1114
- """
1115
- Clips values in the specified numeric column to the range [min_val, max_val],
1116
- and returns a new DataFrame where the original column is replaced by the clipped version.
1117
-
1118
- Args:
1119
- df (pd.DataFrame): The input DataFrame.
1120
- column (str): The name of the column to clip.
1121
- min_val (float): Minimum allowable value; values below are clipped to this.
1122
- max_val (float): Maximum allowable value; values above are clipped to this.
1123
-
1124
- Returns:
1125
- pd.DataFrame: A new DataFrame with the specified column clipped in place.
1126
-
1127
- None: if a problem with the dataframe column occurred.
1128
- """
1129
- if column not in df.columns:
1130
- _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
1131
- return None
1132
-
1133
- if not pd.api.types.is_numeric_dtype(df[column]):
1134
- _LOGGER.warning(f"Column '{column}' must be numeric.")
1135
- return None
1136
-
1137
- new_df = df.copy(deep=True)
1138
- new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)
1139
-
1140
- _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
1141
- return new_df
1142
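# A minimal usage sketch for clip_outliers_single (hypothetical toy data), assuming
# the function defined above is in scope:
import pandas as pd

toy = pd.DataFrame({"age": [25, -3, 150, 40]})
clipped = clip_outliers_single(toy, column="age", min_val=0, max_val=120)
# clipped["age"] -> [25, 0, 120, 40]; the original `toy` is left untouched.
# The function returns None (after logging a warning) if the column is missing or non-numeric.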
-
1143
-
1144
- def clip_outliers_multi(
1145
- df: pd.DataFrame,
1146
- clip_dict: Union[Dict[str, Tuple[int, int]], Dict[str, Tuple[float, float]]],
1147
- verbose: bool=False
1148
- ) -> pd.DataFrame:
1149
- """
1150
- Clips values in multiple specified numeric columns to given [min, max] ranges,
1151
- updating values (deep copy) and skipping invalid entries.
1152
-
1153
- Args:
1154
- df (pd.DataFrame): The input DataFrame.
1155
- clip_dict (dict): A dictionary where keys are column names and values are (min_val, max_val) tuples.
1156
-         verbose (bool): If True, prints the clipped range for each column.
1157
-
1158
- Returns:
1159
- pd.DataFrame: A new DataFrame with specified columns clipped.
1160
-
1161
- Notes:
1162
- - Invalid specifications (missing column, non-numeric type, wrong tuple length)
1163
- will be reported but skipped.
1164
- """
1165
- new_df = df.copy()
1166
- skipped_columns = []
1167
- clipped_columns = 0
1168
-
1169
- for col, bounds in clip_dict.items():
1170
- try:
1171
- if col not in df.columns:
1172
- _LOGGER.error(f"Column '{col}' not found in DataFrame.")
1173
- raise ValueError()
1174
-
1175
- if not pd.api.types.is_numeric_dtype(df[col]):
1176
- _LOGGER.error(f"Column '{col}' is not numeric.")
1177
- raise TypeError()
1178
-
1179
- if not (isinstance(bounds, tuple) and len(bounds) == 2):
1180
- _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
1181
- raise ValueError()
1182
-
1183
- min_val, max_val = bounds
1184
- new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
1185
- if verbose:
1186
- print(f"Clipped '{col}' to range [{min_val}, {max_val}].")
1187
- clipped_columns += 1
1188
-
1189
- except Exception as e:
1190
- skipped_columns.append((col, str(e)))
1191
- continue
1192
-
1193
- _LOGGER.info(f"Clipped {clipped_columns} columns.")
1194
-
1195
- if skipped_columns:
1196
- _LOGGER.warning("Skipped columns:")
1197
- for col, msg in skipped_columns:
1198
- print(f" - {col}")
1199
-
1200
- return new_df
1201
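# A minimal usage sketch for clip_outliers_multi (hypothetical toy data). Invalid
# entries in `clip_dict` are reported and skipped rather than raising:
import pandas as pd

toy = pd.DataFrame({"age": [25, -3, 150], "score": [0.5, 1.7, -0.2], "name": ["a", "b", "c"]})
result = clip_outliers_multi(
    toy,
    clip_dict={"age": (0, 120), "score": (0.0, 1.0), "name": (0, 1)},  # "name" is non-numeric -> skipped
    verbose=True,
)
# result["age"] -> [25, 0, 120]; result["score"] -> [0.5, 1.0, 0.0]; "name" is unchanged.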
-
1202
-
1203
- def drop_outlier_samples(
1204
- df: pd.DataFrame,
1205
- bounds_dict: Dict[str, Tuple[Union[int, float], Union[int, float]]],
1206
- drop_on_nulls: bool = False,
1207
- verbose: bool = True
1208
- ) -> pd.DataFrame:
1209
- """
1210
- Drops entire rows where values in specified numeric columns fall outside
1211
- a given [min, max] range.
1212
-
1213
- This function processes a copy of the DataFrame, ensuring the original is
1214
- not modified. It skips columns with invalid specifications.
1215
-
1216
- Args:
1217
- df (pd.DataFrame): The input DataFrame.
1218
- bounds_dict (dict): A dictionary where keys are column names and values
1219
- are (min_val, max_val) tuples defining the valid range.
1220
- drop_on_nulls (bool): If True, rows with NaN/None in a checked column
1221
- will also be dropped. If False, NaN/None are ignored.
1222
- verbose (bool): If True, prints the number of rows dropped for each column.
1223
-
1224
- Returns:
1225
- pd.DataFrame: A new DataFrame with the outlier rows removed.
1226
-
1227
- Notes:
1228
- - Invalid specifications (e.g., missing column, non-numeric type,
1229
- incorrectly formatted bounds) will be reported and skipped.
1230
- """
1231
- new_df = df.copy()
1232
- skipped_columns: List[Tuple[str, str]] = []
1233
- initial_rows = len(new_df)
1234
-
1235
- for col, bounds in bounds_dict.items():
1236
- try:
1237
- # --- Validation Checks ---
1238
- if col not in df.columns:
1239
- _LOGGER.error(f"Column '{col}' not found in DataFrame.")
1240
- raise ValueError()
1241
-
1242
- if not pd.api.types.is_numeric_dtype(df[col]):
1243
- _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
1244
- raise TypeError()
1245
-
1246
- if not (isinstance(bounds, tuple) and len(bounds) == 2):
1247
- _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
1248
- raise ValueError()
1249
-
1250
- # --- Filtering Logic ---
1251
- min_val, max_val = bounds
1252
- rows_before_drop = len(new_df)
1253
-
1254
- # Create the base mask for values within the specified range
1255
- # .between() is inclusive and evaluates to False for NaN
1256
- mask_in_bounds = new_df[col].between(min_val, max_val)
1257
-
1258
- if drop_on_nulls:
1259
- # Keep only rows that are within bounds.
1260
- # Since mask_in_bounds is False for NaN, nulls are dropped.
1261
- final_mask = mask_in_bounds
1262
- else:
1263
- # Keep rows that are within bounds OR are null.
1264
- mask_is_null = new_df[col].isnull()
1265
- final_mask = mask_in_bounds | mask_is_null
1266
-
1267
- # Apply the final mask
1268
- new_df = new_df[final_mask]
1269
-
1270
- rows_after_drop = len(new_df)
1271
-
1272
- if verbose:
1273
- dropped_count = rows_before_drop - rows_after_drop
1274
- if dropped_count > 0:
1275
- print(
1276
- f" - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
1277
- )
1278
-
1279
- except (ValueError, TypeError) as e:
1280
- skipped_columns.append((col, str(e)))
1281
- continue
1282
-
1283
- total_dropped = initial_rows - len(new_df)
1284
- _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")
1285
-
1286
- if skipped_columns:
1287
- _LOGGER.warning("Skipped the following columns due to errors:")
1288
- for col, msg in skipped_columns:
1289
- # Only print the column name for cleaner output as the error was already logged
1290
- print(f" - {col}")
1291
-
1292
- # if new_df is a series, convert to dataframe
1293
- if isinstance(new_df, pd.Series):
1294
- new_df = new_df.to_frame()
1295
-
1296
- return new_df
1297
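# A minimal usage sketch for drop_outlier_samples (hypothetical toy data) showing
# how `drop_on_nulls` changes the treatment of missing values:
import numpy as np
import pandas as pd

toy = pd.DataFrame({"temp": [21.0, 95.0, np.nan, 23.5]})
kept = drop_outlier_samples(toy, bounds_dict={"temp": (0, 50)})
# -> the 95.0 row is dropped; the NaN row is kept because nulls are ignored by default.
strict = drop_outlier_samples(toy, bounds_dict={"temp": (0, 50)}, drop_on_nulls=True)
# -> the NaN row is dropped as well, leaving only the 21.0 and 23.5 rows.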
-
1298
-
1299
- def match_and_filter_columns_by_regex(
1300
- df: pd.DataFrame,
1301
- pattern: str,
1302
- case_sensitive: bool = False,
1303
- escape_pattern: bool = False
1304
- ) -> Tuple[pd.DataFrame, List[str]]:
1305
- """
1306
- Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
1307
-
1308
- Parameters:
1309
- df (pd.DataFrame): The DataFrame to search.
1310
- pattern (str): The regex pattern to match column names (use a raw string).
1311
- case_sensitive (bool): Whether matching is case-sensitive.
1312
- escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
1313
-
1314
- Returns:
1315
- (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
1316
- """
1317
- if escape_pattern:
1318
- pattern = re.escape(pattern)
1319
-
1320
- mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
1321
- matched_columns = df.columns[mask].to_list()
1322
- filtered_df = df.loc[:, mask]
1323
-
1324
- _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")
1325
-
1326
- # if filtered df is a series, convert to dataframe
1327
- if isinstance(filtered_df, pd.Series):
1328
- filtered_df = filtered_df.to_frame()
1329
-
1330
- return filtered_df, matched_columns
1331
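# A minimal usage sketch for match_and_filter_columns_by_regex (hypothetical toy data):
import pandas as pd

toy = pd.DataFrame({"Temp_01": [1], "Temp_02": [2], "Pressure": [3]})
subset, matched = match_and_filter_columns_by_regex(toy, pattern=r"^temp_\d+$")
# matched -> ['Temp_01', 'Temp_02'] (matching is case-insensitive by default);
# subset is the same DataFrame restricted to those two columns.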
-
1332
-
1333
- def standardize_percentages(
1334
- df: pd.DataFrame,
1335
- columns: list[str],
1336
- treat_one_as_proportion: bool = True,
1337
- round_digits: int = 2,
1338
- verbose: bool=True
1339
- ) -> pd.DataFrame:
1340
- """
1341
- Standardizes numeric columns containing mixed-format percentages.
1342
-
1343
- This function cleans columns where percentages might be entered as whole
1344
-     numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
1345
- between 0 and 1 are proportions and multiplies them by 100.
1346
-
1347
- Args:
1348
-         df (pd.DataFrame): The input pandas DataFrame.
1349
- columns (list[str]): A list of column names to standardize.
1350
- treat_one_as_proportion (bool):
1351
- - If True (default): The value `1` is treated as a proportion and converted to `100%`.
1352
- - If False: The value `1` is treated as `1%`.
1353
-         round_digits (int): The number of decimal places to round the final result to.
-         verbose (bool): If True, logs which columns were standardized.
1354
-
1355
- Returns:
1356
-         (pd.DataFrame):
1357
- A new DataFrame with the specified columns cleaned and standardized.
1358
- """
1359
- df_copy = df.copy()
1360
-
1361
- if df_copy.empty:
1362
- return df_copy
1363
-
1364
- # This helper function contains the core cleaning logic
1365
- def _clean_value(x: float) -> float:
1366
- """Applies the standardization rule to a single value."""
1367
- if pd.isna(x):
1368
- return x
1369
-
1370
- # If treat_one_as_proportion is True, the range for proportions is [0, 1]
1371
- if treat_one_as_proportion and 0 <= x <= 1:
1372
- return x * 100
1373
- # If False, the range for proportions is [0, 1) (1 is excluded)
1374
- elif not treat_one_as_proportion and 0 <= x < 1:
1375
- return x * 100
1376
-
1377
- # Otherwise, the value is assumed to be a correctly formatted percentage
1378
- return x
1379
-
1380
- fixed_columns: list[str] = list()
1381
-
1382
- for col in columns:
1383
- # --- Robustness Checks ---
1384
- if col not in df_copy.columns:
1385
- _LOGGER.warning(f"Column '{col}' not found. Skipping.")
1386
- continue
1387
-
1388
- if not is_numeric_dtype(df_copy[col]):
1389
- _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
1390
- continue
1391
-
1392
- # --- Applying the Logic ---
1393
- # Apply the cleaning function to every value in the column
1394
- df_copy[col] = df_copy[col].apply(_clean_value)
1395
-
1396
- # Round the result
1397
- df_copy[col] = df_copy[col].round(round_digits)
1398
-
1399
- fixed_columns.append(col)
1400
-
1401
- if verbose:
1402
-         _LOGGER.info("Columns standardized:")
1403
- for fixed_col in fixed_columns:
1404
- print(f" '{fixed_col}'")
1405
-
1406
- return df_copy
1407
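# A minimal usage sketch for standardize_percentages (hypothetical toy data) showing
# how `treat_one_as_proportion` decides what the literal value 1 means:
import pandas as pd

toy = pd.DataFrame({"pct": [55.0, 0.55, 1.0, 99.9]})
as_proportion = standardize_percentages(toy, columns=["pct"])
# -> [55.0, 55.0, 100.0, 99.9]  (1 is read as a proportion and becomes 100)
as_one_percent = standardize_percentages(toy, columns=["pct"], treat_one_as_proportion=False)
# -> [55.0, 55.0, 1.0, 99.9]    (1 is read as an already-formatted 1%)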
-
1408
-
1409
- def reconstruct_one_hot(
1410
- df: pd.DataFrame,
1411
- features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
1412
- separator: str = '_',
1413
- baseline_category_name: Optional[str] = "Other",
1414
- drop_original: bool = True,
1415
- verbose: bool = True
1416
- ) -> pd.DataFrame:
1417
- """
1418
- Reconstructs original categorical columns from a one-hot encoded DataFrame.
1419
-
1420
- This function identifies groups of one-hot encoded columns based on a common
1421
- prefix (base feature name) and a separator. It then collapses each group
1422
- into a single column containing the categorical value.
1423
-
1424
- Args:
1425
- df (pd.DataFrame):
1426
- The input DataFrame with one-hot encoded columns.
1427
- features_to_reconstruct (List[str | Tuple[str, str | None]]):
1428
- A list defining the features to reconstruct. This list can contain:
1429
-
1430
- - A string: (e.g., "Color")
1431
- This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
1432
- - A tuple: (e.g., ("Pet", "Dog"))
1433
- This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
1434
- - A tuple with None: (e.g., ("Size", None))
1435
-             This reconstructs 'Size' and maps all-zero rows to NaN.
1436
- Example:
1437
- [
1438
- "Mood", # All-zeros -> "Other"
1439
- ("Color", "Red"), # All-zeros -> "Red"
1440
- ("Size", None) # All-zeros -> NaN
1441
- ]
1442
- separator (str):
1443
- The character separating the base name from the categorical value in
1444
- the column names (e.g., '_' in 'B_a').
1445
- baseline_category_name (str | None):
1446
-             The default baseline category assigned to all-zero rows when an entry does not specify one.
1447
- drop_original (bool):
1448
- If True, the original one-hot encoded columns will be dropped from
1449
- the returned DataFrame.
1450
-
1451
- Returns:
1452
- pd.DataFrame:
1453
- A new DataFrame with the specified one-hot encoded features
1454
- reconstructed into single categorical columns.
1455
-
1456
- <br>
1457
-
1458
- ## Note:
1459
-
1460
- This function is designed to be robust, but users should be aware of two key edge cases:
1461
-
1462
-     1. **Ambiguous Base Feature Prefixes**: If the base names given in `features_to_reconstruct` contain names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. The function will match columns greedily. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).
1463
-
1464
- 2. **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
1465
- """
1466
- if not isinstance(df, pd.DataFrame):
1467
- _LOGGER.error("Input must be a pandas DataFrame.")
1468
- raise TypeError()
1469
-
1470
- if not (baseline_category_name is None or isinstance(baseline_category_name, str)):
1471
- _LOGGER.error("The baseline_category must be None or a string.")
1472
- raise TypeError()
1473
-
1474
- new_df = df.copy()
1475
- all_ohe_cols_to_drop = []
1476
- reconstructed_count = 0
1477
-
1478
- # --- 1. Parse and validate the reconstruction config ---
1479
- # This normalizes the input into a clean {base_name: baseline_val} dict
1480
- reconstruction_config: Dict[str, Optional[str]] = {}
1481
- try:
1482
- for item in features_to_reconstruct:
1483
- if isinstance(item, str):
1484
- # Case 1: "Color"
1485
- base_name = item
1486
- baseline_val = baseline_category_name
1487
- elif isinstance(item, tuple) and len(item) == 2:
1488
- # Case 2: ("Pet", "dog") or ("Size", None)
1489
- base_name, baseline_val = item
1490
- if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
1491
- _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
1492
- raise ValueError()
1493
- else:
1494
- _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
1495
- raise ValueError()
1496
-
1497
- if base_name in reconstruction_config and verbose:
1498
- _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
1499
-
1500
- reconstruction_config[base_name] = baseline_val
1501
-
1502
- except Exception as e:
1503
- _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
1504
- raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
1505
-
1506
- _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
1507
-
1508
- # Main logic
1509
- for base_name, baseline_category in reconstruction_config.items():
1510
- # Regex to find all columns belonging to this base feature.
1511
- pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
1512
-
1513
- # Find matching columns
1514
- ohe_cols = [col for col in df.columns if re.match(pattern, col)]
1515
-
1516
- if not ohe_cols:
1517
- _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
1518
- continue
1519
-
1520
- # For each row, find the column name with the maximum value (which is 1)
1521
- reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
1522
-
1523
- # Extract the categorical value (the suffix) from the column name
1524
- # Use n=1 in split to handle cases where the category itself might contain the separator
1525
- new_column_values = reconstructed_series.str.split(separator, n=1).str[1] # type: ignore
1526
-
1527
- # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
1528
- all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
1529
-
1530
- if baseline_category is not None:
1531
- # A baseline category was provided
1532
- new_column_values.loc[all_zero_mask] = baseline_category
1533
- else:
1534
- # No baseline provided: assign NaN
1535
- new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1536
-
1537
- if verbose:
1538
-             label = baseline_category if baseline_category is not None else "NaN"
-             print(f" - Mapped 'all-zero' rows for '{base_name}' to: '{label}'.")
1539
-
1540
- # Assign the new reconstructed column to the DataFrame
1541
- new_df[base_name] = new_column_values
1542
-
1543
- all_ohe_cols_to_drop.extend(ohe_cols)
1544
- reconstructed_count += 1
1545
- if verbose:
1546
- print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1547
-
1548
- # Cleanup
1549
- if drop_original and all_ohe_cols_to_drop:
1550
- # Drop the original OHE columns, ensuring no duplicates in the drop list
1551
- unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
1552
- new_df.drop(columns=unique_cols_to_drop, inplace=True)
1553
- _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
1554
-
1555
- _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
1556
-
1557
- return new_df
1558
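# A minimal usage sketch for reconstruct_one_hot (hypothetical toy one-hot data) using
# the tuple forms of `features_to_reconstruct`; a plain string entry would instead map
# all-zero rows to the default baseline ("Other"):
import pandas as pd

toy = pd.DataFrame({
    "Color_Red":  [1, 0, 0],
    "Color_Blue": [0, 1, 0],   # the third row is all zeros for "Color"
    "Size_L":     [0, 1, 0],
})
result = reconstruct_one_hot(
    toy,
    features_to_reconstruct=[
        ("Color", "Green"),    # all-zero rows -> "Green"
        ("Size", None),        # all-zero rows -> NaN
    ],
)
# result["Color"] -> ["Red", "Blue", "Green"]; result["Size"] -> [NaN, "L", NaN];
# the original one-hot columns are dropped because drop_original defaults to True.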
-
1559
-
1560
- def reconstruct_binary(
1561
- df: pd.DataFrame,
1562
- reconstruction_map: Dict[str, Tuple[str, Any, Any]],
1563
- drop_original: bool = True,
1564
- verbose: bool = True
1565
- ) -> pd.DataFrame:
1566
- """
1567
- Reconstructs new categorical columns from existing binary (0/1) columns.
1568
-
1569
- Used to reverse a binary encoding by mapping 0 and 1 back to
1570
- descriptive categorical labels.
1571
-
1572
- Args:
1573
- df (pd.DataFrame):
1574
- The input DataFrame.
1575
- reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
1576
- A dictionary defining the reconstructions.
1577
- Format:
1578
- { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
1579
- Example:
1580
- {
1581
- "Sex": ("Sex_male", "Female", "Male"),
1582
- "Smoker": ("Is_Smoker", "No", "Yes")
1583
- }
1584
- drop_original (bool):
1585
- If True, the original binary source columns (e.g., "Sex_male")
1586
- will be dropped from the returned DataFrame.
1587
- verbose (bool):
1588
- If True, prints the details of each reconstruction.
1589
-
1590
- Returns:
1591
- pd.DataFrame:
1592
- A new DataFrame with the reconstructed categorical columns.
1593
-
1594
- Raises:
1595
- TypeError: If `df` is not a pandas DataFrame.
1596
- ValueError: If `reconstruction_map` is not a dictionary or a
1597
-             configuration is invalid (e.g., a malformed tuple or a missing source column).
1598
-
1599
- Notes:
1600
- - The function operates on a copy of the DataFrame.
1601
- - Rows with `NaN` in the source column will have `NaN` in the
1602
- new column.
1603
- - Values in the source column other than 0 or 1 (e.g., 2) will
1604
- result in `NaN` in the new column.
1605
- """
1606
- if not isinstance(df, pd.DataFrame):
1607
- _LOGGER.error("Input must be a pandas DataFrame.")
1608
- raise TypeError()
1609
-
1610
- if not isinstance(reconstruction_map, dict):
1611
- _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
1612
- raise ValueError()
1613
-
1614
- new_df = df.copy()
1615
- source_cols_to_drop: List[str] = []
1616
- reconstructed_count = 0
1617
-
1618
- _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")
1619
-
1620
- for new_col_name, config in reconstruction_map.items():
1621
-
1622
- # --- 1. Validation ---
1623
- if not (isinstance(config, tuple) and len(config) == 3):
1624
-             _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple.")
1625
- raise ValueError()
1626
-
1627
- source_col, label_for_0, label_for_1 = config
1628
-
1629
- if source_col not in new_df.columns:
1630
-             _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found.")
1631
- raise ValueError()
1632
-
1633
- if new_col_name in new_df.columns and new_col_name != source_col and verbose:
1634
- _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")
1635
-
1636
- # --- 2. Reconstruction ---
1637
- mapping_dict = {0: label_for_0, 1: label_for_1}
1638
- new_df[new_col_name] = new_df[source_col].map(mapping_dict)
1639
-
1640
- # --- 3. Logging/Tracking ---
1641
- # Only mark source for dropping if it's NOT the same as the new column
1642
- if source_col != new_col_name:
1643
- source_cols_to_drop.append(source_col)
1644
-
1645
- reconstructed_count += 1
1646
- if verbose:
1647
- print(f" - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")
1648
-
1649
- # --- 4. Cleanup ---
1650
- if drop_original and source_cols_to_drop:
1651
- unique_cols_to_drop = list(set(source_cols_to_drop))
1652
- new_df.drop(columns=unique_cols_to_drop, inplace=True)
1653
- _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")
1654
-
1655
- _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
1656
-
1657
- return new_df
1658
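# A minimal usage sketch for reconstruct_binary (hypothetical toy data):
import pandas as pd

toy = pd.DataFrame({"Sex_male": [1, 0, 1], "Is_Smoker": [0, 1, 0]})
result = reconstruct_binary(
    toy,
    reconstruction_map={
        "Sex":    ("Sex_male", "Female", "Male"),   # 0 -> "Female", 1 -> "Male"
        "Smoker": ("Is_Smoker", "No", "Yes"),
    },
)
# result["Sex"] -> ["Male", "Female", "Male"]; result["Smoker"] -> ["No", "Yes", "No"];
# the binary source columns are dropped because drop_original defaults to True.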
-
1659
-
1660
- def reconstruct_multibinary(
1661
- df: pd.DataFrame,
1662
- pattern: str,
1663
- pos_label: str = "Yes",
1664
- neg_label: str = "No",
1665
- case_sensitive: bool = False,
1666
- verbose: bool = True
1667
- ) -> Tuple[pd.DataFrame, List[str]]:
1668
- """
1669
- Identifies binary columns matching a regex pattern and converts their numeric
1670
- values (0/1) into categorical string labels (e.g., "No"/"Yes").
1671
-
1672
- This allows mass-labeling of binary features so they are treated as proper
1673
- categorical variables with meaningful keys during subsequent encoding steps.
1674
-
1675
- Args:
1676
- df (pd.DataFrame): The input DataFrame.
1677
- pattern (str): Regex pattern to identify the group of binary columns.
1678
- pos_label (str): The label to assign to 1 or True (default "Yes").
1679
- neg_label (str): The label to assign to 0 or False (default "No").
1680
- case_sensitive (bool): If True, regex matching is case-sensitive.
1681
- verbose (bool): If True, prints a summary of the operation.
1682
-
1683
- Returns:
1684
-         Tuple[pd.DataFrame, List[str]]:
1685
-             - A new DataFrame with the matched columns converted to strings.
1686
- - A list of the column names that were modified.
1687
- """
1688
- if not isinstance(df, pd.DataFrame):
1689
- _LOGGER.error("Input must be a pandas DataFrame.")
1690
- raise TypeError()
1691
-
1692
- new_df = df.copy()
1693
-
1694
- # 1. Find columns matching the regex
1695
- mask = new_df.columns.str.contains(pattern, case=case_sensitive, regex=True)
1696
- target_columns = new_df.columns[mask].to_list()
1697
-
1698
- if not target_columns:
1699
- _LOGGER.warning(f"No columns found matching pattern '{pattern}'. Returning original DataFrame.")
1700
- return new_df, list()
1701
-
1702
- # 2. Define robust mapping (handles ints, floats, and booleans)
1703
- # Note: Any value not in this map will become NaN
1704
- mapping_dict = {
1705
- 0: neg_label,
1706
- 0.0: neg_label,
1707
- False: neg_label,
1708
- 1: pos_label,
1709
- 1.0: pos_label,
1710
- True: pos_label
1711
- }
1712
-
1713
- converted_count = 0
1714
-
1715
- # 3. Apply mapping
1716
- for col in target_columns:
1717
-         # Only map numeric, boolean, or object columns; values not covered by the mapping become NaN
1718
- if is_numeric_dtype(new_df[col]) or is_object_dtype(new_df[col]):
1719
- # We cast to object implicitly by mapping to strings
1720
- new_df[col] = new_df[col].map(mapping_dict)
1721
- converted_count += 1
1722
-
1723
- if verbose:
1724
- _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
1725
-
1726
- return new_df, target_columns
1727
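# A minimal usage sketch for reconstruct_multibinary (hypothetical toy data) where every
# column whose name starts with "flag_" is relabeled in a single call:
import pandas as pd

toy = pd.DataFrame({"flag_smoker": [1, 0], "flag_diabetic": [0, 1], "age": [40, 55]})
result, converted = reconstruct_multibinary(toy, pattern=r"^flag_")
# converted -> ['flag_smoker', 'flag_diabetic']; their 0/1 values become "No"/"Yes",
# while "age" is untouched because it does not match the pattern.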
-
1728
-
1729
- def finalize_feature_schema(
1730
- df_features: pd.DataFrame,
1731
- categorical_mappings: Optional[Dict[str, Dict[str, int]]]
1732
- ) -> FeatureSchema:
1733
- """
1734
- Analyzes the final features DataFrame to create a definitive schema.
1735
-
1736
- This function is the "single source of truth" for column order
1737
- and type (categorical vs. continuous) for the entire ML pipeline.
1738
-
1739
- It should be called at the end of the feature engineering process.
1740
-
1741
- Args:
1742
- df_features (pd.DataFrame):
1743
- The final, processed DataFrame containing *only* feature columns
1744
- in the exact order they will be fed to the model.
1745
- categorical_mappings (Dict[str, Dict[str, int]] | None):
1746
- The mappings dictionary generated by
1747
- `encode_categorical_features`. Can be None if no
1748
- categorical features exist.
1749
-
1750
- Returns:
1751
- FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
1752
- """
1753
- feature_names: List[str] = df_features.columns.to_list()
1754
-
1755
- # Intermediate lists for building
1756
- continuous_feature_names_list: List[str] = []
1757
- categorical_feature_names_list: List[str] = []
1758
- categorical_index_map_dict: Dict[int, int] = {}
1759
-
1760
- # _LOGGER.info("Finalizing feature schema...")
1761
-
1762
- if categorical_mappings:
1763
- # --- Categorical features are present ---
1764
- categorical_names_set = set(categorical_mappings.keys())
1765
-
1766
- for index, name in enumerate(feature_names):
1767
- if name in categorical_names_set:
1768
- # This is a categorical feature
1769
- cardinality = len(categorical_mappings[name])
1770
- categorical_index_map_dict[index] = cardinality
1771
- categorical_feature_names_list.append(name)
1772
- else:
1773
- # This is a continuous feature
1774
- continuous_feature_names_list.append(name)
1775
-
1776
- # Use the populated dict, or None if it's empty
1777
- final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
1778
-
1779
- else:
1780
- # --- No categorical features ---
1781
- _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
1782
- continuous_feature_names_list = list(feature_names)
1783
- # categorical_feature_names_list remains empty
1784
- # categorical_index_map_dict remains empty
1785
- final_index_map = None # Explicitly set to None to match Optional type
1786
-
1787
- _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
1788
-
1789
- # Create the final immutable instance
1790
- schema_instance = FeatureSchema(
1791
- feature_names=tuple(feature_names),
1792
- continuous_feature_names=tuple(continuous_feature_names_list),
1793
- categorical_feature_names=tuple(categorical_feature_names_list),
1794
- categorical_index_map=final_index_map,
1795
- categorical_mappings=categorical_mappings
1796
- )
1797
-
1798
- return schema_instance
1799
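# A minimal usage sketch for finalize_feature_schema (hypothetical toy data). The
# `mappings` dict mirrors the {column: {category: code}} structure this function expects:
import pandas as pd

features = pd.DataFrame({"height": [1.7, 1.8], "weight": [70.0, 82.0], "sex": [0, 1]})
mappings = {"sex": {"Female": 0, "Male": 1}}
schema = finalize_feature_schema(features, categorical_mappings=mappings)
# schema.feature_names            -> ('height', 'weight', 'sex')
# schema.continuous_feature_names -> ('height', 'weight')
# schema.categorical_index_map    -> {2: 2}  (column index 2 has a cardinality of 2)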
-
1800
-
1801
- def apply_feature_schema(
1802
- df: pd.DataFrame,
1803
- schema: FeatureSchema,
1804
- targets: Optional[List[str]] = None,
1805
- unknown_value: int = 99999,
1806
- verbose: bool = True
1807
- ) -> pd.DataFrame:
1808
- """
1809
- Aligns the input DataFrame with the provided FeatureSchema.
1810
-
1811
- This function aligns data for inference/fine-tuning by enforcing the schema's
1812
- structure and encoding.
1813
-
1814
- Args:
1815
- df (pd.DataFrame): The input DataFrame.
1816
- schema (FeatureSchema): The schema defining feature names, types, and mappings.
1817
- targets (list[str] | None): Optional list of target column names.
1818
- unknown_value (int): Integer value to assign to unknown categorical levels.
1819
- Defaults to 99999 to avoid collision with existing categories.
1820
- verbose (bool): If True, logs info about dropped extra columns.
1821
-
1822
- Returns:
1823
- pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.
1824
-
1825
- Raises:
1826
- ValueError: If any required feature or target column is missing.
1827
- """
1828
- # 1. Setup
1829
- df_processed = df.copy()
1830
- targets = targets if targets is not None else []
1831
-
1832
- # 2. Validation: Strict Column Presence
1833
- missing_features = [col for col in schema.feature_names if col not in df_processed.columns]
1834
- if missing_features:
1835
- _LOGGER.error(f"Schema Mismatch: Missing required features: {missing_features}")
1836
- raise ValueError()
1837
-
1838
- # target columns should not be part of feature columns
1839
- if targets:
1840
- overlapping_columns = set(schema.feature_names).intersection(set(targets))
1841
- if overlapping_columns:
1842
- _LOGGER.error(f"Schema Mismatch: Target columns overlap with feature columns: {overlapping_columns}")
1843
- raise ValueError()
1844
-
1845
- # targets were provided, check their presence
1846
- missing_targets = [col for col in targets if col not in df_processed.columns]
1847
- if missing_targets:
1848
- _LOGGER.error(f"Target Mismatch: Missing target columns: {missing_targets}")
1849
- raise ValueError()
1850
-
1851
- # 3. Apply Categorical Encoding
1852
- if schema.categorical_feature_names and schema.categorical_mappings:
1853
- for col_name in schema.categorical_feature_names:
1854
- # Should never happen due to schema construction, but double-check and raise
1855
- if col_name not in schema.categorical_mappings:
1856
- _LOGGER.error(f"Schema Inconsistency: No mapping found for categorical feature '{col_name}'.")
1857
- raise ValueError()
1858
-
1859
- mapping = schema.categorical_mappings[col_name]
1860
-
1861
- # Apply mapping (unknowns become NaN)
1862
- df_processed[col_name] = df_processed[col_name].astype(str).map(mapping)
1863
-
1864
- # Handle Unknown Categories
1865
- if df_processed[col_name].isnull().any():
1866
- n_missing = df_processed[col_name].isnull().sum()
1867
- _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")
1868
-
1869
- # Fill unknowns with the specified integer
1870
- df_processed[col_name] = df_processed[col_name].fillna(unknown_value)
1871
-
1872
- df_processed[col_name] = df_processed[col_name].astype(int)
1873
-
1874
- # 4. Reorder and Filter
1875
- final_column_order = list(schema.feature_names) + targets
1876
-
1877
- extra_cols = set(df_processed.columns) - set(final_column_order)
1878
- if extra_cols:
1879
- _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
1880
- if verbose:
1881
- for extra_column in extra_cols:
1882
- print(f" - Dropping column: '{extra_column}'")
1883
-
1884
- df_final = df_processed[final_column_order]
1885
-
1886
- _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")
1887
-
1888
- # df_final should be a dataframe
1889
- if isinstance(df_final, pd.Series):
1890
- df_final = df_final.to_frame()
1891
-
1892
- return df_final
1893
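# A minimal usage sketch for apply_feature_schema (hypothetical toy data): new data
# arrives with string categories, an extra column, and an unseen category level:
import pandas as pd

schema = finalize_feature_schema(
    pd.DataFrame({"height": [1.7], "weight": [70.0], "sex": [0]}),
    categorical_mappings={"sex": {"Female": 0, "Male": 1}},
)
new_data = pd.DataFrame({
    "sex": ["Male", "Other"],        # "Other" is unknown to the schema
    "weight": [90.0, 60.0],
    "height": [1.9, 1.6],
    "id": [101, 102],                # extra column, dropped with a log message
})
aligned = apply_feature_schema(new_data, schema)
# aligned columns -> ['height', 'weight', 'sex'], in schema order;
# aligned["sex"]  -> [1, 99999]  (unknown levels fall back to `unknown_value`).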
-
1894
-
1895
- def _validate_columns(df: pd.DataFrame, columns: list[str]):
1896
- valid_columns = [column for column in columns if column in df.columns]
1897
- return valid_columns
1898
-
1899
-
1900
- def info():
1901
- _script_info(__all__)