dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
@@ -1,1909 +0,0 @@
1
- import pandas as pd
2
- from pandas.api.types import is_numeric_dtype, is_object_dtype
3
- import numpy as np
4
- import matplotlib.pyplot as plt
5
- import seaborn as sns
6
- from typing import Union, Literal, Dict, Tuple, List, Optional, Any
7
- from pathlib import Path
8
- import re
9
-
10
- from ._path_manager import sanitize_filename, make_fullpath
11
- from ._script_info import _script_info
12
- from ._logger import get_logger
13
- from ._utilities import save_dataframe_filename
14
- from ._schema import FeatureSchema
15
-
16
-
17
- _LOGGER = get_logger("Data Exploration")
18
-
19
-
20
- __all__ = [
21
- "summarize_dataframe",
22
- "drop_constant_columns",
23
- "drop_rows_with_missing_data",
24
- "show_null_columns",
25
- "drop_columns_with_missing_data",
26
- "drop_macro",
27
- "clean_column_names",
28
- "plot_value_distributions",
29
- "split_features_targets",
30
- "encode_categorical_features",
31
- "clip_outliers_single",
32
- "clip_outliers_multi",
33
- "drop_outlier_samples",
34
- "plot_continuous_vs_target",
35
- "plot_categorical_vs_target",
36
- "plot_correlation_heatmap",
37
- "finalize_feature_schema",
38
- "match_and_filter_columns_by_regex",
39
- "standardize_percentages",
40
- "reconstruct_one_hot",
41
- "reconstruct_binary",
42
- "reconstruct_multibinary",
43
- "split_continuous_binary",
44
- "apply_feature_schema"
45
- ]
46
-
47
-
48
- def summarize_dataframe(df: pd.DataFrame, round_digits: int = 2):
49
- """
50
- Returns a summary DataFrame with data types, non-null counts, number of unique values,
51
- missing value percentage, and basic statistics for each column.
52
-
53
- Parameters:
54
- df (pd.DataFrame): The input DataFrame.
55
- round_digits (int): Decimal places to round numerical statistics.
56
-
57
- Returns:
58
- pd.DataFrame: Summary table.
59
- """
60
- summary = pd.DataFrame({
61
- 'Data Type': df.dtypes,
62
- 'Completeness %': (df.notnull().mean() * 100).round(2),
63
- 'Unique Values': df.nunique(),
64
- # 'Missing %': (df.isnull().mean() * 100).round(2)
65
- })
66
-
67
- # For numeric columns, add summary statistics
68
- numeric_cols = df.select_dtypes(include='number').columns
69
- if not numeric_cols.empty:
70
- stats = df[numeric_cols].describe(percentiles=[.10, .25, .50, .70, .80, .90])
71
-
72
- summary_numeric = stats.T[
73
- ['mean', 'std', 'min', '10%', '25%', '50%', '70%', '80%', '90%', 'max']
74
- ].round(round_digits)
75
- summary = summary.join(summary_numeric, how='left')
76
-
77
- print(f"DataFrame Shape: {df.shape}")
78
- return summary
79
-
80
-
81
- def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
82
- """
83
- Removes columns from a pandas DataFrame that contain only a single unique
84
- value or are entirely null/NaN.
85
-
86
- This utility is useful for cleaning data by removing constant features that
87
- have no predictive value.
88
-
89
- Args:
90
- df (pd.DataFrame):
91
- The pandas DataFrame to clean.
92
- verbose (bool):
93
- If True, prints the names of the columns that were dropped.
94
- Defaults to True.
95
-
96
- Returns:
97
- pd.DataFrame:
98
- A new DataFrame with the constant columns removed.
99
- """
100
- if not isinstance(df, pd.DataFrame):
101
- _LOGGER.error("Input must be a pandas DataFrame.")
102
- raise TypeError()
103
-
104
- # make copy to avoid modifying original
105
- df_clean = df.copy()
106
-
107
- original_columns = set(df.columns)
108
- cols_to_keep = []
109
-
110
- for col_name in df_clean.columns:
111
- column = df_clean[col_name]
112
-
113
- # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
114
- if column.nunique(dropna=True) > 1:
115
- cols_to_keep.append(col_name)
116
-
117
- dropped_columns = original_columns - set(cols_to_keep)
118
- if verbose:
119
- if dropped_columns:
120
- _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
121
- else:
122
- _LOGGER.info("No constant columns found.")
123
-
124
- # Return a new DataFrame with only the columns to keep
125
- df_clean = df_clean[cols_to_keep]
126
-
127
- if isinstance(df_clean, pd.Series):
128
- df_clean = df_clean.to_frame()
129
-
130
- return df_clean
131
-
132
-
133
- def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
134
- """
135
- Drops rows from the DataFrame using a two-stage strategy:
136
-
137
- 1. If `targets`, remove any row where all target columns are missing.
138
- 2. Among features, drop those with more than `threshold` fraction of missing values.
139
-
140
- Parameters:
141
- df (pd.DataFrame): The input DataFrame.
142
- targets (list[str] | None): List of target column names.
143
- threshold (float): Maximum allowed fraction of missing values in feature columns.
144
-
145
- Returns:
146
- pd.DataFrame: A cleaned DataFrame with problematic rows removed.
147
- """
148
- df_clean = df.copy()
149
-
150
- # Stage 1: Drop rows with all target columns missing
151
- valid_targets = []
152
- if targets:
153
- # validate targets
154
- valid_targets = _validate_columns(df_clean, targets)
155
-
156
- # Only proceed if we actually have columns to check
157
- if valid_targets:
158
- target_na = df_clean[valid_targets].isnull().all(axis=1)
159
- if target_na.any():
160
- _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
161
- df_clean = df_clean[~target_na]
162
- else:
163
- _LOGGER.info("No rows found where all targets are missing.")
164
- else:
165
- _LOGGER.error("Targets list provided but no matching columns found in DataFrame.")
166
- raise ValueError()
167
-
168
- # Stage 2: Drop rows based on feature column missing values
169
- feature_cols = [col for col in df_clean.columns if col not in valid_targets]
170
- if feature_cols:
171
- feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
172
- rows_to_drop = feature_na_frac[feature_na_frac > threshold].index # type: ignore
173
- if len(rows_to_drop) > 0:
174
- _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
175
- df_clean = df_clean.drop(index=rows_to_drop)
176
- else:
177
- _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
178
- else:
179
- _LOGGER.warning("No feature columns available to evaluate.")
180
-
181
- return df_clean
182
-
183
-
184
- def show_null_columns(
185
- df: pd.DataFrame,
186
- round_digits: int = 2,
187
- plot_to_dir: Optional[Union[str, Path]] = None,
188
- plot_filename: Optional[str] = None,
189
- use_all_columns: bool = False
190
- ) -> pd.DataFrame:
191
- """
192
- Returns a table of columns with missing values, showing both the count and
193
- percentage of missing entries per column.
194
-
195
- Optionally generates a visualization of the missing data profile.
196
-
197
- Parameters:
198
- df (pd.DataFrame): The input DataFrame.
199
- round_digits (int): Number of decimal places for the percentage.
200
- plot_to_dir (str | Path | None): If provided, saves a visualization of the
201
- missing data to this directory.
202
- plot_filename (str): The filename for the saved plot (without extension).
203
- Used only if `plot_to_dir` is set.
204
- use_all_columns (bool): If True, includes all columns in the summary and plot,
205
- even those with no missing values.
206
-
207
- Returns:
208
- pd.DataFrame: A DataFrame summarizing missing values in each column.
209
- """
210
- null_counts = df.isnull().sum()
211
- null_percent = df.isnull().mean() * 100
212
-
213
- if use_all_columns:
214
- null_summary = pd.DataFrame({
215
- 'Missing Count': null_counts,
216
- 'Missing %': null_percent.round(round_digits)
217
- })
218
- else:
219
- # Filter only columns with at least one null
220
- mask = null_counts > 0
221
- null_summary = pd.DataFrame({
222
- 'Missing Count': null_counts[mask],
223
- 'Missing %': null_percent[mask].round(round_digits)
224
- })
225
-
226
- # Sort by descending percentage of missing values
227
- null_summary = null_summary.sort_values(by='Missing %', ascending=False)
228
-
229
- # --- Visualization Logic ---
230
- if plot_to_dir:
231
- if null_summary.empty:
232
- _LOGGER.info("No missing data found. Skipping plot generation.")
233
- else:
234
- try:
235
- # Validate and create save directory
236
- save_path = make_fullpath(plot_to_dir, make=True, enforce="directory")
237
-
238
- # Prepare data
239
- features = null_summary.index.tolist()
240
- missing_pct = np.array(null_summary['Missing %'].values)
241
- present_pct = 100 - missing_pct
242
- n_features = len(features)
243
-
244
- # Dynamic width
245
- width = max(10, n_features * 0.4)
246
- plt.figure(figsize=(width, 8))
247
-
248
- # Stacked Bar Chart Logic
249
-
250
- # Grid behind bars
251
- plt.grid(axis='y', linestyle='--', alpha=0.5, zorder=0)
252
-
253
- # 1. Present Data: Solid Green
254
- plt.bar(
255
- features,
256
- present_pct,
257
- color='tab:green',
258
- label='Present',
259
- width=0.6,
260
- zorder=3
261
- )
262
-
263
- # 2. Missing Data: Transparent Red Fill + Solid Red Hatch
264
- # define facecolor (fill) with alpha, but edgecolor (lines) without alpha.
265
- plt.bar(
266
- features,
267
- missing_pct,
268
- bottom=present_pct,
269
- facecolor=(1.0, 1.0, 1.0, 0.2), # RGBA
270
- edgecolor='tab:red', # Solid red for the hatch lines
271
- hatch='///', # hatch pattern
272
- linewidth=0.4, # Ensure lines are thick enough to see
273
- label='Missing',
274
- width=0.6,
275
- zorder=3
276
- )
277
-
278
- # Styling
279
- plt.ylim(0, 100)
280
- plt.ylabel("Data Completeness (%)", fontsize=13)
281
- plt.yticks(np.arange(0, 101, 10))
282
- plot_title = f"Missing Data - {plot_filename.replace('_', ' ')}" if plot_filename else "Missing Data"
283
- plt.title(plot_title)
284
- plt.xticks(rotation=45, ha='right', fontsize=9)
285
-
286
- # Reference line
287
- plt.axhline(y=100, color='black', linestyle='-', linewidth=0.5, alpha=0.3)
288
-
289
- plt.legend(loc='lower right', framealpha=0.95)
290
- plt.tight_layout()
291
-
292
- # Save
293
- if plot_filename is None or plot_filename.strip() == "":
294
- plot_filename = "Missing_Data_Profile"
295
- else:
296
- plot_filename = "Missing_Data_" + sanitize_filename(plot_filename)
297
-
298
- full_filename = plot_filename + ".svg"
299
- plt.savefig(save_path / full_filename, format='svg', bbox_inches="tight")
300
- plt.close()
301
-
302
- _LOGGER.info(f"Saved missing data plot as '{full_filename}'")
303
-
304
- except Exception as e:
305
- _LOGGER.error(f"Failed to generate missing data plot. Error: {e}")
306
- plt.close()
307
-
308
- return null_summary
309
-
310
-
311
- def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[List[str]]=None) -> pd.DataFrame:
312
- """
313
- Drops columns with more than `threshold` fraction of missing values.
314
-
315
- Parameters:
316
- df (pd.DataFrame): The input DataFrame.
317
- threshold (float): Fraction of missing values above which columns are dropped.
318
- show_nulls_after (bool): Prints `show_null_columns` after dropping columns.
319
- skip_columns (list[str] | None): If given, these columns wont be included in the drop process.
320
-
321
- Returns:
322
- pd.DataFrame: A new DataFrame without the dropped columns.
323
- """
324
- # If skip_columns is provided, create a list of columns to check.
325
- # Otherwise, check all columns.
326
- cols_to_check = df.columns
327
- if skip_columns:
328
- # Use set difference for efficient exclusion
329
- cols_to_check = df.columns.difference(skip_columns)
330
-
331
- # Calculate the missing fraction only on the columns to be checked
332
- missing_fraction = df[cols_to_check].isnull().mean()
333
-
334
-
335
- cols_to_drop = missing_fraction[missing_fraction > threshold].index # type: ignore
336
-
337
- if len(cols_to_drop) > 0:
338
- _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
339
-
340
- result_df = df.drop(columns=cols_to_drop)
341
- if show_nulls_after:
342
- print(show_null_columns(df=result_df))
343
-
344
- return result_df
345
- else:
346
- _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
347
- return df
348
-
349
-
350
- def drop_macro(df: pd.DataFrame,
351
- log_directory: Union[str,Path],
352
- targets: list[str],
353
- skip_targets: bool=False,
354
- threshold: float=0.7) -> pd.DataFrame:
355
- """
356
- Iteratively removes rows and columns with excessive missing data.
357
-
358
- This function performs a comprehensive cleaning cycle on a DataFrame. It
359
- repeatedly drops columns with constant values, followed by rows and columns that exceed
360
- a specified threshold of missing values. The process continues until the
361
- DataFrame's dimensions stabilize, ensuring that the interdependency between
362
- row and column deletions is handled.
363
-
364
- Initial and final missing data reports are saved to the specified log directory.
365
-
366
- Args:
367
- df (pd.DataFrame): The input pandas DataFrame to be cleaned.
368
- log_directory (Union[str, Path]): Path to the directory where the missing data reports
369
- and plots will be saved inside a "Missing Report" subdirectory.
370
- targets (list[str]): A list of column names to be treated as target
371
- variables. This list guides the row-dropping logic.
372
- skip_targets (bool, optional): If True, the columns listed in `targets`
373
- will be exempt from being dropped, even if they exceed the missing
374
- data threshold.
375
- threshold (float, optional): The proportion of missing data required to drop
376
- a row or column. For example, 0.7 means a row/column will be
377
- dropped if 70% or more of its data is missing.
378
-
379
- Returns:
380
- pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
381
- """
382
- # make a deep copy to work with
383
- df_clean = df.copy()
384
-
385
- base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
386
- full_path = base_dir_path / "Missing Report"
387
-
388
- # Log initial state + Plot
389
- missing_data_start = show_null_columns(
390
- df=df_clean,
391
- plot_to_dir=full_path,
392
- plot_filename="Original",
393
- use_all_columns=True
394
- )
395
- save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
396
- save_dir=full_path,
397
- filename="Missing_Data_Original")
398
-
399
- # Clean cycles for rows and columns
400
- master = True
401
- while master:
402
- # track rows and columns
403
- initial_rows, initial_columns = df_clean.shape
404
-
405
- # drop constant columns
406
- df_clean = drop_constant_columns(df=df_clean)
407
-
408
- # clean rows
409
- df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
410
-
411
- # clean columns
412
- if skip_targets:
413
- df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
414
- else:
415
- df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
416
-
417
- # cleaned?
418
- remaining_rows, remaining_columns = df_clean.shape
419
- if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
420
- master = False
421
-
422
- # log final state + plot
423
- missing_data_final = show_null_columns(
424
- df=df_clean,
425
- plot_to_dir=full_path,
426
- plot_filename="Processed",
427
- use_all_columns=True
428
- )
429
- save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
430
- save_dir=full_path,
431
- filename="Missing_Data_Processed")
432
-
433
- # return cleaned dataframe
434
- return df_clean
435
-
436
-
437
- def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacement_pattern: str = r'[\[\]{}<>,:"]', verbose: bool = True) -> pd.DataFrame:
438
- """
439
- Cleans DataFrame column names by replacing special characters.
440
-
441
- This function is useful for ensuring compatibility with libraries like LightGBM,
442
- which do not support special JSON characters such as `[]{}<>,:"` in feature names.
443
-
444
- Args:
445
- df (pd.DataFrame): The input DataFrame.
446
- replacement_char (str): The character to use for replacing characters.
447
- replacement_pattern (str): Regex pattern to use for the replacement logic.
448
- verbose (bool): If True, prints the renamed columns.
449
-
450
- Returns:
451
- pd.DataFrame: A new DataFrame with cleaned column names.
452
- """
453
- new_df = df.copy()
454
-
455
- original_columns = new_df.columns
456
- new_columns = original_columns.str.replace(replacement_pattern, replacement_char, regex=True)
457
-
458
- # Create a map of changes for logging
459
- rename_map = {old: new for old, new in zip(original_columns, new_columns) if old != new}
460
-
461
- if verbose:
462
- if rename_map:
463
- _LOGGER.info(f"Cleaned {len(rename_map)} column name(s) containing special characters:")
464
- for old, new in rename_map.items():
465
- print(f" '{old}' -> '{new}'")
466
- else:
467
- _LOGGER.info("No column names required cleaning.")
468
-
469
- new_df.columns = new_columns
470
- return new_df
471
-
472
-
473
- def plot_value_distributions(
474
- df: pd.DataFrame,
475
- save_dir: Union[str, Path],
476
- categorical_columns: Optional[List[str]] = None,
477
- max_categories: int = 100,
478
- fill_na_with: str = "MISSING DATA"
479
- ):
480
- """
481
- Plots and saves the value distributions for all columns in a DataFrame,
482
- using the best plot type for each column (histogram or count plot).
483
-
484
- Plots are saved as SVG files under two subdirectories in `save_dir`:
485
- - "Distribution_Continuous" for continuous numeric features (histograms).
486
- - "Distribution_Categorical" for categorical features (count plots).
487
-
488
- Args:
489
- df (pd.DataFrame): The input DataFrame to analyze.
490
- save_dir (str | Path): Directory path to save the plots.
491
- categorical_columns (List[str] | None): If provided, these will be treated as categorical, and all other columns will be treated as continuous.
492
- max_categories (int): The maximum number of unique categories a categorical feature can have to be plotted. Features exceeding this limit will be skipped.
493
- fill_na_with (str): A string to replace NaN values in categorical columns. This allows plotting 'missingness' as its own category.
494
-
495
- Notes:
496
- - `seaborn.histplot` with KDE is used for continuous features.
497
- - `seaborn.countplot` is used for categorical features.
498
- """
499
- # 1. Setup save directories
500
- base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
501
- numeric_dir = base_save_path / "Distribution_Continuous"
502
- categorical_dir = base_save_path / "Distribution_Categorical"
503
- numeric_dir.mkdir(parents=True, exist_ok=True)
504
- categorical_dir.mkdir(parents=True, exist_ok=True)
505
-
506
- # 2. Filter columns to plot
507
- columns_to_plot = df.columns.to_list()
508
-
509
- # Setup for forced categorical logic
510
- categorical_set = set(categorical_columns) if categorical_columns is not None else None
511
-
512
- numeric_plots_saved = 0
513
- categorical_plots_saved = 0
514
-
515
- for col_name in columns_to_plot:
516
- try:
517
- is_numeric = is_numeric_dtype(df[col_name])
518
- n_unique = df[col_name].nunique()
519
-
520
- # --- 3. Determine Plot Type ---
521
- is_continuous = False
522
- if categorical_set is not None:
523
- # Use the explicit list
524
- if col_name not in categorical_set:
525
- is_continuous = True
526
- else:
527
- # Use auto-detection
528
- if is_numeric:
529
- is_continuous = True
530
-
531
- # --- Case 1: Continuous Numeric (Histogram) ---
532
- if is_continuous:
533
- plt.figure(figsize=(10, 6))
534
- # Drop NaNs for histogram, as they can't be plotted on a numeric axis
535
- sns.histplot(x=df[col_name].dropna(), kde=True, bins=30)
536
- plt.title(f"Distribution of '{col_name}' (Continuous)")
537
- plt.xlabel(col_name)
538
- plt.ylabel("Count")
539
-
540
- save_path = numeric_dir / f"{sanitize_filename(col_name)}.svg"
541
- numeric_plots_saved += 1
542
-
543
- # --- Case 2: Categorical (Count Plot) ---
544
- else:
545
- # Check max categories
546
- if n_unique > max_categories:
547
- _LOGGER.warning(f"Skipping plot for '{col_name}': {n_unique} unique values > {max_categories} max_categories.")
548
- continue
549
-
550
- # Adaptive figure size
551
- fig_width = max(10, n_unique * 0.5)
552
- plt.figure(figsize=(fig_width, 8))
553
-
554
- # Make a temporary copy for plotting to handle NaNs
555
- temp_series = df[col_name].copy()
556
-
557
- # Handle NaNs by replacing them with the specified string
558
- if temp_series.isnull().any():
559
- # Convert to object type first to allow string replacement
560
- temp_series = temp_series.astype(object).fillna(fill_na_with)
561
-
562
- # Convert all to string to be safe (handles low-card numeric)
563
- temp_series = temp_series.astype(str)
564
-
565
- # Get category order by frequency
566
- order = temp_series.value_counts().index
567
- sns.countplot(x=temp_series, order=order, palette="Oranges", hue=temp_series, legend=False)
568
-
569
- plt.title(f"Distribution of '{col_name}' (Categorical)")
570
- plt.xlabel(col_name)
571
- plt.ylabel("Count")
572
-
573
- # Smart tick rotation
574
- max_label_len = 0
575
- if n_unique > 0:
576
- max_label_len = max(len(str(s)) for s in order)
577
-
578
- # Rotate if labels are long OR there are many categories
579
- if max_label_len > 10 or n_unique > 25:
580
- plt.xticks(rotation=45, ha='right')
581
-
582
- save_path = categorical_dir / f"{sanitize_filename(col_name)}.svg"
583
- categorical_plots_saved += 1
584
-
585
- # --- 4. Save Plot ---
586
- plt.grid(True, linestyle='--', alpha=0.6, axis='y')
587
- plt.tight_layout()
588
- # Save as .svg
589
- plt.savefig(save_path, format='svg', bbox_inches="tight")
590
- plt.close()
591
-
592
- except Exception as e:
593
- _LOGGER.error(f"Failed to plot distribution for '{col_name}'. Error: {e}")
594
- plt.close()
595
-
596
- _LOGGER.info(f"Saved {numeric_plots_saved} continuous distribution plots to '{numeric_dir.name}'.")
597
- _LOGGER.info(f"Saved {categorical_plots_saved} categorical distribution plots to '{categorical_dir.name}'.")
598
-
599
-
600
- def plot_continuous_vs_target(
601
- df_continuous: pd.DataFrame,
602
- df_targets: pd.DataFrame,
603
- save_dir: Union[str, Path],
604
- verbose: int = 1
605
- ):
606
- """
607
- Plots each continuous feature from df_continuous against each target in df_targets.
608
-
609
- This function creates a scatter plot for each feature-target pair, overlays a
610
- simple linear regression line, and saves each plot as an individual .svg file.
611
-
612
- Plots are saved in a structured way, with a subdirectory created for
613
- each target variable.
614
-
615
- Args:
616
- df_continuous (pd.DataFrame): DataFrame containing continuous feature columns (x-axis).
617
- df_targets (pd.DataFrame): DataFrame containing target columns (y-axis).
618
- save_dir (str | Path): The base directory where plots will be saved.
619
- verbose (int): Verbosity level for logging warnings.
620
-
621
- Notes:
622
- - Only numeric features and numeric targets are processed.
623
- - Rows with NaN in either the feature or the target are dropped pairwise.
624
- - Assumes df_continuous and df_targets share the same index.
625
- """
626
- # 1. Validate the base save directory
627
- base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
628
-
629
- # 2. Validation helper
630
- def _get_valid_numeric_cols(df: pd.DataFrame, df_name: str) -> List[str]:
631
- valid_cols = []
632
- for col in df.columns:
633
- if not is_numeric_dtype(df[col]):
634
- if verbose > 0:
635
- _LOGGER.warning(f"Column '{col}' in {df_name} is not numeric. Skipping.")
636
- else:
637
- valid_cols.append(col)
638
- return valid_cols
639
-
640
- # 3. Validate target columns
641
- valid_targets = _get_valid_numeric_cols(df_targets, "df_targets")
642
- if not valid_targets:
643
- _LOGGER.error("No valid numeric target columns provided in df_targets.")
644
- return
645
-
646
- # 4. Validate feature columns
647
- valid_features = _get_valid_numeric_cols(df_continuous, "df_continuous")
648
- if not valid_features:
649
- _LOGGER.error("No valid numeric feature columns provided in df_continuous.")
650
- return
651
-
652
- # 5. Main plotting loop
653
- total_plots_saved = 0
654
-
655
- for target_name in valid_targets:
656
- # Create a sanitized subdirectory for this target
657
- safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Continuous")
658
- target_save_dir = base_save_path / safe_target_dir_name
659
- target_save_dir.mkdir(parents=True, exist_ok=True)
660
-
661
- if verbose > 0:
662
- _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
663
-
664
- for feature_name in valid_features:
665
-
666
- # Align data and drop NaNs pairwise - use concat to ensure we respect the index alignment between the two DFs
667
- temp_df = pd.concat([
668
- df_continuous[feature_name],
669
- df_targets[target_name]
670
- ], axis=1).dropna()
671
-
672
- if temp_df.empty:
673
- if verbose > 1:
674
- _LOGGER.warning(f"No non-null data for '{feature_name}' vs '{target_name}'. Skipping plot.")
675
- continue
676
-
677
- x = temp_df[feature_name]
678
- y = temp_df[target_name]
679
-
680
- # 6. Perform linear fit
681
- try:
682
- # Modern replacement for np.polyfit + np.poly1d
683
- p = np.polynomial.Polynomial.fit(x, y, deg=1)
684
- plot_regression_line = True
685
- except (np.linalg.LinAlgError, ValueError):
686
- if verbose > 0:
687
- _LOGGER.warning(f"Linear regression failed for '{feature_name}' vs '{target_name}'. Plotting scatter only.")
688
- plot_regression_line = False
689
-
690
- # 7. Create the plot
691
- plt.figure(figsize=(10, 6))
692
- ax = plt.gca()
693
-
694
- # Plot the raw data points
695
- ax.plot(x, y, 'o', alpha=0.5, label='Data points', markersize=5)
696
-
697
- # Plot the regression line
698
- if plot_regression_line:
699
- ax.plot(x, p(x), "r--", label='Linear Fit') # type: ignore
700
-
701
- ax.set_title(f'{feature_name} vs {target_name}')
702
- ax.set_xlabel(feature_name)
703
- ax.set_ylabel(target_name)
704
- ax.legend()
705
- plt.grid(True, linestyle='--', alpha=0.6)
706
- plt.tight_layout()
707
-
708
- # 8. Save the plot
709
- safe_feature_name = sanitize_filename(feature_name)
710
- plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
711
- plot_path = target_save_dir / plot_filename
712
-
713
- try:
714
- plt.savefig(plot_path, bbox_inches="tight", format='svg')
715
- total_plots_saved += 1
716
- except Exception as e:
717
- _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
718
-
719
- # Close the figure to free up memory
720
- plt.close()
721
-
722
- if verbose > 0:
723
- _LOGGER.info(f"Successfully saved {total_plots_saved} feature-vs-target plots to '{base_save_path}'.")
724
-
725
-
726
- def plot_categorical_vs_target(
727
- df_categorical: pd.DataFrame,
728
- df_targets: pd.DataFrame,
729
- save_dir: Union[str, Path],
730
- max_categories: int = 50,
731
- fill_na_with: str = "MISSING DATA",
732
- drop_empty_targets: bool = True,
733
- verbose: int = 1
734
- ):
735
- """
736
- Plots each feature in df_categorical against each numeric target in df_targets using box plots.
737
-
738
- Automatically aligns the two DataFrames by index. If a numeric
739
- column is passed within df_categorical, it will be cast to object type to treat it as a category.
740
-
741
- Args:
742
- df_categorical (pd.DataFrame): DataFrame containing categorical feature columns (x-axis).
743
- df_targets (pd.DataFrame): DataFrame containing numeric target columns (y-axis).
744
- save_dir (str | Path): Base directory for saving plots.
745
- max_categories (int): The maximum number of unique categories a feature can have to be plotted.
746
- fill_na_with (str): String to replace NaN values in categorical columns.
747
- drop_empty_targets (bool): If True, drops rows where the target value is NaN before plotting.
748
- verbose (int): Verbosity level for logging warnings.
749
-
750
- Notes:
751
- - Assumes df_categorical and df_targets share the same index.
752
- """
753
- # 1. Validate the base save directory
754
- base_save_path = make_fullpath(save_dir, make=True, enforce="directory")
755
-
756
- # 2. Validate target columns (must be numeric)
757
- valid_targets = []
758
- for col in df_targets.columns:
759
- if not is_numeric_dtype(df_targets[col]):
760
- if verbose > 0:
761
- _LOGGER.warning(f"Target column '{col}' in df_targets is not numeric. Skipping.")
762
- else:
763
- valid_targets.append(col)
764
-
765
- if not valid_targets:
766
- _LOGGER.error("No valid numeric target columns provided in df_targets.")
767
- return
768
-
769
- # 3. Validate feature columns (Flexible: Allow numeric but warn)
770
- valid_features = []
771
- for col in df_categorical.columns:
772
- # If numeric, warn but accept it (will be cast to object later)
773
- if is_numeric_dtype(df_categorical[col]):
774
- if verbose > 0:
775
- _LOGGER.warning(f"Feature '{col}' in df_categorical is numeric. It will be cast to 'object' and treated as categorical.")
776
- valid_features.append(col)
777
- else:
778
- # Assume it is already object/category
779
- valid_features.append(col)
780
-
781
- if not valid_features:
782
- _LOGGER.error("No valid feature columns provided in df_categorical.")
783
- return
784
-
785
- # 4. Main plotting loop
786
- total_plots_saved = 0
787
-
788
- for target_name in valid_targets:
789
- # Create a sanitized subdirectory for this target
790
- safe_target_dir_name = sanitize_filename(f"{target_name}_vs_Categorical")
791
- target_save_dir = base_save_path / safe_target_dir_name
792
- target_save_dir.mkdir(parents=True, exist_ok=True)
793
-
794
- if verbose > 0:
795
- _LOGGER.info(f"Generating plots for target: '{target_name}' -> Saving to '{target_save_dir.name}'")
796
-
797
- for feature_name in valid_features:
798
-
799
- # Align data using concat to respect indices
800
- feature_series = df_categorical[feature_name]
801
- target_series = df_targets[target_name]
802
-
803
- # Create a temporary DataFrame for this pair
804
- temp_df = pd.concat([feature_series, target_series], axis=1)
805
-
806
- # Optional: Drop rows where the target is NaN
807
- if drop_empty_targets:
808
- temp_df = temp_df.dropna(subset=[target_name])
809
- if temp_df.empty:
810
- if verbose > 1:
811
- _LOGGER.warning(f"No valid data left for '{feature_name}' vs '{target_name}' after dropping empty targets. Skipping.")
812
- continue
813
-
814
- # Force feature to object if it isn't already (handling the numeric flexibility)
815
- if not is_object_dtype(temp_df[feature_name]):
816
- temp_df[feature_name] = temp_df[feature_name].astype(object)
817
-
818
- # Handle NaNs in the feature column (treat as a category)
819
- if temp_df[feature_name].isnull().any():
820
- temp_df[feature_name] = temp_df[feature_name].fillna(fill_na_with)
821
-
822
- # Convert to string to ensure consistent plotting and cardinality check
823
- temp_df[feature_name] = temp_df[feature_name].astype(str)
824
-
825
- # Check cardinality
826
- n_unique = temp_df[feature_name].nunique()
827
- if n_unique > max_categories:
828
- if verbose > 1:
829
- _LOGGER.warning(f"Skipping '{feature_name}': {n_unique} unique categories > {max_categories} max_categories.")
830
- continue
831
-
832
- # 5. Create the plot
833
- # Dynamic figure width based on number of categories
834
- plt.figure(figsize=(max(10, n_unique * 0.8), 10))
835
-
836
- sns.boxplot(x=feature_name, y=target_name, data=temp_df)
837
-
838
- plt.title(f'{target_name} vs {feature_name}')
839
- plt.xlabel(feature_name)
840
- plt.ylabel(target_name)
841
- plt.xticks(rotation=45, ha='right')
842
- plt.grid(True, linestyle='--', alpha=0.6, axis='y')
843
- plt.tight_layout()
844
-
845
- # 6. Save the plot
846
- safe_feature_name = sanitize_filename(feature_name)
847
- plot_filename = f"{safe_feature_name}_vs_{safe_target_dir_name}.svg"
848
- plot_path = target_save_dir / plot_filename
849
-
850
- try:
851
- plt.savefig(plot_path, bbox_inches="tight", format='svg')
852
- total_plots_saved += 1
853
- except Exception as e:
854
- _LOGGER.error(f"Failed to save plot: {plot_path}. Error: {e}")
855
-
856
- plt.close()
857
-
858
- if verbose > 0:
859
- _LOGGER.info(f"Successfully saved {total_plots_saved} categorical-vs-target plots to '{base_save_path}'.")
860
-
861
-
862
- def encode_categorical_features(
863
- df: pd.DataFrame,
864
- columns_to_encode: List[str],
865
- encode_nulls: bool,
866
- null_label: str = "Other",
867
- split_resulting_dataset: bool = True,
868
- verbose: bool = True
869
- ) -> Tuple[Dict[str, Dict[str, int]], pd.DataFrame, Optional[pd.DataFrame]]:
870
- """
871
- Finds unique values in specified categorical columns, encodes them into integers,
872
- and returns a dictionary containing the mappings for each column.
873
-
874
- This function automates the label encoding process and generates a simple,
875
- human-readable dictionary of the mappings.
876
-
877
- Args:
878
- df (pd.DataFrame): The input DataFrame.
879
- columns_to_encode (List[str]): A list of column names to be encoded.
880
- encode_nulls (bool):
881
- - If True, encodes Null values as a distinct category 'null_label' with a value of 0. Other categories start from 1.
882
- - If False, Nulls are ignored and categories start from 0.
883
-
884
- null_label (str): Category to encode Nulls to if `encode_nulls` is True. If a name collision with `null_label` occurs, the fallback key will be "__NULL__".
885
- split_resulting_dataset (bool):
886
- - If True, returns two separate DataFrames, one with non-categorical columns and one with the encoded columns.
887
- - If False, returns a single DataFrame with all columns.
888
- verbose (bool): If True, prints encoding progress.
889
-
890
- Returns:
891
- Tuple:
892
-
893
- - Dict[str, Dict[str, int]]: A dictionary where each key is a column name and the value is its category-to-integer mapping.
894
-
895
- - pd.DataFrame: The original dataframe with or without encoded columns (see `split_resulting_dataset`).
896
-
897
- - pd.DataFrame | None: If `split_resulting_dataset` is True, the encoded columns as a new dataframe.
898
-
899
- ## **Important:**
900
- 1. Do not encode 'Ordinal Features' (e.g., Low=1, Med=2, High=3), these must be treated as numerical (continuous).
901
- 2. Use `encode_nulls=False` when encoding binary values with missing entries or a malformed encoding will be returned silently.
902
- """
903
- df_encoded = df.copy()
904
-
905
- # Validate columns
906
- valid_columns = [col for col in columns_to_encode if col in df_encoded.columns]
907
- missing_columns = set(columns_to_encode) - set(valid_columns)
908
- if missing_columns:
909
- _LOGGER.warning(f"Columns not found and will be skipped: {list(missing_columns)}")
910
-
911
- mappings: Dict[str, Dict[str, int]] = {}
912
-
913
- _LOGGER.info(f"Encoding {len(valid_columns)} categorical column(s).")
914
- for col_name in valid_columns:
915
- has_nulls = df_encoded[col_name].isnull().any()
916
-
917
- # Get unique values once to check cardinality and generate categories
918
- raw_unique_values = df_encoded[col_name].dropna().unique()
919
-
920
- # --- Check for constant columns ---
921
- if len(raw_unique_values) <= 1:
922
- # Exception: If we are encoding nulls and nulls exist, this is effectively a binary feature (Null vs Value)
923
- is_effectively_binary = encode_nulls and has_nulls
924
-
925
- if not is_effectively_binary:
926
- _LOGGER.warning(f"Column '{col_name}' has only {len(raw_unique_values)} unique value(s). Consider dropping it before encoding as it offers no predictive variance.")
927
-
928
- # Prepare categories (sorted string representation)
929
- categories = sorted([str(cat) for cat in raw_unique_values])
930
-
931
- if encode_nulls and has_nulls:
932
- # Handle nulls: "Other" -> 0, other categories -> 1, 2, 3...
933
- # Start mapping from 1 for non-null values
934
- mapping = {category: i + 1 for i, category in enumerate(categories)}
935
-
936
- # Apply mapping and fill remaining NaNs with 0
937
- mapped_series = df_encoded[col_name].astype(str).map(mapping)
938
- df_encoded[col_name] = mapped_series.fillna(0).astype(int)
939
-
940
- # --- Validate nulls category---
941
- # Ensure the key for 0 doesn't collide with a real category.
942
- if null_label in mapping.keys():
943
- # COLLISION! null_label is a real category
944
- original_label = null_label
945
- null_label = "__NULL__" # fallback
946
- _LOGGER.warning(f"Column '{col_name}': '{original_label}' is a real category. Mapping nulls (0) to '{null_label}' instead.")
947
-
948
- # Create the complete user-facing map including "Other"
949
- user_mapping = {**mapping, null_label: 0}
950
- mappings[col_name] = user_mapping
951
- else:
952
- # ignore nulls: categories start from 0
953
- mapping = {category: i for i, category in enumerate(categories)}
954
-
955
- df_encoded[col_name] = df_encoded[col_name].astype(str).map(mapping)
956
-
957
- mappings[col_name] = mapping
958
-
959
- if verbose:
960
- cardinality = len(mappings[col_name])
961
- print(f" - Encoded '{col_name}' with {cardinality} unique values.")
962
-
963
- # Handle the dataset splitting logic
964
- if split_resulting_dataset:
965
- df_categorical = df_encoded[valid_columns]
966
- df_non_categorical = df.drop(columns=valid_columns)
967
- return mappings, df_non_categorical, df_categorical
968
- else:
969
- return mappings, df_encoded, None
970
-
971
-
972
- def split_features_targets(df: pd.DataFrame, targets: list[str]):
973
- """
974
- Splits a DataFrame's columns into features and targets.
975
-
976
- Args:
977
- df (pd.DataFrame): Pandas DataFrame containing the dataset.
978
- targets (list[str]): List of column names to be treated as target variables.
979
-
980
- Returns:
981
- tuple: A tuple containing:
982
- - pd.DataFrame: Features dataframe.
983
- - pd.DataFrame: Targets dataframe.
984
-
985
- Prints:
986
- - Shape of the original dataframe.
987
- - Shape of the features dataframe.
988
- - Shape of the targets dataframe.
989
- """
990
- valid_targets = _validate_columns(df, targets)
991
- df_targets = df[valid_targets]
992
- df_features = df.drop(columns=valid_targets)
993
- print(f"Original shape: {df.shape}\nFeatures shape: {df_features.shape}\nTargets shape: {df_targets.shape}")
994
- return df_features, df_targets
995
-
996
-
997
- def split_continuous_binary(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
998
- """
999
- Split DataFrame into two DataFrames: one with continuous columns, one with binary columns.
1000
- Normalize binary values like 0.0/1.0 to 0/1 if detected.
1001
-
1002
- Parameters:
1003
- df (pd.DataFrame): Input DataFrame with only numeric columns.
1004
-
1005
- Returns:
1006
- Tuple(pd.DataFrame, pd.DataFrame): (continuous_columns_df, binary_columns_df)
1007
-
1008
- Raises:
1009
- TypeError: If any column is not numeric.
1010
- """
1011
- if not all(np.issubdtype(dtype, np.number) for dtype in df.dtypes):
1012
- _LOGGER.error("All columns must be numeric (int or float).")
1013
- raise TypeError()
1014
-
1015
- binary_cols = []
1016
- continuous_cols = []
1017
-
1018
- for col in df.columns:
1019
- series = df[col]
1020
- unique_values = set(series[~series.isna()].unique())
1021
-
1022
- if unique_values.issubset({0, 1}):
1023
- binary_cols.append(col)
1024
- elif unique_values.issubset({0.0, 1.0}):
1025
- df[col] = df[col].apply(lambda x: 0 if x == 0.0 else (1 if x == 1.0 else x))
1026
- binary_cols.append(col)
1027
- else:
1028
- continuous_cols.append(col)
1029
-
1030
- binary_cols.sort()
1031
-
1032
- df_cont = df[continuous_cols]
1033
- df_bin = df[binary_cols]
1034
-
1035
- print(f"Continuous columns shape: {df_cont.shape}")
1036
- print(f"Binary columns shape: {df_bin.shape}")
1037
-
1038
- return df_cont, df_bin # type: ignore
1039
-
1040
-
1041
- def plot_correlation_heatmap(df: pd.DataFrame,
1042
- plot_title: str,
1043
- save_dir: Union[str, Path, None] = None,
1044
- method: Literal["pearson", "kendall", "spearman"]="pearson"):
1045
- """
1046
- Plots a heatmap of pairwise correlations between numeric features in a DataFrame.
1047
-
1048
- Args:
1049
- df (pd.DataFrame): The input dataset.
1050
- save_dir (str | Path | None): If provided, the heatmap will be saved to this directory as a svg file.
1051
- plot_title: The suffix "`method` Correlation Heatmap" will be automatically appended.
1052
- method (str): Correlation method to use. Must be one of:
1053
- - 'pearson' (default): measures linear correlation (assumes normally distributed data),
1054
- - 'kendall': rank correlation (non-parametric),
1055
- - 'spearman': monotonic relationship (non-parametric).
1056
-
1057
- Notes:
1058
- - Only numeric columns are included.
1059
- - Annotations are disabled if there are more than 20 features.
1060
- - Missing values are handled via pairwise complete observations.
1061
- """
1062
- numeric_df = df.select_dtypes(include='number')
1063
- if numeric_df.empty:
1064
- _LOGGER.warning("No numeric columns found. Heatmap not generated.")
1065
- return
1066
- if method not in ["pearson", "kendall", "spearman"]:
1067
- _LOGGER.error(f"'method' must be pearson, kendall, or spearman.")
1068
- raise ValueError()
1069
-
1070
- corr = numeric_df.corr(method=method)
1071
-
1072
- # Create a mask for the upper triangle
1073
- mask = np.triu(np.ones_like(corr, dtype=bool))
1074
-
1075
- # Plot setup
1076
- size = max(10, numeric_df.shape[1])
1077
- plt.figure(figsize=(size, size * 0.8))
1078
-
1079
- annot_bool = numeric_df.shape[1] <= 20
1080
- sns.heatmap(
1081
- corr,
1082
- mask=mask,
1083
- annot=annot_bool,
1084
- cmap='coolwarm',
1085
- fmt=".2f",
1086
- cbar_kws={"shrink": 0.8},
1087
- vmin=-1, # Anchors minimum color to -1
1088
- vmax=1, # Anchors maximum color to 1
1089
- center=0 # Ensures 0 corresponds to the neutral color (white)
1090
- )
1091
-
1092
- # add suffix to title
1093
- full_plot_title = f"{plot_title} - {method.title()} Correlation Heatmap"
1094
-
1095
- plt.title(full_plot_title)
1096
- plt.xticks(rotation=45, ha='right')
1097
- plt.yticks(rotation=0)
1098
-
1099
- plt.tight_layout()
1100
-
1101
- if save_dir:
1102
- save_path = make_fullpath(save_dir, make=True)
1103
- # sanitize the plot title to save the file
1104
- sanitized_plot_title = sanitize_filename(plot_title)
1105
- plot_filename = sanitized_plot_title + ".svg"
1106
-
1107
- full_path = save_path / plot_filename
1108
-
1109
- plt.savefig(full_path, bbox_inches="tight", format='svg')
1110
- _LOGGER.info(f"Saved correlation heatmap: '{plot_filename}'")
1111
-
1112
- plt.show()
1113
- plt.close()
1114
-
1115
-
1116
- def clip_outliers_single(
1117
- df: pd.DataFrame,
1118
- column: str,
1119
- min_val: float,
1120
- max_val: float
1121
- ) -> Union[pd.DataFrame, None]:
1122
- """
1123
- Clips values in the specified numeric column to the range [min_val, max_val],
1124
- and returns a new DataFrame where the original column is replaced by the clipped version.
1125
-
1126
- Args:
1127
- df (pd.DataFrame): The input DataFrame.
1128
- column (str): The name of the column to clip.
1129
- min_val (float): Minimum allowable value; values below are clipped to this.
1130
- max_val (float): Maximum allowable value; values above are clipped to this.
1131
-
1132
- Returns:
1133
- pd.DataFrame: A new DataFrame with the specified column clipped in place.
1134
-
1135
- None: if a problem with the dataframe column occurred.
1136
- """
1137
- if column not in df.columns:
1138
- _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
1139
- return None
1140
-
1141
- if not pd.api.types.is_numeric_dtype(df[column]):
1142
- _LOGGER.warning(f"Column '{column}' must be numeric.")
1143
- return None
1144
-
1145
- new_df = df.copy(deep=True)
1146
- new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)
1147
-
1148
- _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
1149
- return new_df
1150
-
1151
-
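A short usage sketch for `clip_outliers_single`; the DataFrame is invented and the function is assumed to be imported from this module.

import pandas as pd

df = pd.DataFrame({"age": [21, 35, 150, -5, 42]})

clipped = clip_outliers_single(df, column="age", min_val=0, max_val=120)
if clipped is not None:  # None is returned for a missing or non-numeric column
    print(clipped["age"].tolist())  # [21, 35, 120, 0, 42]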
1152
- def clip_outliers_multi(
1153
- df: pd.DataFrame,
1154
- clip_dict: Union[Dict[str, Tuple[int, int]], Dict[str, Tuple[float, float]]],
1155
- verbose: bool=False
1156
- ) -> pd.DataFrame:
1157
- """
1158
- Clips values in multiple specified numeric columns to given [min, max] ranges,
1159
- updating values (deep copy) and skipping invalid entries.
1160
-
1161
- Args:
1162
- df (pd.DataFrame): The input DataFrame.
1163
- clip_dict (dict): A dictionary where keys are column names and values are (min_val, max_val) tuples.
1164
- verbose (bool): If True, prints the clipped range for each column.
1165
-
1166
- Returns:
1167
- pd.DataFrame: A new DataFrame with specified columns clipped.
1168
-
1169
- Notes:
1170
- - Invalid specifications (missing column, non-numeric type, wrong tuple length)
1171
- will be reported but skipped.
1172
- """
1173
- new_df = df.copy()
1174
- skipped_columns = []
1175
- clipped_columns = 0
1176
-
1177
- for col, bounds in clip_dict.items():
1178
- try:
1179
- if col not in df.columns:
1180
- _LOGGER.error(f"Column '{col}' not found in DataFrame.")
1181
- raise ValueError()
1182
-
1183
- if not pd.api.types.is_numeric_dtype(df[col]):
1184
- _LOGGER.error(f"Column '{col}' is not numeric.")
1185
- raise TypeError()
1186
-
1187
- if not (isinstance(bounds, tuple) and len(bounds) == 2):
1188
- _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
1189
- raise ValueError()
1190
-
1191
- min_val, max_val = bounds
1192
- new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
1193
- if verbose:
1194
- print(f"Clipped '{col}' to range [{min_val}, {max_val}].")
1195
- clipped_columns += 1
1196
-
1197
- except Exception as e:
1198
- skipped_columns.append((col, str(e)))
1199
- continue
1200
-
1201
- _LOGGER.info(f"Clipped {clipped_columns} columns.")
1202
-
1203
- if skipped_columns:
1204
- _LOGGER.warning("Skipped columns:")
1205
- for col, msg in skipped_columns:
1206
- print(f" - {col}")
1207
-
1208
- return new_df
1209
-
1210
-
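A hedged sketch of `clip_outliers_multi` with one intentionally invalid entry to show the skip behavior; the data are made up.

import pandas as pd

df = pd.DataFrame({
    "age": [21, 150, -5],
    "income": [30_000, 2_000_000, 45_000],
    "name": ["a", "b", "c"],  # non-numeric: reported and skipped
})

clipped = clip_outliers_multi(
    df,
    clip_dict={"age": (0, 120), "income": (0, 500_000), "name": (0, 1)},
    verbose=True,
)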
1211
- def drop_outlier_samples(
1212
- df: pd.DataFrame,
1213
- bounds_dict: Dict[str, Tuple[Union[int, float], Union[int, float]]],
1214
- drop_on_nulls: bool = False,
1215
- verbose: bool = True
1216
- ) -> pd.DataFrame:
1217
- """
1218
- Drops entire rows where values in specified numeric columns fall outside
1219
- a given [min, max] range.
1220
-
1221
- This function processes a copy of the DataFrame, ensuring the original is
1222
- not modified. It skips columns with invalid specifications.
1223
-
1224
- Args:
1225
- df (pd.DataFrame): The input DataFrame.
1226
- bounds_dict (dict): A dictionary where keys are column names and values
1227
- are (min_val, max_val) tuples defining the valid range.
1228
- drop_on_nulls (bool): If True, rows with NaN/None in a checked column
1229
- will also be dropped. If False, NaN/None are ignored.
1230
- verbose (bool): If True, prints the number of rows dropped for each column.
1231
-
1232
- Returns:
1233
- pd.DataFrame: A new DataFrame with the outlier rows removed.
1234
-
1235
- Notes:
1236
- - Invalid specifications (e.g., missing column, non-numeric type,
1237
- incorrectly formatted bounds) will be reported and skipped.
1238
- """
1239
- new_df = df.copy()
1240
- skipped_columns: List[Tuple[str, str]] = []
1241
- initial_rows = len(new_df)
1242
-
1243
- for col, bounds in bounds_dict.items():
1244
- try:
1245
- # --- Validation Checks ---
1246
- if col not in df.columns:
1247
- _LOGGER.error(f"Column '{col}' not found in DataFrame.")
1248
- raise ValueError()
1249
-
1250
- if not pd.api.types.is_numeric_dtype(df[col]):
1251
- _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
1252
- raise TypeError()
1253
-
1254
- if not (isinstance(bounds, tuple) and len(bounds) == 2):
1255
- _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
1256
- raise ValueError()
1257
-
1258
- # --- Filtering Logic ---
1259
- min_val, max_val = bounds
1260
- rows_before_drop = len(new_df)
1261
-
1262
- # Create the base mask for values within the specified range
1263
- # .between() is inclusive and evaluates to False for NaN
1264
- mask_in_bounds = new_df[col].between(min_val, max_val)
1265
-
1266
- if drop_on_nulls:
1267
- # Keep only rows that are within bounds.
1268
- # Since mask_in_bounds is False for NaN, nulls are dropped.
1269
- final_mask = mask_in_bounds
1270
- else:
1271
- # Keep rows that are within bounds OR are null.
1272
- mask_is_null = new_df[col].isnull()
1273
- final_mask = mask_in_bounds | mask_is_null
1274
-
1275
- # Apply the final mask
1276
- new_df = new_df[final_mask]
1277
-
1278
- rows_after_drop = len(new_df)
1279
-
1280
- if verbose:
1281
- dropped_count = rows_before_drop - rows_after_drop
1282
- if dropped_count > 0:
1283
- print(
1284
- f" - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
1285
- )
1286
-
1287
- except (ValueError, TypeError) as e:
1288
- skipped_columns.append((col, str(e)))
1289
- continue
1290
-
1291
- total_dropped = initial_rows - len(new_df)
1292
- _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")
1293
-
1294
- if skipped_columns:
1295
- _LOGGER.warning("Skipped the following columns due to errors:")
1296
- for col, msg in skipped_columns:
1297
- # Only print the column name for cleaner output as the error was already logged
1298
- print(f" - {col}")
1299
-
1300
- # if new_df is a series, convert to dataframe
1301
- if isinstance(new_df, pd.Series):
1302
- new_df = new_df.to_frame()
1303
-
1304
- return new_df
1305
-
1306
-
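An illustrative sketch of `drop_outlier_samples` showing how nulls are treated when drop_on_nulls is left at False; the values are invented.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "ph": [6.8, 7.2, 14.5, np.nan],
    "temp_c": [21.0, 19.5, 80.0, 22.0],
})

# Row 2 violates both bounds and is dropped; the NaN in 'ph' is kept
# because drop_on_nulls defaults to False.
filtered = drop_outlier_samples(df, bounds_dict={"ph": (0, 14), "temp_c": (-10, 60)})
print(len(filtered))  # 3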
1307
- def match_and_filter_columns_by_regex(
1308
- df: pd.DataFrame,
1309
- pattern: str,
1310
- case_sensitive: bool = False,
1311
- escape_pattern: bool = False
1312
- ) -> Tuple[pd.DataFrame, List[str]]:
1313
- """
1314
- Return a tuple of (filtered DataFrame, matched column names) based on a regex pattern.
1315
-
1316
- Parameters:
1317
- df (pd.DataFrame): The DataFrame to search.
1318
- pattern (str): The regex pattern to match column names (use a raw string).
1319
- case_sensitive (bool): Whether matching is case-sensitive.
1320
- escape_pattern (bool): If True, the pattern is escaped with `re.escape()` to treat it literally.
1321
-
1322
- Returns:
1323
- (Tuple[pd.DataFrame, list[str]]): A DataFrame filtered to matched columns, and a list of matching column names.
1324
- """
1325
- if escape_pattern:
1326
- pattern = re.escape(pattern)
1327
-
1328
- mask = df.columns.str.contains(pattern, case=case_sensitive, regex=True)
1329
- matched_columns = df.columns[mask].to_list()
1330
- filtered_df = df.loc[:, mask]
1331
-
1332
- _LOGGER.info(f"{len(matched_columns)} columns match the regex pattern '{pattern}'.")
1333
-
1334
- # if filtered df is a series, convert to dataframe
1335
- if isinstance(filtered_df, pd.Series):
1336
- filtered_df = filtered_df.to_frame()
1337
-
1338
- return filtered_df, matched_columns
1339
-
1340
-
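A quick sketch of `match_and_filter_columns_by_regex`; the column names are illustrative.

import pandas as pd

df = pd.DataFrame({"sensor_1": [0.1], "sensor_2": [0.2], "label": [1]})

subset, matched = match_and_filter_columns_by_regex(df, pattern=r"^sensor")
print(matched)       # ['sensor_1', 'sensor_2']
print(subset.shape)  # (1, 2)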
1341
- def standardize_percentages(
1342
- df: pd.DataFrame,
1343
- columns: list[str],
1344
- treat_one_as_proportion: bool = True,
1345
- round_digits: int = 2,
1346
- verbose: bool=True
1347
- ) -> pd.DataFrame:
1348
- """
1349
- Standardizes numeric columns containing mixed-format percentages.
1350
-
1351
- This function cleans columns where percentages might be entered as whole
1352
- numbers (e.g., 55) or as proportions (e.g., 0.55). It assumes values
1353
- between 0 and 1 are proportions and multiplies them by 100.
1354
-
1355
- Args:
1356
- df (pd.DataFrame): The input pandas DataFrame.
1357
- columns (list[str]): A list of column names to standardize.
1358
- treat_one_as_proportion (bool):
1359
- - If True (default): The value `1` is treated as a proportion and converted to `100%`.
1360
- - If False: The value `1` is treated as `1%`.
1361
- round_digits (int): The number of decimal places to round the final result to.
- verbose (bool): If True, logs which columns were standardized.
1362
-
1363
- Returns:
1364
- (pd.DataFrame):
1365
- A new DataFrame with the specified columns cleaned and standardized.
1366
- """
1367
- df_copy = df.copy()
1368
-
1369
- if df_copy.empty:
1370
- return df_copy
1371
-
1372
- # This helper function contains the core cleaning logic
1373
- def _clean_value(x: float) -> float:
1374
- """Applies the standardization rule to a single value."""
1375
- if pd.isna(x):
1376
- return x
1377
-
1378
- # If treat_one_as_proportion is True, the range for proportions is [0, 1]
1379
- if treat_one_as_proportion and 0 <= x <= 1:
1380
- return x * 100
1381
- # If False, the range for proportions is [0, 1) (1 is excluded)
1382
- elif not treat_one_as_proportion and 0 <= x < 1:
1383
- return x * 100
1384
-
1385
- # Otherwise, the value is assumed to be a correctly formatted percentage
1386
- return x
1387
-
1388
- fixed_columns: list[str] = list()
1389
-
1390
- for col in columns:
1391
- # --- Robustness Checks ---
1392
- if col not in df_copy.columns:
1393
- _LOGGER.warning(f"Column '{col}' not found. Skipping.")
1394
- continue
1395
-
1396
- if not is_numeric_dtype(df_copy[col]):
1397
- _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
1398
- continue
1399
-
1400
- # --- Applying the Logic ---
1401
- # Apply the cleaning function to every value in the column
1402
- df_copy[col] = df_copy[col].apply(_clean_value)
1403
-
1404
- # Round the result
1405
- df_copy[col] = df_copy[col].round(round_digits)
1406
-
1407
- fixed_columns.append(col)
1408
-
1409
- if verbose:
1410
- _LOGGER.info(f"Columns standardized:")
1411
- for fixed_col in fixed_columns:
1412
- print(f" '{fixed_col}'")
1413
-
1414
- return df_copy
1415
-
1416
-
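A small sketch of `standardize_percentages` with the default treat_one_as_proportion=True; the column is hypothetical.

import pandas as pd

df = pd.DataFrame({"moisture_pct": [0.55, 55.0, 1.0, 87.5]})

fixed = standardize_percentages(df, columns=["moisture_pct"])
print(fixed["moisture_pct"].tolist())  # [55.0, 55.0, 100.0, 87.5]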
1417
- def reconstruct_one_hot(
1418
- df: pd.DataFrame,
1419
- features_to_reconstruct: List[Union[str, Tuple[str, Optional[str]]]],
1420
- separator: str = '_',
1421
- baseline_category_name: Optional[str] = "Other",
1422
- drop_original: bool = True,
1423
- verbose: bool = True
1424
- ) -> pd.DataFrame:
1425
- """
1426
- Reconstructs original categorical columns from a one-hot encoded DataFrame.
1427
-
1428
- This function identifies groups of one-hot encoded columns based on a common
1429
- prefix (base feature name) and a separator. It then collapses each group
1430
- into a single column containing the categorical value.
1431
-
1432
- Args:
1433
- df (pd.DataFrame):
1434
- The input DataFrame with one-hot encoded columns.
1435
- features_to_reconstruct (List[str | Tuple[str, str | None]]):
1436
- A list defining the features to reconstruct. This list can contain:
1437
-
1438
- - A string: (e.g., "Color")
1439
- This reconstructs the feature 'Color' and assumes all-zero rows represent the baseline category ("Other" by default).
1440
- - A tuple: (e.g., ("Pet", "Dog"))
1441
- This reconstructs 'Pet' and maps all-zero rows to the baseline category "Dog".
1442
- - A tuple with None: (e.g., ("Size", None))
1443
- This reconstructs 'Size' and maps all-zero rows to NaN.
1444
- Example:
1445
- [
1446
- "Mood", # All-zeros -> "Other"
1447
- ("Color", "Red"), # All-zeros -> "Red"
1448
- ("Size", None) # All-zeros -> NaN
1449
- ]
1450
- separator (str):
1451
- The character separating the base name from the categorical value in
1452
- the column names (e.g., '_' in 'B_a').
1453
- baseline_category_name (str | None):
1454
- The default baseline category assigned to all-zero rows when a feature is listed without an explicit baseline.
1455
- drop_original (bool):
1456
- If True, the original one-hot encoded columns will be dropped from
1457
- the returned DataFrame.
1458
-
1459
- Returns:
1460
- pd.DataFrame:
1461
- A new DataFrame with the specified one-hot encoded features
1462
- reconstructed into single categorical columns.
1463
-
1464
- <br>
1465
-
1466
- ## Note:
1467
-
1468
- This function is designed to be robust, but users should be aware of two key edge cases:
1469
-
1470
- 1. **Ambiguous Base Feature Prefixes**: If `features_to_reconstruct` contains base names where one is a prefix of another (e.g., `['feat', 'feat_ext']`), the order is critical. The function will match columns greedily. To avoid incorrect grouping, always list the **most specific base names first** (e.g., `['feat_ext', 'feat']`).
1471
-
1472
- 2. **Malformed One-Hot Data**: If a row contains multiple `1`s within the same feature group (e.g., both `B_a` and `B_c` are `1`), the function will not raise an error. It uses `.idxmax()`, which returns the first column that contains the maximum value. This means it will silently select the first category it encounters and ignore the others, potentially masking an upstream data issue.
1473
- """
1474
- if not isinstance(df, pd.DataFrame):
1475
- _LOGGER.error("Input must be a pandas DataFrame.")
1476
- raise TypeError()
1477
-
1478
- if not (baseline_category_name is None or isinstance(baseline_category_name, str)):
1479
- _LOGGER.error("The baseline_category must be None or a string.")
1480
- raise TypeError()
1481
-
1482
- new_df = df.copy()
1483
- all_ohe_cols_to_drop = []
1484
- reconstructed_count = 0
1485
-
1486
- # --- 1. Parse and validate the reconstruction config ---
1487
- # This normalizes the input into a clean {base_name: baseline_val} dict
1488
- reconstruction_config: Dict[str, Optional[str]] = {}
1489
- try:
1490
- for item in features_to_reconstruct:
1491
- if isinstance(item, str):
1492
- # Case 1: "Color"
1493
- base_name = item
1494
- baseline_val = baseline_category_name
1495
- elif isinstance(item, tuple) and len(item) == 2:
1496
- # Case 2: ("Pet", "dog") or ("Size", None)
1497
- base_name, baseline_val = item
1498
- if not (isinstance(base_name, str) and (isinstance(baseline_val, str) or baseline_val is None)):
1499
- _LOGGER.error(f"Invalid tuple format for '{item}'. Must be (str, str|None).")
1500
- raise ValueError()
1501
- else:
1502
- _LOGGER.error(f"Invalid item '{item}'. Must be str or (str, str|None) tuple.")
1503
- raise ValueError()
1504
-
1505
- if base_name in reconstruction_config and verbose:
1506
- _LOGGER.warning(f"Duplicate entry for '{base_name}' found. Using the last provided configuration.")
1507
-
1508
- reconstruction_config[base_name] = baseline_val
1509
-
1510
- except Exception as e:
1511
- _LOGGER.error(f"Failed to parse 'features_to_reconstruct' argument: {e}")
1512
- raise ValueError("Invalid configuration for 'features_to_reconstruct'.") from e
1513
-
1514
- _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_config)} one-hot encoded feature(s).")
1515
-
1516
- # Main logic
1517
- for base_name, baseline_category in reconstruction_config.items():
1518
- # Regex to find all columns belonging to this base feature.
1519
- pattern = f"^{re.escape(base_name)}{re.escape(separator)}"
1520
-
1521
- # Find matching columns
1522
- ohe_cols = [col for col in df.columns if re.match(pattern, col)]
1523
-
1524
- if not ohe_cols:
1525
- _LOGGER.warning(f"No one-hot encoded columns found for base feature '{base_name}'. Skipping.")
1526
- continue
1527
-
1528
- # For each row, find the column name with the maximum value (which is 1)
1529
- reconstructed_series = new_df[ohe_cols].idxmax(axis=1) # type: ignore
1530
-
1531
- # Extract the categorical value (the suffix) from the column name
1532
- # Use n=1 in split to handle cases where the category itself might contain the separator
1533
- new_column_values = reconstructed_series.str.split(separator, n=1).str[1] # type: ignore
1534
-
1535
- # Handle rows where all OHE columns were 0 (e.g., original value was NaN or a dropped baseline).
1536
- all_zero_mask = new_df[ohe_cols].sum(axis=1) == 0 # type: ignore
1537
-
1538
- if baseline_category is not None:
1539
- # A baseline category was provided
1540
- new_column_values.loc[all_zero_mask] = baseline_category
1541
- else:
1542
- # No baseline provided: assign NaN
1543
- new_column_values.loc[all_zero_mask] = np.nan # type: ignore
1544
-
1545
- if verbose:
1546
- print(f" - Mapped 'all-zero' rows for '{base_name}' to baseline: '{baseline_category}'.")
1547
-
1548
- # Assign the new reconstructed column to the DataFrame
1549
- new_df[base_name] = new_column_values
1550
-
1551
- all_ohe_cols_to_drop.extend(ohe_cols)
1552
- reconstructed_count += 1
1553
- if verbose:
1554
- print(f" - Reconstructed '{base_name}' from {len(ohe_cols)} columns.")
1555
-
1556
- # Cleanup
1557
- if drop_original and all_ohe_cols_to_drop:
1558
- # Drop the original OHE columns, ensuring no duplicates in the drop list
1559
- unique_cols_to_drop = list(set(all_ohe_cols_to_drop))
1560
- new_df.drop(columns=unique_cols_to_drop, inplace=True)
1561
- _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original one-hot encoded columns.")
1562
-
1563
- _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
1564
-
1565
- return new_df
1566
-
1567
-
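A usage sketch for `reconstruct_one_hot` covering both the default baseline and an explicit None baseline; the toy columns are invented.

import pandas as pd

df = pd.DataFrame({
    "Color_Red":  [1, 0, 0],
    "Color_Blue": [0, 1, 0],
    "Size_L":     [0, 0, 1],
    "Size_M":     [1, 0, 0],
})

# Row 2 has no Color flag set, so it falls back to the default baseline "Other";
# row 1 has no Size flag set and becomes NaN because its baseline is None.
restored = reconstruct_one_hot(df, features_to_reconstruct=["Color", ("Size", None)])
print(restored["Color"].tolist())  # ['Red', 'Blue', 'Other']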
1568
- def reconstruct_binary(
1569
- df: pd.DataFrame,
1570
- reconstruction_map: Dict[str, Tuple[str, Any, Any]],
1571
- drop_original: bool = True,
1572
- verbose: bool = True
1573
- ) -> pd.DataFrame:
1574
- """
1575
- Reconstructs new categorical columns from existing binary (0/1) columns.
1576
-
1577
- Used to reverse a binary encoding by mapping 0 and 1 back to
1578
- descriptive categorical labels.
1579
-
1580
- Args:
1581
- df (pd.DataFrame):
1582
- The input DataFrame.
1583
- reconstruction_map (Dict[str, Tuple[str, Any, Any]]):
1584
- A dictionary defining the reconstructions.
1585
- Format:
1586
- { "new_col_name": ("source_col_name", "label_for_0", "label_for_1") }
1587
- Example:
1588
- {
1589
- "Sex": ("Sex_male", "Female", "Male"),
1590
- "Smoker": ("Is_Smoker", "No", "Yes")
1591
- }
1592
- drop_original (bool):
1593
- If True, the original binary source columns (e.g., "Sex_male")
1594
- will be dropped from the returned DataFrame.
1595
- verbose (bool):
1596
- If True, prints the details of each reconstruction.
1597
-
1598
- Returns:
1599
- pd.DataFrame:
1600
- A new DataFrame with the reconstructed categorical columns.
1601
-
1602
- Raises:
1603
- TypeError: If `df` is not a pandas DataFrame.
1604
- ValueError: If `reconstruction_map` is not a dictionary or a
1605
- configuration is invalid (e.g., column name collision).
1606
-
1607
- Notes:
1608
- - The function operates on a copy of the DataFrame.
1609
- - Rows with `NaN` in the source column will have `NaN` in the
1610
- new column.
1611
- - Values in the source column other than 0 or 1 (e.g., 2) will
1612
- result in `NaN` in the new column.
1613
- """
1614
- if not isinstance(df, pd.DataFrame):
1615
- _LOGGER.error("Input must be a pandas DataFrame.")
1616
- raise TypeError()
1617
-
1618
- if not isinstance(reconstruction_map, dict):
1619
- _LOGGER.error("`reconstruction_map` must be a dictionary with the required format.")
1620
- raise ValueError()
1621
-
1622
- new_df = df.copy()
1623
- source_cols_to_drop: List[str] = []
1624
- reconstructed_count = 0
1625
-
1626
- _LOGGER.info(f"Attempting to reconstruct {len(reconstruction_map)} binary feature(s).")
1627
-
1628
- for new_col_name, config in reconstruction_map.items():
1629
-
1630
- # --- 1. Validation ---
1631
- if not (isinstance(config, tuple) and len(config) == 3):
1632
- _LOGGER.error(f"Config for '{new_col_name}' is invalid. Must be a 3-item tuple. Skipping.")
1633
- raise ValueError()
1634
-
1635
- source_col, label_for_0, label_for_1 = config
1636
-
1637
- if source_col not in new_df.columns:
1638
- _LOGGER.error(f"Source column '{source_col}' for new column '{new_col_name}' not found. Skipping.")
1639
- raise ValueError()
1640
-
1641
- if new_col_name in new_df.columns and new_col_name != source_col and verbose:
1642
- _LOGGER.warning(f"New column '{new_col_name}' already exists and will be overwritten.")
1643
-
1644
- # --- 2. Reconstruction ---
1645
- mapping_dict = {0: label_for_0, 1: label_for_1}
1646
- new_df[new_col_name] = new_df[source_col].map(mapping_dict)
1647
-
1648
- # --- 3. Logging/Tracking ---
1649
- # Only mark source for dropping if it's NOT the same as the new column
1650
- if source_col != new_col_name:
1651
- source_cols_to_drop.append(source_col)
1652
-
1653
- reconstructed_count += 1
1654
- if verbose:
1655
- print(f" - Reconstructed '{new_col_name}' from '{source_col}' (0='{label_for_0}', 1='{label_for_1}').")
1656
-
1657
- # --- 4. Cleanup ---
1658
- if drop_original and source_cols_to_drop:
1659
- unique_cols_to_drop = list(set(source_cols_to_drop))
1660
- new_df.drop(columns=unique_cols_to_drop, inplace=True)
1661
- _LOGGER.info(f"Dropped {len(unique_cols_to_drop)} original binary source column(s).")
1662
-
1663
- _LOGGER.info(f"Successfully reconstructed {reconstructed_count} feature(s).")
1664
-
1665
- return new_df
1666
-
1667
-
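A minimal sketch of `reconstruct_binary`; the column names mirror the docstring example.

import pandas as pd

df = pd.DataFrame({"Sex_male": [1, 0, 1], "Is_Smoker": [0, 1, 0]})

restored = reconstruct_binary(
    df,
    reconstruction_map={
        "Sex": ("Sex_male", "Female", "Male"),  # 0 -> 'Female', 1 -> 'Male'
        "Smoker": ("Is_Smoker", "No", "Yes"),
    },
)
print(restored["Sex"].tolist())  # ['Male', 'Female', 'Male']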
1668
- def reconstruct_multibinary(
1669
- df: pd.DataFrame,
1670
- pattern: str,
1671
- pos_label: str = "Yes",
1672
- neg_label: str = "No",
1673
- case_sensitive: bool = False,
1674
- verbose: bool = True
1675
- ) -> Tuple[pd.DataFrame, List[str]]:
1676
- """
1677
- Identifies binary columns matching a regex pattern and converts their numeric
1678
- values (0/1) into categorical string labels (e.g., "No"/"Yes").
1679
-
1680
- This allows mass-labeling of binary features so they are treated as proper
1681
- categorical variables with meaningful keys during subsequent encoding steps.
1682
-
1683
- Args:
1684
- df (pd.DataFrame): The input DataFrame.
1685
- pattern (str): Regex pattern to identify the group of binary columns.
1686
- pos_label (str): The label to assign to 1 or True (default "Yes").
1687
- neg_label (str): The label to assign to 0 or False (default "No").
1688
- case_sensitive (bool): If True, regex matching is case-sensitive.
1689
- verbose (bool): If True, prints a summary of the operation.
1690
-
1691
- Returns:
1692
- Tuple[pd.DataFrame, List[str]]:
1693
- A new DataFrame with the matched columns converted to string labels.
1694
- - A list of the column names that were modified.
1695
- """
1696
- if not isinstance(df, pd.DataFrame):
1697
- _LOGGER.error("Input must be a pandas DataFrame.")
1698
- raise TypeError()
1699
-
1700
- new_df = df.copy()
1701
-
1702
- # 1. Find columns matching the regex
1703
- mask = new_df.columns.str.contains(pattern, case=case_sensitive, regex=True)
1704
- target_columns = new_df.columns[mask].to_list()
1705
-
1706
- if not target_columns:
1707
- _LOGGER.warning(f"No columns found matching pattern '{pattern}'. Returning original DataFrame.")
1708
- return new_df, list()
1709
-
1710
- # 2. Define robust mapping (handles ints, floats, and booleans)
1711
- # Note: Any value not in this map will become NaN
1712
- mapping_dict = {
1713
- 0: neg_label,
1714
- 0.0: neg_label,
1715
- False: neg_label,
1716
- 1: pos_label,
1717
- 1.0: pos_label,
1718
- True: pos_label
1719
- }
1720
-
1721
- converted_count = 0
1722
-
1723
- # 3. Apply mapping
1724
- for col in target_columns:
1725
- # Map numeric or object dtype columns; any value not covered by the mapping becomes NaN
1726
- if is_numeric_dtype(new_df[col]) or is_object_dtype(new_df[col]):
1727
- # We cast to object implicitly by mapping to strings
1728
- new_df[col] = new_df[col].map(mapping_dict)
1729
- converted_count += 1
1730
-
1731
- if verbose:
1732
- _LOGGER.info(f"Reconstructed {converted_count} binary columns matching '{pattern}'.")
1733
-
1734
- return new_df, target_columns
1735
-
1736
-
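A brief sketch of `reconstruct_multibinary`; the prefix pattern and columns are illustrative.

import pandas as pd

df = pd.DataFrame({"has_garden": [1, 0], "has_pool": [0, 1], "price": [100, 200]})

labeled, changed = reconstruct_multibinary(df, pattern=r"^has_")
print(changed)                         # ['has_garden', 'has_pool']
print(labeled["has_garden"].tolist())  # ['Yes', 'No']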
1737
- def finalize_feature_schema(
1738
- df_features: pd.DataFrame,
1739
- categorical_mappings: Optional[Dict[str, Dict[str, int]]]
1740
- ) -> FeatureSchema:
1741
- """
1742
- Analyzes the final features DataFrame to create a definitive schema.
1743
-
1744
- This function is the "single source of truth" for column order
1745
- and type (categorical vs. continuous) for the entire ML pipeline.
1746
-
1747
- It should be called at the end of the feature engineering process.
1748
-
1749
- Args:
1750
- df_features (pd.DataFrame):
1751
- The final, processed DataFrame containing *only* feature columns
1752
- in the exact order they will be fed to the model.
1753
- categorical_mappings (Dict[str, Dict[str, int]] | None):
1754
- The mappings dictionary generated by
1755
- `encode_categorical_features`. Can be None if no
1756
- categorical features exist.
1757
-
1758
- Returns:
1759
- FeatureSchema: A NamedTuple containing all necessary metadata for the pipeline.
1760
- """
1761
- feature_names: List[str] = df_features.columns.to_list()
1762
-
1763
- # Intermediate lists for building
1764
- continuous_feature_names_list: List[str] = []
1765
- categorical_feature_names_list: List[str] = []
1766
- categorical_index_map_dict: Dict[int, int] = {}
1767
-
1768
- # _LOGGER.info("Finalizing feature schema...")
1769
-
1770
- if categorical_mappings:
1771
- # --- Categorical features are present ---
1772
- categorical_names_set = set(categorical_mappings.keys())
1773
-
1774
- for index, name in enumerate(feature_names):
1775
- if name in categorical_names_set:
1776
- # This is a categorical feature
1777
- cardinality = len(categorical_mappings[name])
1778
- categorical_index_map_dict[index] = cardinality
1779
- categorical_feature_names_list.append(name)
1780
- else:
1781
- # This is a continuous feature
1782
- continuous_feature_names_list.append(name)
1783
-
1784
- # Use the populated dict, or None if it's empty
1785
- final_index_map = categorical_index_map_dict if categorical_index_map_dict else None
1786
-
1787
- else:
1788
- # --- No categorical features ---
1789
- _LOGGER.info("No categorical mappings provided. Treating all features as continuous.")
1790
- continuous_feature_names_list = list(feature_names)
1791
- # categorical_feature_names_list remains empty
1792
- # categorical_index_map_dict remains empty
1793
- final_index_map = None # Explicitly set to None to match Optional type
1794
-
1795
- _LOGGER.info(f"Schema created: {len(continuous_feature_names_list)} continuous, {len(categorical_feature_names_list)} categorical.")
1796
-
1797
- # Create the final immutable instance
1798
- schema_instance = FeatureSchema(
1799
- feature_names=tuple(feature_names),
1800
- continuous_feature_names=tuple(continuous_feature_names_list),
1801
- categorical_feature_names=tuple(categorical_feature_names_list),
1802
- categorical_index_map=final_index_map,
1803
- categorical_mappings=categorical_mappings
1804
- )
1805
-
1806
- return schema_instance
1807
-
1808
-
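A sketch of `finalize_feature_schema` on a two-column feature frame; the mapping is assumed to come from an earlier categorical-encoding step.

import pandas as pd

# Final feature frame in model order: 'weight' is continuous, 'color' is
# already integer-encoded according to the mapping below.
df_features = pd.DataFrame({"weight": [1.2, 3.4], "color": [0, 1]})
mappings = {"color": {"red": 0, "blue": 1}}

schema = finalize_feature_schema(df_features, categorical_mappings=mappings)
print(schema.continuous_feature_names)  # ('weight',)
print(schema.categorical_index_map)     # {1: 2} -> column index 1, cardinality 2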
1809
- def apply_feature_schema(
1810
- df: pd.DataFrame,
1811
- schema: FeatureSchema,
1812
- targets: Optional[List[str]] = None,
1813
- unknown_value: int = 99999,
1814
- verbose: bool = True
1815
- ) -> pd.DataFrame:
1816
- """
1817
- Aligns the input DataFrame with the provided FeatureSchema.
1818
-
1819
- This function aligns data for inference/fine-tuning by enforcing the schema's
1820
- structure and encoding.
1821
-
1822
- Args:
1823
- df (pd.DataFrame): The input DataFrame.
1824
- schema (FeatureSchema): The schema defining feature names, types, and mappings.
1825
- targets (list[str] | None): Optional list of target column names.
1826
- unknown_value (int): Integer value to assign to unknown categorical levels.
1827
- Defaults to 99999 to avoid collision with existing categories.
1828
- verbose (bool): If True, logs info about dropped extra columns.
1829
-
1830
- Returns:
1831
- pd.DataFrame: A new DataFrame with the exact column order and encoding defined by the schema.
1832
-
1833
- Raises:
1834
- ValueError: If any required feature or target column is missing.
1835
- """
1836
- # 1. Setup
1837
- df_processed = df.copy()
1838
- targets = targets if targets is not None else []
1839
-
1840
- # 2. Validation: Strict Column Presence
1841
- missing_features = [col for col in schema.feature_names if col not in df_processed.columns]
1842
- if missing_features:
1843
- _LOGGER.error(f"Schema Mismatch: Missing required features: {missing_features}")
1844
- raise ValueError()
1845
-
1846
- # target columns should not be part of feature columns
1847
- if targets:
1848
- overlapping_columns = set(schema.feature_names).intersection(set(targets))
1849
- if overlapping_columns:
1850
- _LOGGER.error(f"Schema Mismatch: Target columns overlap with feature columns: {overlapping_columns}")
1851
- raise ValueError()
1852
-
1853
- # targets were provided, check their presence
1854
- missing_targets = [col for col in targets if col not in df_processed.columns]
1855
- if missing_targets:
1856
- _LOGGER.error(f"Target Mismatch: Missing target columns: {missing_targets}")
1857
- raise ValueError()
1858
-
1859
- # 3. Apply Categorical Encoding
1860
- if schema.categorical_feature_names and schema.categorical_mappings:
1861
- for col_name in schema.categorical_feature_names:
1862
- # Should never happen due to schema construction, but double-check and raise
1863
- if col_name not in schema.categorical_mappings:
1864
- _LOGGER.error(f"Schema Inconsistency: No mapping found for categorical feature '{col_name}'.")
1865
- raise ValueError()
1866
-
1867
- mapping = schema.categorical_mappings[col_name]
1868
-
1869
- # Apply mapping (unknowns become NaN)
1870
- df_processed[col_name] = df_processed[col_name].astype(str).map(mapping)
1871
-
1872
- # Handle Unknown Categories
1873
- if df_processed[col_name].isnull().any():
1874
- n_missing = df_processed[col_name].isnull().sum()
1875
- _LOGGER.warning(f"Feature '{col_name}': Found {n_missing} unknown categories. Mapping to {unknown_value}.")
1876
-
1877
- # Fill unknowns with the specified integer
1878
- df_processed[col_name] = df_processed[col_name].fillna(unknown_value)
1879
-
1880
- df_processed[col_name] = df_processed[col_name].astype(int)
1881
-
1882
- # 4. Reorder and Filter
1883
- final_column_order = list(schema.feature_names) + targets
1884
-
1885
- extra_cols = set(df_processed.columns) - set(final_column_order)
1886
- if extra_cols:
1887
- _LOGGER.info(f"Dropping {len(extra_cols)} extra columns not present in schema.")
1888
- if verbose:
1889
- for extra_column in extra_cols:
1890
- print(f" - Dropping column: '{extra_column}'")
1891
-
1892
- df_final = df_processed[final_column_order]
1893
-
1894
- _LOGGER.info(f"Schema applied successfully. Final shape: {df_final.shape}")
1895
-
1896
- # df_final should be a dataframe
1897
- if isinstance(df_final, pd.Series):
1898
- df_final = df_final.to_frame()
1899
-
1900
- return df_final
1901
-
1902
-
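A sketch of `apply_feature_schema`, reusing the `schema` object from the previous sketch; the inference frame is invented.

import pandas as pd

df_new = pd.DataFrame({
    "color": ["blue", "green"],  # 'green' was never seen during training
    "extra": [1, 2],             # not in the schema -> dropped
    "weight": [2.0, 4.5],
})

# 'green' is encoded as unknown_value (99999 by default) and the columns
# come back in schema order ('weight', 'color').
aligned = apply_feature_schema(df_new, schema)
print(aligned.columns.tolist())  # ['weight', 'color']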
1903
- def _validate_columns(df: pd.DataFrame, columns: list[str]):
1904
- valid_columns = [column for column in columns if column in df.columns]
1905
- return valid_columns
1906
-
1907
-
1908
- def info():
1909
- _script_info(__all__)