dragon-ml-toolbox 19.14.0__py3-none-any.whl → 20.0.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages' contents as they appear in the public registry.
Files changed (219)
  1. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/METADATA +29 -46
  2. dragon_ml_toolbox-20.0.0.dist-info/RECORD +178 -0
  3. ml_tools/{ETL_cleaning.py → ETL_cleaning/__init__.py} +13 -5
  4. ml_tools/ETL_cleaning/_basic_clean.py +351 -0
  5. ml_tools/ETL_cleaning/_clean_tools.py +128 -0
  6. ml_tools/ETL_cleaning/_dragon_cleaner.py +245 -0
  7. ml_tools/ETL_cleaning/_imprimir.py +13 -0
  8. ml_tools/{ETL_engineering.py → ETL_engineering/__init__.py} +8 -4
  9. ml_tools/ETL_engineering/_dragon_engineering.py +261 -0
  10. ml_tools/ETL_engineering/_imprimir.py +24 -0
  11. ml_tools/{_core/_ETL_engineering.py → ETL_engineering/_transforms.py} +14 -267
  12. ml_tools/{_core → GUI_tools}/_GUI_tools.py +37 -40
  13. ml_tools/{GUI_tools.py → GUI_tools/__init__.py} +7 -5
  14. ml_tools/GUI_tools/_imprimir.py +12 -0
  15. ml_tools/IO_tools/_IO_loggers.py +235 -0
  16. ml_tools/IO_tools/_IO_save_load.py +151 -0
  17. ml_tools/IO_tools/_IO_utils.py +140 -0
  18. ml_tools/{IO_tools.py → IO_tools/__init__.py} +13 -5
  19. ml_tools/IO_tools/_imprimir.py +14 -0
  20. ml_tools/MICE/_MICE_imputation.py +132 -0
  21. ml_tools/{MICE_imputation.py → MICE/__init__.py} +6 -7
  22. ml_tools/{_core/_MICE_imputation.py → MICE/_dragon_mice.py} +243 -322
  23. ml_tools/MICE/_imprimir.py +11 -0
  24. ml_tools/{ML_callbacks.py → ML_callbacks/__init__.py} +12 -4
  25. ml_tools/ML_callbacks/_base.py +101 -0
  26. ml_tools/ML_callbacks/_checkpoint.py +232 -0
  27. ml_tools/ML_callbacks/_early_stop.py +208 -0
  28. ml_tools/ML_callbacks/_imprimir.py +12 -0
  29. ml_tools/ML_callbacks/_scheduler.py +197 -0
  30. ml_tools/{ML_chaining_utilities.py → ML_chain/__init__.py} +8 -3
  31. ml_tools/{_core/_ML_chaining_utilities.py → ML_chain/_chaining_tools.py} +5 -129
  32. ml_tools/ML_chain/_dragon_chain.py +140 -0
  33. ml_tools/ML_chain/_imprimir.py +11 -0
  34. ml_tools/ML_configuration/__init__.py +90 -0
  35. ml_tools/ML_configuration/_base_model_config.py +69 -0
  36. ml_tools/ML_configuration/_finalize.py +366 -0
  37. ml_tools/ML_configuration/_imprimir.py +47 -0
  38. ml_tools/ML_configuration/_metrics.py +593 -0
  39. ml_tools/ML_configuration/_models.py +206 -0
  40. ml_tools/ML_configuration/_training.py +124 -0
  41. ml_tools/ML_datasetmaster/__init__.py +28 -0
  42. ml_tools/ML_datasetmaster/_base_datasetmaster.py +337 -0
  43. ml_tools/{_core/_ML_datasetmaster.py → ML_datasetmaster/_datasetmaster.py} +9 -329
  44. ml_tools/ML_datasetmaster/_imprimir.py +15 -0
  45. ml_tools/{_core/_ML_sequence_datasetmaster.py → ML_datasetmaster/_sequence_datasetmaster.py} +13 -15
  46. ml_tools/{_core/_ML_vision_datasetmaster.py → ML_datasetmaster/_vision_datasetmaster.py} +63 -65
  47. ml_tools/ML_evaluation/__init__.py +53 -0
  48. ml_tools/ML_evaluation/_classification.py +629 -0
  49. ml_tools/ML_evaluation/_feature_importance.py +409 -0
  50. ml_tools/ML_evaluation/_imprimir.py +25 -0
  51. ml_tools/ML_evaluation/_loss.py +92 -0
  52. ml_tools/ML_evaluation/_regression.py +273 -0
  53. ml_tools/{_core/_ML_sequence_evaluation.py → ML_evaluation/_sequence.py} +8 -11
  54. ml_tools/{_core/_ML_vision_evaluation.py → ML_evaluation/_vision.py} +12 -17
  55. ml_tools/{_core → ML_evaluation_captum}/_ML_evaluation_captum.py +11 -38
  56. ml_tools/{ML_evaluation_captum.py → ML_evaluation_captum/__init__.py} +6 -4
  57. ml_tools/ML_evaluation_captum/_imprimir.py +10 -0
  58. ml_tools/{_core → ML_finalize_handler}/_ML_finalize_handler.py +3 -7
  59. ml_tools/ML_finalize_handler/__init__.py +10 -0
  60. ml_tools/ML_finalize_handler/_imprimir.py +8 -0
  61. ml_tools/ML_inference/__init__.py +22 -0
  62. ml_tools/ML_inference/_base_inference.py +166 -0
  63. ml_tools/{_core/_ML_chaining_inference.py → ML_inference/_chain_inference.py} +14 -17
  64. ml_tools/ML_inference/_dragon_inference.py +332 -0
  65. ml_tools/ML_inference/_imprimir.py +11 -0
  66. ml_tools/ML_inference/_multi_inference.py +180 -0
  67. ml_tools/ML_inference_sequence/__init__.py +10 -0
  68. ml_tools/ML_inference_sequence/_imprimir.py +8 -0
  69. ml_tools/{_core/_ML_sequence_inference.py → ML_inference_sequence/_sequence_inference.py} +11 -15
  70. ml_tools/ML_inference_vision/__init__.py +10 -0
  71. ml_tools/ML_inference_vision/_imprimir.py +8 -0
  72. ml_tools/{_core/_ML_vision_inference.py → ML_inference_vision/_vision_inference.py} +15 -19
  73. ml_tools/ML_models/__init__.py +32 -0
  74. ml_tools/{_core/_ML_models_advanced.py → ML_models/_advanced_models.py} +22 -18
  75. ml_tools/ML_models/_base_mlp_attention.py +198 -0
  76. ml_tools/{_core/_models_advanced_base.py → ML_models/_base_save_load.py} +73 -49
  77. ml_tools/ML_models/_dragon_tabular.py +248 -0
  78. ml_tools/ML_models/_imprimir.py +18 -0
  79. ml_tools/ML_models/_mlp_attention.py +134 -0
  80. ml_tools/{_core → ML_models}/_models_advanced_helpers.py +13 -13
  81. ml_tools/ML_models_sequence/__init__.py +10 -0
  82. ml_tools/ML_models_sequence/_imprimir.py +8 -0
  83. ml_tools/{_core/_ML_sequence_models.py → ML_models_sequence/_sequence_models.py} +5 -8
  84. ml_tools/ML_models_vision/__init__.py +29 -0
  85. ml_tools/ML_models_vision/_base_wrapper.py +254 -0
  86. ml_tools/ML_models_vision/_image_classification.py +182 -0
  87. ml_tools/ML_models_vision/_image_segmentation.py +108 -0
  88. ml_tools/ML_models_vision/_imprimir.py +16 -0
  89. ml_tools/ML_models_vision/_object_detection.py +135 -0
  90. ml_tools/ML_optimization/__init__.py +21 -0
  91. ml_tools/ML_optimization/_imprimir.py +13 -0
  92. ml_tools/{_core/_ML_optimization_pareto.py → ML_optimization/_multi_dragon.py} +18 -24
  93. ml_tools/ML_optimization/_single_dragon.py +203 -0
  94. ml_tools/{_core/_ML_optimization.py → ML_optimization/_single_manual.py} +75 -213
  95. ml_tools/{_core → ML_scaler}/_ML_scaler.py +8 -11
  96. ml_tools/ML_scaler/__init__.py +10 -0
  97. ml_tools/ML_scaler/_imprimir.py +8 -0
  98. ml_tools/ML_trainer/__init__.py +20 -0
  99. ml_tools/ML_trainer/_base_trainer.py +297 -0
  100. ml_tools/ML_trainer/_dragon_detection_trainer.py +402 -0
  101. ml_tools/ML_trainer/_dragon_sequence_trainer.py +540 -0
  102. ml_tools/ML_trainer/_dragon_trainer.py +1160 -0
  103. ml_tools/ML_trainer/_imprimir.py +10 -0
  104. ml_tools/{ML_utilities.py → ML_utilities/__init__.py} +14 -6
  105. ml_tools/ML_utilities/_artifact_finder.py +382 -0
  106. ml_tools/ML_utilities/_imprimir.py +16 -0
  107. ml_tools/ML_utilities/_inspection.py +325 -0
  108. ml_tools/ML_utilities/_train_tools.py +205 -0
  109. ml_tools/{ML_vision_transformers.py → ML_vision_transformers/__init__.py} +9 -6
  110. ml_tools/{_core/_ML_vision_transformers.py → ML_vision_transformers/_core_transforms.py} +11 -155
  111. ml_tools/ML_vision_transformers/_imprimir.py +14 -0
  112. ml_tools/ML_vision_transformers/_offline_augmentation.py +159 -0
  113. ml_tools/{_core/_PSO_optimization.py → PSO_optimization/_PSO.py} +58 -15
  114. ml_tools/{PSO_optimization.py → PSO_optimization/__init__.py} +5 -3
  115. ml_tools/PSO_optimization/_imprimir.py +10 -0
  116. ml_tools/SQL/__init__.py +7 -0
  117. ml_tools/{_core/_SQL.py → SQL/_dragon_SQL.py} +7 -11
  118. ml_tools/SQL/_imprimir.py +8 -0
  119. ml_tools/{_core → VIF}/_VIF_factor.py +5 -8
  120. ml_tools/{VIF_factor.py → VIF/__init__.py} +4 -2
  121. ml_tools/VIF/_imprimir.py +10 -0
  122. ml_tools/_core/__init__.py +7 -1
  123. ml_tools/_core/_logger.py +8 -18
  124. ml_tools/_core/_schema_load_ops.py +43 -0
  125. ml_tools/_core/_script_info.py +2 -2
  126. ml_tools/{data_exploration.py → data_exploration/__init__.py} +32 -16
  127. ml_tools/data_exploration/_analysis.py +214 -0
  128. ml_tools/data_exploration/_cleaning.py +566 -0
  129. ml_tools/data_exploration/_features.py +583 -0
  130. ml_tools/data_exploration/_imprimir.py +32 -0
  131. ml_tools/data_exploration/_plotting.py +487 -0
  132. ml_tools/data_exploration/_schema_ops.py +176 -0
  133. ml_tools/{ensemble_evaluation.py → ensemble_evaluation/__init__.py} +6 -4
  134. ml_tools/{_core → ensemble_evaluation}/_ensemble_evaluation.py +3 -7
  135. ml_tools/ensemble_evaluation/_imprimir.py +14 -0
  136. ml_tools/{ensemble_inference.py → ensemble_inference/__init__.py} +5 -3
  137. ml_tools/{_core → ensemble_inference}/_ensemble_inference.py +15 -18
  138. ml_tools/ensemble_inference/_imprimir.py +9 -0
  139. ml_tools/{ensemble_learning.py → ensemble_learning/__init__.py} +4 -6
  140. ml_tools/{_core → ensemble_learning}/_ensemble_learning.py +7 -10
  141. ml_tools/ensemble_learning/_imprimir.py +10 -0
  142. ml_tools/{excel_handler.py → excel_handler/__init__.py} +5 -3
  143. ml_tools/{_core → excel_handler}/_excel_handler.py +6 -10
  144. ml_tools/excel_handler/_imprimir.py +13 -0
  145. ml_tools/{keys.py → keys/__init__.py} +4 -1
  146. ml_tools/keys/_imprimir.py +11 -0
  147. ml_tools/{_core → keys}/_keys.py +2 -0
  148. ml_tools/{math_utilities.py → math_utilities/__init__.py} +5 -2
  149. ml_tools/math_utilities/_imprimir.py +11 -0
  150. ml_tools/{_core → math_utilities}/_math_utilities.py +1 -5
  151. ml_tools/{optimization_tools.py → optimization_tools/__init__.py} +9 -4
  152. ml_tools/optimization_tools/_imprimir.py +13 -0
  153. ml_tools/optimization_tools/_optimization_bounds.py +236 -0
  154. ml_tools/optimization_tools/_optimization_plots.py +218 -0
  155. ml_tools/{path_manager.py → path_manager/__init__.py} +6 -3
  156. ml_tools/{_core/_path_manager.py → path_manager/_dragonmanager.py} +11 -347
  157. ml_tools/path_manager/_imprimir.py +15 -0
  158. ml_tools/path_manager/_path_tools.py +346 -0
  159. ml_tools/plot_fonts/__init__.py +8 -0
  160. ml_tools/plot_fonts/_imprimir.py +8 -0
  161. ml_tools/{_core → plot_fonts}/_plot_fonts.py +2 -5
  162. ml_tools/schema/__init__.py +15 -0
  163. ml_tools/schema/_feature_schema.py +223 -0
  164. ml_tools/schema/_gui_schema.py +191 -0
  165. ml_tools/schema/_imprimir.py +10 -0
  166. ml_tools/{serde.py → serde/__init__.py} +4 -2
  167. ml_tools/serde/_imprimir.py +10 -0
  168. ml_tools/{_core → serde}/_serde.py +3 -8
  169. ml_tools/{utilities.py → utilities/__init__.py} +11 -6
  170. ml_tools/utilities/_imprimir.py +18 -0
  171. ml_tools/{_core/_utilities.py → utilities/_utility_save_load.py} +13 -190
  172. ml_tools/utilities/_utility_tools.py +192 -0
  173. dragon_ml_toolbox-19.14.0.dist-info/RECORD +0 -111
  174. ml_tools/ML_chaining_inference.py +0 -8
  175. ml_tools/ML_configuration.py +0 -86
  176. ml_tools/ML_configuration_pytab.py +0 -14
  177. ml_tools/ML_datasetmaster.py +0 -10
  178. ml_tools/ML_evaluation.py +0 -16
  179. ml_tools/ML_evaluation_multi.py +0 -12
  180. ml_tools/ML_finalize_handler.py +0 -8
  181. ml_tools/ML_inference.py +0 -12
  182. ml_tools/ML_models.py +0 -14
  183. ml_tools/ML_models_advanced.py +0 -14
  184. ml_tools/ML_models_pytab.py +0 -14
  185. ml_tools/ML_optimization.py +0 -14
  186. ml_tools/ML_optimization_pareto.py +0 -8
  187. ml_tools/ML_scaler.py +0 -8
  188. ml_tools/ML_sequence_datasetmaster.py +0 -8
  189. ml_tools/ML_sequence_evaluation.py +0 -10
  190. ml_tools/ML_sequence_inference.py +0 -8
  191. ml_tools/ML_sequence_models.py +0 -8
  192. ml_tools/ML_trainer.py +0 -12
  193. ml_tools/ML_vision_datasetmaster.py +0 -12
  194. ml_tools/ML_vision_evaluation.py +0 -10
  195. ml_tools/ML_vision_inference.py +0 -8
  196. ml_tools/ML_vision_models.py +0 -18
  197. ml_tools/SQL.py +0 -8
  198. ml_tools/_core/_ETL_cleaning.py +0 -694
  199. ml_tools/_core/_IO_tools.py +0 -498
  200. ml_tools/_core/_ML_callbacks.py +0 -702
  201. ml_tools/_core/_ML_configuration.py +0 -1332
  202. ml_tools/_core/_ML_configuration_pytab.py +0 -102
  203. ml_tools/_core/_ML_evaluation.py +0 -867
  204. ml_tools/_core/_ML_evaluation_multi.py +0 -544
  205. ml_tools/_core/_ML_inference.py +0 -646
  206. ml_tools/_core/_ML_models.py +0 -668
  207. ml_tools/_core/_ML_models_pytab.py +0 -693
  208. ml_tools/_core/_ML_trainer.py +0 -2323
  209. ml_tools/_core/_ML_utilities.py +0 -886
  210. ml_tools/_core/_ML_vision_models.py +0 -644
  211. ml_tools/_core/_data_exploration.py +0 -1909
  212. ml_tools/_core/_optimization_tools.py +0 -493
  213. ml_tools/_core/_schema.py +0 -359
  214. ml_tools/plot_fonts.py +0 -8
  215. ml_tools/schema.py +0 -12
  216. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/WHEEL +0 -0
  217. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE +0 -0
  218. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/licenses/LICENSE-THIRD-PARTY.md +0 -0
  219. {dragon_ml_toolbox-19.14.0.dist-info → dragon_ml_toolbox-20.0.0.dist-info}/top_level.txt +0 -0
ml_tools/data_exploration/_cleaning.py (new file)
@@ -0,0 +1,566 @@
+ import pandas as pd
+ from pandas.api.types import is_numeric_dtype
+ from typing import Optional, Union
+ from pathlib import Path
+
+ from ..utilities import save_dataframe_filename
+
+ from ..path_manager import make_fullpath
+ from .._core import get_logger
+
+ from ._analysis import show_null_columns
+
+
+ _LOGGER = get_logger("Data Exploration: Cleaning")
+
+
+ __all__ = [
+     "drop_constant_columns",
+     "drop_rows_with_missing_data",
+     "drop_columns_with_missing_data",
+     "drop_macro",
+     "clean_column_names",
+     "clip_outliers_single",
+     "clip_outliers_multi",
+     "drop_outlier_samples",
+     "standardize_percentages",
+ ]
+
+
+ def drop_constant_columns(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
+     """
+     Removes columns from a pandas DataFrame that contain only a single unique
+     value or are entirely null/NaN.
+
+     This utility is useful for cleaning data by removing constant features that
+     have no predictive value.
+
+     Args:
+         df (pd.DataFrame):
+             The pandas DataFrame to clean.
+         verbose (bool):
+             If True, prints the names of the columns that were dropped.
+             Defaults to True.
+
+     Returns:
+         pd.DataFrame:
+             A new DataFrame with the constant columns removed.
+     """
+     if not isinstance(df, pd.DataFrame):
+         _LOGGER.error("Input must be a pandas DataFrame.")
+         raise TypeError()
+
+     # make copy to avoid modifying original
+     df_clean = df.copy()
+
+     original_columns = set(df.columns)
+     cols_to_keep = []
+
+     for col_name in df_clean.columns:
+         column = df_clean[col_name]
+
+         # Keep a column if it has more than one unique value (nunique ignores NaNs by default)
+         if column.nunique(dropna=True) > 1:
+             cols_to_keep.append(col_name)
+
+     dropped_columns = original_columns - set(cols_to_keep)
+     if verbose:
+         if dropped_columns:
+             _LOGGER.info(f"🧹 Dropped {len(dropped_columns)} constant columns: {list(dropped_columns)}")
+         else:
+             _LOGGER.info("No constant columns found.")
+
+     # Return a new DataFrame with only the columns to keep
+     df_clean = df_clean[cols_to_keep]
+
+     if isinstance(df_clean, pd.Series):
+         df_clean = df_clean.to_frame()
+
+     return df_clean
+
+
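For quick reference, a minimal usage sketch of the function above (the sample data is hypothetical, and the import path assumes the names in __all__ are re-exported by ml_tools.data_exploration):

    import pandas as pd
    from ml_tools.data_exploration import drop_constant_columns

    df = pd.DataFrame({
        "a": [1, 2, 3],            # kept: more than one unique value
        "b": [7, 7, 7],            # dropped: constant
        "c": [None, None, None],   # dropped: entirely NaN
    })
    cleaned = drop_constant_columns(df, verbose=True)  # returns a copy; df is untouched
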
+ def drop_rows_with_missing_data(df: pd.DataFrame, targets: Optional[list[str]], threshold: float = 0.7) -> pd.DataFrame:
+     """
+     Drops rows from the DataFrame using a two-stage strategy:
+
+     1. If `targets` is provided, remove any row where all target columns are missing.
+     2. Among the remaining rows, drop any whose feature columns (non-targets) exceed the `threshold` fraction of missing values.
+
+     Parameters:
+         df (pd.DataFrame): The input DataFrame.
+         targets (list[str] | None): List of target column names.
+         threshold (float): Maximum allowed fraction of missing values in feature columns.
+
+     Returns:
+         pd.DataFrame: A cleaned DataFrame with problematic rows removed.
+     """
+     df_clean = df.copy()
+
+     # Stage 1: Drop rows with all target columns missing
+     valid_targets = []
+     if targets:
+         # validate targets
+         missing_targets = [t for t in targets if t not in df_clean.columns]
+         if missing_targets:
+             _LOGGER.error(f"Target columns not found in DataFrame: {missing_targets}")
+             raise ValueError()
+         else:
+             valid_targets = targets
+
+         # Only proceed if we actually have columns to check
+         if valid_targets:
+             target_na = df_clean[valid_targets].isnull().all(axis=1)
+             if target_na.any():
+                 _LOGGER.info(f"🧹 Dropping {target_na.sum()} rows with all target columns missing.")
+                 df_clean = df_clean[~target_na]
+             else:
+                 _LOGGER.info("No rows found where all targets are missing.")
+         else:
+             _LOGGER.error("Targets list provided but no matching columns found in DataFrame.")
+             raise ValueError()
+
+     # Stage 2: Drop rows based on feature column missing values
+     feature_cols = [col for col in df_clean.columns if col not in valid_targets]
+     if feature_cols:
+         feature_na_frac = df_clean[feature_cols].isnull().mean(axis=1)
+         rows_to_drop = feature_na_frac[feature_na_frac > threshold].index  # type: ignore
+         if len(rows_to_drop) > 0:
+             _LOGGER.info(f"🧹 Dropping {len(rows_to_drop)} rows with more than {threshold*100:.0f}% missing feature data.")
+             df_clean = df_clean.drop(index=rows_to_drop)
+         else:
+             _LOGGER.info(f"No rows exceed the {threshold*100:.0f}% missing feature data threshold.")
+     else:
+         _LOGGER.warning("No feature columns available to evaluate.")
+
+     return df_clean
+
+
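A sketch of the two-stage behavior (hypothetical data, same assumed import path): the middle row is dropped in stage 1 because its only target is missing, and no remaining row exceeds the 70% feature-missing threshold.

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_rows_with_missing_data

    df = pd.DataFrame({
        "f1": [1.0, np.nan, 3.0],
        "f2": [np.nan, np.nan, 6.0],
        "t1": [0.5, np.nan, 1.5],
    })
    cleaned = drop_rows_with_missing_data(df, targets=["t1"], threshold=0.7)
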
+ def drop_columns_with_missing_data(df: pd.DataFrame, threshold: float = 0.7, show_nulls_after: bool = True, skip_columns: Optional[list[str]]=None) -> pd.DataFrame:
+     """
+     Drops columns with more than `threshold` fraction of missing values.
+
+     Parameters:
+         df (pd.DataFrame): The input DataFrame.
+         threshold (float): Fraction of missing values above which columns are dropped.
+         show_nulls_after (bool): If True, prints the output of `show_null_columns` after dropping columns.
+         skip_columns (list[str] | None): If given, these columns won't be included in the drop process.
+
+     Returns:
+         pd.DataFrame: A new DataFrame without the dropped columns.
+     """
+     # If skip_columns is provided, create a list of columns to check.
+     # Otherwise, check all columns.
+     cols_to_check = df.columns
+     if skip_columns:
+         # Use set difference for efficient exclusion
+         cols_to_check = df.columns.difference(skip_columns)
+
+     # Calculate the missing fraction only on the columns to be checked
+     missing_fraction = df[cols_to_check].isnull().mean()
+
+     cols_to_drop = missing_fraction[missing_fraction > threshold].index  # type: ignore
+
+     if len(cols_to_drop) > 0:
+         _LOGGER.info(f"🧹 Dropping columns with more than {threshold*100:.0f}% missing data: {list(cols_to_drop)}")
+
+         result_df = df.drop(columns=cols_to_drop)
+         if show_nulls_after:
+             print(show_null_columns(df=result_df))
+
+         return result_df
+     else:
+         _LOGGER.info(f"No columns have more than {threshold*100:.0f}% missing data.")
+         return df
+
+
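Usage sketch (hypothetical data): a column that is 75% null crosses the default 0.7 threshold and is dropped, while show_nulls_after=False suppresses the follow-up null report.

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_columns_with_missing_data

    df = pd.DataFrame({
        "mostly_missing": [np.nan, np.nan, np.nan, 1.0],  # 75% null -> dropped
        "ok": [1.0, 2.0, np.nan, 4.0],                    # 25% null -> kept
    })
    cleaned = drop_columns_with_missing_data(df, threshold=0.7, show_nulls_after=False)
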
+ def drop_macro(df: pd.DataFrame,
+                log_directory: Union[str,Path],
+                targets: list[str],
+                skip_targets: bool=False,
+                threshold: float=0.7) -> pd.DataFrame:
+     """
+     Iteratively removes rows and columns with excessive missing data.
+
+     This function performs a comprehensive cleaning cycle on a DataFrame. It
+     repeatedly drops columns with constant values, followed by rows and columns that exceed
+     a specified threshold of missing values. The process continues until the
+     DataFrame's dimensions stabilize, ensuring that the interdependency between
+     row and column deletions is handled.
+
+     Initial and final missing data reports are saved to the specified log directory.
+
+     Args:
+         df (pd.DataFrame): The input pandas DataFrame to be cleaned.
+         log_directory (Union[str, Path]): Path to the directory where the missing data reports
+             and plots will be saved inside a "Missing Report" subdirectory.
+         targets (list[str]): A list of column names to be treated as target
+             variables. This list guides the row-dropping logic.
+         skip_targets (bool, optional): If True, the columns listed in `targets`
+             will be exempt from being dropped, even if they exceed the missing
+             data threshold.
+         threshold (float, optional): The proportion of missing data required to drop
+             a row or column. For example, 0.7 means a row/column will be
+             dropped if more than 70% of its data is missing.
+
+     Returns:
+         pd.DataFrame: A new, cleaned DataFrame with offending rows and columns removed.
+     """
+     # make a deep copy to work with
+     df_clean = df.copy()
+
+     base_dir_path = make_fullpath(log_directory, make=True, enforce="directory")
+     full_path = base_dir_path / "Missing Report"
+
+     # Log initial state + Plot
+     missing_data_start = show_null_columns(
+         df=df_clean,
+         plot_to_dir=full_path,
+         plot_filename="Original",
+         use_all_columns=True
+     )
+     save_dataframe_filename(df=missing_data_start.reset_index(drop=False),
+                             save_dir=full_path,
+                             filename="Missing_Data_Original")
+
+     # Clean cycles for rows and columns
+     master = True
+     while master:
+         # track rows and columns
+         initial_rows, initial_columns = df_clean.shape
+
+         # drop constant columns
+         df_clean = drop_constant_columns(df=df_clean)
+
+         # clean rows
+         df_clean = drop_rows_with_missing_data(df=df_clean, targets=targets, threshold=threshold)
+
+         # clean columns
+         if skip_targets:
+             df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False, skip_columns=targets)
+         else:
+             df_clean = drop_columns_with_missing_data(df=df_clean, threshold=threshold, show_nulls_after=False)
+
+         # stop once a full cycle removes no rows and no columns
+         remaining_rows, remaining_columns = df_clean.shape
+         if remaining_rows >= initial_rows and remaining_columns >= initial_columns:
+             master = False
+
+     # log final state + plot
+     missing_data_final = show_null_columns(
+         df=df_clean,
+         plot_to_dir=full_path,
+         plot_filename="Processed",
+         use_all_columns=True
+     )
+     save_dataframe_filename(df=missing_data_final.reset_index(drop=False),
+                             save_dir=full_path,
+                             filename="Missing_Data_Processed")
+
+     # return cleaned dataframe
+     return df_clean
+
+
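A hedged call sketch (raw_df and the target names are placeholders): the before/after missing-data reports are written under <log_directory>/Missing Report.

    from pathlib import Path
    from ml_tools.data_exploration import drop_macro

    cleaned = drop_macro(
        df=raw_df,                         # placeholder DataFrame
        log_directory=Path("logs"),
        targets=["target_1", "target_2"],  # placeholder target columns
        skip_targets=True,                 # exempt targets from column drops
        threshold=0.7,
    )
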
+ def clean_column_names(df: pd.DataFrame, replacement_char: str = '-', replacement_pattern: str = r'[\[\]{}<>,:"]', verbose: bool = True) -> pd.DataFrame:
+     """
+     Cleans DataFrame column names by replacing special characters.
+
+     This function is useful for ensuring compatibility with libraries like LightGBM,
+     which do not support special JSON characters such as `[]{}<>,:"` in feature names.
+
+     Args:
+         df (pd.DataFrame): The input DataFrame.
+         replacement_char (str): The character used to replace each matched special character.
+         replacement_pattern (str): Regex pattern matching the characters to replace.
+         verbose (bool): If True, prints the renamed columns.
+
+     Returns:
+         pd.DataFrame: A new DataFrame with cleaned column names.
+     """
+     new_df = df.copy()
+
+     original_columns = new_df.columns
+     new_columns = original_columns.str.replace(replacement_pattern, replacement_char, regex=True)
+
+     # Create a map of changes for logging
+     rename_map = {old: new for old, new in zip(original_columns, new_columns) if old != new}
+
+     if verbose:
+         if rename_map:
+             _LOGGER.info(f"Cleaned {len(rename_map)} column name(s) containing special characters:")
+             for old, new in rename_map.items():
+                 print(f" '{old}' -> '{new}'")
+         else:
+             _LOGGER.info("No column names required cleaning.")
+
+     new_df.columns = new_columns
+     return new_df
+
+
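Usage sketch (hypothetical column names): with the defaults, each character matched by the pattern is replaced by '-'.

    import pandas as pd
    from ml_tools.data_exploration import clean_column_names

    df = pd.DataFrame({"feat[0]": [1], "plain_name": [2]})
    cleaned = clean_column_names(df)  # renames 'feat[0]' -> 'feat-0-'
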
+ def clip_outliers_single(
+     df: pd.DataFrame,
+     column: str,
+     min_val: float,
+     max_val: float
+ ) -> Union[pd.DataFrame, None]:
+     """
+     Clips values in the specified numeric column to the range [min_val, max_val],
+     and returns a new DataFrame where the original column is replaced by the clipped version.
+
+     Args:
+         df (pd.DataFrame): The input DataFrame.
+         column (str): The name of the column to clip.
+         min_val (float): Minimum allowable value; values below are clipped to this.
+         max_val (float): Maximum allowable value; values above are clipped to this.
+
+     Returns:
+         pd.DataFrame: A new DataFrame with the specified column clipped in place.
+
+         None: if the column is missing or not numeric.
+     """
+     if column not in df.columns:
+         _LOGGER.warning(f"Column '{column}' not found in DataFrame.")
+         return None
+
+     if not pd.api.types.is_numeric_dtype(df[column]):
+         _LOGGER.warning(f"Column '{column}' must be numeric.")
+         return None
+
+     new_df = df.copy(deep=True)
+     new_df[column] = new_df[column].clip(lower=min_val, upper=max_val)
+
+     _LOGGER.info(f"Column '{column}' clipped to range [{min_val}, {max_val}].")
+     return new_df
+
+
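Usage sketch (hypothetical data); note that the None return is the error signal, so the result should be checked before use:

    import pandas as pd
    from ml_tools.data_exploration import clip_outliers_single

    df = pd.DataFrame({"age": [-5, 30, 250]})
    clipped = clip_outliers_single(df, column="age", min_val=0, max_val=120)
    if clipped is not None:                 # None means missing or non-numeric column
        print(clipped["age"].tolist())      # [0, 30, 120]
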
+ def clip_outliers_multi(
+     df: pd.DataFrame,
+     clip_dict: Union[dict[str, tuple[int, int]], dict[str, tuple[float, float]]],
+     verbose: bool=False
+ ) -> pd.DataFrame:
+     """
+     Clips values in multiple specified numeric columns to given [min, max] ranges,
+     updating values on a deep copy and skipping invalid entries.
+
+     Args:
+         df (pd.DataFrame): The input DataFrame.
+         clip_dict (dict): A dictionary where keys are column names and values are (min_val, max_val) tuples.
+         verbose (bool): If True, prints the clipped range for each column.
+
+     Returns:
+         pd.DataFrame: A new DataFrame with the specified columns clipped.
+
+     Notes:
+         - Invalid specifications (missing column, non-numeric type, wrong tuple length)
+           will be reported but skipped.
+     """
+     new_df = df.copy()
+     skipped_columns = []
+     clipped_columns = 0
+
+     for col, bounds in clip_dict.items():
+         try:
+             if col not in df.columns:
+                 _LOGGER.error(f"Column '{col}' not found in DataFrame.")
+                 raise ValueError()
+
+             if not pd.api.types.is_numeric_dtype(df[col]):
+                 _LOGGER.error(f"Column '{col}' is not numeric.")
+                 raise TypeError()
+
+             if not (isinstance(bounds, tuple) and len(bounds) == 2):
+                 _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
+                 raise ValueError()
+
+             min_val, max_val = bounds
+             new_df[col] = new_df[col].clip(lower=min_val, upper=max_val)
+             if verbose:
+                 print(f"Clipped '{col}' to range [{min_val}, {max_val}].")
+             clipped_columns += 1
+
+         except Exception as e:
+             skipped_columns.append((col, str(e)))
+             continue
+
+     _LOGGER.info(f"Clipped {clipped_columns} columns.")
+
+     if skipped_columns:
+         _LOGGER.warning("Skipped columns:")
+         for col, msg in skipped_columns:
+             # Only print the column name; the error was already logged
+             print(f" - {col}")
+
+     return new_df
+
+
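Usage sketch (hypothetical data): valid columns are clipped in one pass, while the non-numeric entry is reported and skipped rather than aborting the whole batch.

    import pandas as pd
    from ml_tools.data_exploration import clip_outliers_multi

    df = pd.DataFrame({"age": [-5, 300], "score": [1.5, -0.2], "name": ["a", "b"]})
    clipped = clip_outliers_multi(
        df,
        clip_dict={"age": (0, 120), "score": (0.0, 1.0), "name": (0, 1)},  # 'name' is skipped
        verbose=True,
    )
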
+ def drop_outlier_samples(
+     df: pd.DataFrame,
+     bounds_dict: dict[str, tuple[Union[int, float], Union[int, float]]],
+     drop_on_nulls: bool = False,
+     verbose: bool = True
+ ) -> pd.DataFrame:
+     """
+     Drops entire rows where values in specified numeric columns fall outside
+     a given [min, max] range.
+
+     This function processes a copy of the DataFrame, ensuring the original is
+     not modified. It skips columns with invalid specifications.
+
+     Args:
+         df (pd.DataFrame): The input DataFrame.
+         bounds_dict (dict): A dictionary where keys are column names and values
+             are (min_val, max_val) tuples defining the valid range.
+         drop_on_nulls (bool): If True, rows with NaN/None in a checked column
+             will also be dropped. If False, NaN/None are ignored.
+         verbose (bool): If True, prints the number of rows dropped for each column.
+
+     Returns:
+         pd.DataFrame: A new DataFrame with the outlier rows removed.
+
+     Notes:
+         - Invalid specifications (e.g., missing column, non-numeric type,
+           incorrectly formatted bounds) will be reported and skipped.
+     """
+     new_df = df.copy()
+     skipped_columns: list[tuple[str, str]] = []
+     initial_rows = len(new_df)
+
+     for col, bounds in bounds_dict.items():
+         try:
+             # --- Validation Checks ---
+             if col not in df.columns:
+                 _LOGGER.error(f"Column '{col}' not found in DataFrame.")
+                 raise ValueError()
+
+             if not pd.api.types.is_numeric_dtype(df[col]):
+                 _LOGGER.error(f"Column '{col}' is not of a numeric data type.")
+                 raise TypeError()
+
+             if not (isinstance(bounds, tuple) and len(bounds) == 2):
+                 _LOGGER.error(f"Bounds for '{col}' must be a tuple of (min, max).")
+                 raise ValueError()
+
+             # --- Filtering Logic ---
+             min_val, max_val = bounds
+             rows_before_drop = len(new_df)
+
+             # Create the base mask for values within the specified range
+             # .between() is inclusive and evaluates to False for NaN
+             mask_in_bounds = new_df[col].between(min_val, max_val)
+
+             if drop_on_nulls:
+                 # Keep only rows that are within bounds.
+                 # Since mask_in_bounds is False for NaN, nulls are dropped.
+                 final_mask = mask_in_bounds
+             else:
+                 # Keep rows that are within bounds OR are null.
+                 mask_is_null = new_df[col].isnull()
+                 final_mask = mask_in_bounds | mask_is_null
+
+             # Apply the final mask
+             new_df = new_df[final_mask]
+
+             rows_after_drop = len(new_df)
+
+             if verbose:
+                 dropped_count = rows_before_drop - rows_after_drop
+                 if dropped_count > 0:
+                     print(
+                         f" - Column '{col}': Dropped {dropped_count} rows with values outside range [{min_val}, {max_val}]."
+                     )
+
+         except (ValueError, TypeError) as e:
+             skipped_columns.append((col, str(e)))
+             continue
+
+     total_dropped = initial_rows - len(new_df)
+     _LOGGER.info(f"Finished processing. Total rows dropped: {total_dropped}.")
+
+     if skipped_columns:
+         _LOGGER.warning("Skipped the following columns due to errors:")
+         for col, msg in skipped_columns:
+             # Only print the column name for cleaner output as the error was already logged
+             print(f" - {col}")
+
+     # if new_df is a series, convert to dataframe
+     if isinstance(new_df, pd.Series):
+         new_df = new_df.to_frame()
+
+     return new_df
+
+
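Usage sketch (hypothetical data) showing the drop_on_nulls switch: with the default False, the NaN row survives and only the out-of-range row is removed.

    import numpy as np
    import pandas as pd
    from ml_tools.data_exploration import drop_outlier_samples

    df = pd.DataFrame({"temp": [21.0, np.nan, 999.0]})
    cleaned = drop_outlier_samples(df, bounds_dict={"temp": (-50.0, 60.0)})
    # kept: 21.0 and NaN; dropped: 999.0 (pass drop_on_nulls=True to drop the NaN row too)
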
+ def standardize_percentages(
+     df: pd.DataFrame,
+     columns: list[str],
+     treat_one_as_proportion: bool = True,
+     round_digits: int = 2,
+     verbose: bool=True
+ ) -> pd.DataFrame:
+     """
+     Standardizes numeric columns containing mixed-format percentages.
+
+     This function cleans columns where percentages might be entered both as whole
+     numbers (e.g., 55) and as proportions (e.g., 0.55). It assumes values
+     between 0 and 1 are proportions and multiplies them by 100.
+
+     Args:
+         df (pd.DataFrame): The input pandas DataFrame.
+         columns (list[str]): A list of column names to standardize.
+         treat_one_as_proportion (bool):
+             - If True (default): The value `1` is treated as a proportion and converted to `100%`.
+             - If False: The value `1` is treated as `1%`.
+         round_digits (int): The number of decimal places to round the final result to.
+         verbose (bool): If True, prints the columns that were standardized.
+
+     Returns:
+         (pd.DataFrame):
+             A new DataFrame with the specified columns cleaned and standardized.
+     """
+     df_copy = df.copy()
+
+     if df_copy.empty:
+         return df_copy
+
+     # This helper function contains the core cleaning logic
+     def _clean_value(x: float) -> float:
+         """Applies the standardization rule to a single value."""
+         if pd.isna(x):
+             return x
+
+         # If treat_one_as_proportion is True, the range for proportions is [0, 1]
+         if treat_one_as_proportion and 0 <= x <= 1:
+             return x * 100
+         # If False, the range for proportions is [0, 1) (1 is excluded)
+         elif not treat_one_as_proportion and 0 <= x < 1:
+             return x * 100
+
+         # Otherwise, the value is assumed to be a correctly formatted percentage
+         return x
+
+     fixed_columns: list[str] = list()
+
+     for col in columns:
+         # --- Robustness Checks ---
+         if col not in df_copy.columns:
+             _LOGGER.warning(f"Column '{col}' not found. Skipping.")
+             continue
+
+         if not is_numeric_dtype(df_copy[col]):
+             _LOGGER.warning(f"Column '{col}' is not numeric. Skipping.")
+             continue
+
+         # --- Applying the Logic ---
+         # Apply the cleaning function to every value in the column
+         df_copy[col] = df_copy[col].apply(_clean_value)
+
+         # Round the result
+         df_copy[col] = df_copy[col].round(round_digits)
+
+         fixed_columns.append(col)
+
+     if verbose:
+         _LOGGER.info("Columns standardized:")
+         for fixed_col in fixed_columns:
+             print(f" '{fixed_col}'")
+
+     return df_copy
+
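Usage sketch (hypothetical data): proportions are scaled to percentages, and with treat_one_as_proportion=True the value 1 becomes 100.

    import pandas as pd
    from ml_tools.data_exploration import standardize_percentages

    df = pd.DataFrame({"pct": [0.55, 55.0, 1.0]})
    fixed = standardize_percentages(df, columns=["pct"])
    print(fixed["pct"].tolist())  # [55.0, 55.0, 100.0]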