clean-data-ml 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ """clean_data_ml: automated data cleaning and standardization for ML pipelines."""
2
+
3
+ from .cleaner import CleanPipeline, DataCleaner
4
+ from . import stats
5
+
6
+ __all__ = ["DataCleaner", "CleanPipeline", "stats"]  # public names exported by ``from clean_data_ml import *``
7
+ __version__ = "1.2.0"  # package version string; the ``python -m clean_data_ml`` entry point prints it
@@ -0,0 +1,18 @@
1
+ """Entry point for ``python -m clean_data_ml``."""
2
+
3
+ from . import __version__
4
+
5
+
6
def main() -> None:
    """Write the package banner and a short usage example to stdout.

    The output consists of the version line, a one-line description,
    an empty separator line, and a three-line usage snippet.
    """
    banner = (
        f"clean_data_ml v{__version__}",
        "Automated data cleaning and standardization for ML pipelines.",
        "",
        "Usage:",
        " from clean_data_ml import DataCleaner",
        " dc = DataCleaner()",
        ' dc.load("data.csv").set_target("target").prepare()',
    )
    for text in banner:
        print(text)


# Only show the banner when this module is executed, not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,79 @@
1
+ """Automatic scaler selection for numeric columns.
2
+
3
+ Tests each column for normality, outliers, bounds, and sparsity
4
+ to pick the most appropriate sklearn scaler.
5
+ """
6
+
7
+ from typing import Union
8
+
9
+ import pandas as pd
10
+ from scipy import stats as sp_stats
11
+ from sklearn.preprocessing import (
12
+ MaxAbsScaler,
13
+ MinMaxScaler,
14
+ RobustScaler,
15
+ StandardScaler,
16
+ )
17
+
18
# Union of the scaler types that ``select_best_scaler`` can return.
Scaler = Union[StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler]


def _passes_normality_test(values: pd.Series) -> bool:
    """Return True when a normality test does not reject normality (p > 0.05)."""
    if len(values) < 5000:
        # Shapiro-Wilk on a reproducible sample of at most 500 points.
        sample = values.sample(min(len(values), 500), random_state=42)
        _, p_value = sp_stats.shapiro(sample)
    else:
        # D'Agostino-Pearson on a reproducible sample of 1000 points.
        sample = values.sample(1000, random_state=42)
        _, p_value = sp_stats.normaltest(sample)
    return bool(p_value > 0.05)


def select_best_scaler(series: pd.Series) -> Scaler:
    """Select a scaler for a numeric column based on its distribution.

    Null values are dropped first; the remaining values are checked for
    sparsity, outliers (1.5 * IQR rule), value bounds, and normality
    (Shapiro-Wilk below 5000 values, D'Agostino-Pearson otherwise).

    Parameters
    ----------
    series : pd.Series
        Numeric column with potential null values.

    Returns
    -------
    StandardScaler, RobustScaler, MinMaxScaler, or MaxAbsScaler
        The selected scaler instance (unfitted).

    Selection logic
    ---------------
    Rules are evaluated in this order; the first match wins:

    1. Fewer than 10 non-null values            : StandardScaler
    2. Sparse (>40% zeros)                      : MaxAbsScaler
    3. Has outliers                             : RobustScaler
    4. Not bounded in [0, 1] (or constant)      : StandardScaler
    5. Bounded in [0, 1] and passes normality   : StandardScaler
    6. Bounded in [0, 1] and fails normality    : MinMaxScaler

    Notes
    -----
    The normality test can only change the outcome for non-sparse,
    outlier-free columns bounded in [0, 1], so it is run only in that
    case. This yields the same scaler as testing every column while
    skipping the statistical test (and the warnings it emits on constant
    data) whenever its result would be ignored.
    """
    series = series.dropna()
    if len(series) < 10:
        # Too few observations for the distribution checks to be meaningful.
        return StandardScaler()

    # Sparsity takes precedence over every other property.
    if (series == 0).mean() > 0.4:
        return MaxAbsScaler()

    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    smallest, largest = series.min(), series.max()
    if smallest < q1 - whisker or largest > q3 + whisker:
        return RobustScaler()

    col_min, col_max = float(smallest), float(largest)
    # A zero range (constant column) does not count as bounded.
    is_bounded_01 = col_max - col_min > 0 and col_min >= 0 and col_max <= 1
    if not is_bounded_01:
        # Without outliers, unbounded data gets StandardScaler whether or
        # not it looks normal, so the normality test is unnecessary here.
        return StandardScaler()

    if _passes_normality_test(series):
        return StandardScaler()
    return MinMaxScaler()