clean-data-ml 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clean_data_ml/__init__.py +7 -0
- clean_data_ml/__main__.py +18 -0
- clean_data_ml/auto_scaler.py +79 -0
- clean_data_ml/cleaner.py +1568 -0
- clean_data_ml/plotting.py +155 -0
- clean_data_ml/py.typed +0 -0
- clean_data_ml/stats.py +1085 -0
- clean_data_ml-1.2.0.dist-info/METADATA +494 -0
- clean_data_ml-1.2.0.dist-info/RECORD +13 -0
- clean_data_ml-1.2.0.dist-info/WHEEL +5 -0
- clean_data_ml-1.2.0.dist-info/entry_points.txt +2 -0
- clean_data_ml-1.2.0.dist-info/licenses/LICENSE +21 -0
- clean_data_ml-1.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Entry point for ``python -m clean_data_ml``."""
|
|
2
|
+
|
|
3
|
+
from . import __version__
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main() -> None:
    """Write the package version banner and a short usage example to stdout."""
    # One entry per output line; the empty string yields the blank separator
    # line between the description and the usage example.
    banner_lines = (
        f"clean_data_ml v{__version__}",
        "Automated data cleaning and standardization for ML pipelines.",
        "",
        "Usage:",
        " from clean_data_ml import DataCleaner",
        " dc = DataCleaner()",
        ' dc.load("data.csv").set_target("target").prepare()',
    )
    for text in banner_lines:
        print(text)


# Run only when executed as a script / via ``python -m``, not on import.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Automatic scaler selection for numeric columns.
|
|
2
|
+
|
|
3
|
+
Tests each column for normality, outliers, bounds, and sparsity
|
|
4
|
+
to pick the most appropriate sklearn scaler.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from scipy import stats as sp_stats
|
|
11
|
+
from sklearn.preprocessing import (
|
|
12
|
+
MaxAbsScaler,
|
|
13
|
+
MinMaxScaler,
|
|
14
|
+
RobustScaler,
|
|
15
|
+
StandardScaler,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
Scaler = Union[StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _looks_normal(values: pd.Series) -> bool:
    """Return True when a normality test does not reject normality (p > 0.05).

    Columns with fewer than 5000 values are tested with Shapiro-Wilk on a
    reproducible random sample of at most 500 values; larger columns are
    tested with D'Agostino's ``normaltest`` on a sample of 1000 values.
    """
    n_values = len(values)
    if n_values < 5000:
        sample = values.sample(min(n_values, 500), random_state=42)
        _, p_value = sp_stats.shapiro(sample)
    else:
        sample = values.sample(1000, random_state=42)
        _, p_value = sp_stats.normaltest(sample)
    # A NaN p-value compares False, so it is treated as "not normal".
    return bool(p_value > 0.05)


def select_best_scaler(series: pd.Series) -> Scaler:
    """Select a scaler for a numeric column based on its distribution.

    Null values are dropped first. The column is then checked for sparsity,
    outliers (1.5 * IQR fences), value bounds, and normality.

    Parameters
    ----------
    series : pd.Series
        Numeric column with potential null values.

    Returns
    -------
    StandardScaler, RobustScaler, MinMaxScaler, or MaxAbsScaler
        The selected scaler instance (unfitted).

    Selection logic
    ---------------
    Rules are applied in this order; the first match wins.

    1. Fewer than 10 non-null values           : StandardScaler
    2. Sparse (>40% zeros)                     : MaxAbsScaler
    3. Has outliers beyond the 1.5 * IQR fences : RobustScaler
    4. Non-constant, within [0, 1], not normal : MinMaxScaler
    5. Otherwise                               : StandardScaler
    """
    series = series.dropna()
    if len(series) < 10:
        # Too few observations for the distribution checks to be meaningful.
        return StandardScaler()

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    has_outliers = bool((series.min() < lower) or (series.max() > upper))

    col_min, col_max = float(series.min()), float(series.max())
    data_range = col_max - col_min
    is_bounded_01 = bool(data_range > 0 and col_min >= 0 and col_max <= 1)
    is_sparse = bool((series == 0).mean() > 0.4)

    if is_sparse:
        return MaxAbsScaler()
    if has_outliers:
        return RobustScaler()
    # Normality only changes the outcome for outlier-free columns bounded in
    # [0, 1]; every other remaining case resolves to StandardScaler. The
    # comparatively expensive statistical test is therefore run lazily.
    if is_bounded_01 and not _looks_normal(series):
        return MinMaxScaler()
    return StandardScaler()
|