datacleaner-vb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacleaner_vb-0.1.0/PKG-INFO +11 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/PKG-INFO +11 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/SOURCES.txt +15 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/dependency_links.txt +1 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/requires.txt +3 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/top_level.txt +1 -0
- datacleaner_vb-0.1.0/preprocessing/API_Token.py +2 -0
- datacleaner_vb-0.1.0/preprocessing/__init__.py +0 -0
- datacleaner_vb-0.1.0/preprocessing/config.py +63 -0
- datacleaner_vb-0.1.0/preprocessing/datatype.py +35 -0
- datacleaner_vb-0.1.0/preprocessing/encoding.py +25 -0
- datacleaner_vb-0.1.0/preprocessing/null_handling.py +29 -0
- datacleaner_vb-0.1.0/preprocessing/outliers_handling.py +163 -0
- datacleaner_vb-0.1.0/preprocessing/validation.py +29 -0
- datacleaner_vb-0.1.0/pyproject.toml +3 -0
- datacleaner_vb-0.1.0/setup.cfg +4 -0
- datacleaner_vb-0.1.0/setup.py +14 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacleaner-vb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Custom data preprocessing library
|
|
5
|
+
Author: Bharathan
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: scikit-learn
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: summary
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacleaner-vb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Custom data preprocessing library
|
|
5
|
+
Author: Bharathan
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: scikit-learn
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: summary
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
setup.py
|
|
3
|
+
datacleaner_vb.egg-info/PKG-INFO
|
|
4
|
+
datacleaner_vb.egg-info/SOURCES.txt
|
|
5
|
+
datacleaner_vb.egg-info/dependency_links.txt
|
|
6
|
+
datacleaner_vb.egg-info/requires.txt
|
|
7
|
+
datacleaner_vb.egg-info/top_level.txt
|
|
8
|
+
preprocessing/API_Token.py
|
|
9
|
+
preprocessing/__init__.py
|
|
10
|
+
preprocessing/config.py
|
|
11
|
+
preprocessing/datatype.py
|
|
12
|
+
preprocessing/encoding.py
|
|
13
|
+
preprocessing/null_handling.py
|
|
14
|
+
preprocessing/outliers_handling.py
|
|
15
|
+
preprocessing/validation.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
preprocessing
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
def structure():
    """Print the expected pipeline order and a ready-to-copy config template.

    The printed template is valid Python: pasting and exec'ing it yields the
    `config` dict consumed by validate/replace_nulls/zscore/
    isolation_forest_outliers/standard_scaler/encode.
    (The previous template had unbalanced braces in the "scaling" section and
    an extra trailing "}", so it could not be parsed — fixed here.)
    """
    config = '''config = {

    #1. Validation
    "fixes": {
        "Unit_Price": {
            "method": "clip",
            "min": 0,
            "max": 10000
        },
        "Payment_Mode": {
            "method": "replace",
            "values": {"Crypto": "Cash"}
        }
    },

    #2. MISSING
    "missing": {
        "Unit_Price": "median",
        "Quantity": "mean",
        "Payment_Mode": "mode"
    },

    # 3. OUTLIERS (Z-score)
    "zscore": {
        "Unit_Price": {"threshold": 2.5, "action": "cap"},
        "Quantity": {"threshold": 2, "action": "remove"}
    },

    # 4 Isolation Forest
    "isolation_forest": {
        "columns": ["Unit_Price", "Quantity"],
        "contamination": 0.1,
        "action": "remove"  # remove / nan / flag
    },

    # 5. SCALING (StandardScaler)
    "scaling": {
        "Unit_Price": "standard",
        "Quantity": "standard"
    },

    # 6. ENCODING
    "encoding": {
        "Payment_Mode": "onehot",
        "Customer_Feedback": "label"
    }
}'''

    print("Validation → Fixes → Missing → Outliers → Scaling → Encoding")
    print(config)
|
57
|
+
def functionName():
    """Print the import statements needed to use the preprocessing package."""
    usage_lines = (
        "",
        "from preprocessing.outliers_handling import zscore, iqr, isolation_forest_outliers, standard_scaler",
        "from preprocessing.null_handling import replace_nulls",
        "from preprocessing.validation import validate",
        "from preprocessing.encoding import encode",
        "from preprocessing.datatype import dtypeconversion ",
    )
    print("\n".join(usage_lines))
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def dtypeconversion(df):
    """Infer better dtypes for each column of *df* and return a converted copy.

    Heuristics, in priority order per column:
    - all-null or id-like (name contains "id") columns are left untouched;
    - >90% of non-null values parse as numbers  -> numeric dtype;
    - >90% of non-null values parse as datetimes -> datetime64;
    - fewer than 50 distinct values             -> pandas "category".
    Values that fail the winning parse are coerced to NaN/NaT.
    """
    out = df.copy()

    for name in out.columns:
        values = out[name].dropna()

        # nothing to infer from an all-null column
        if values.empty:
            continue

        # leave identifier-style columns untouched
        if "id" in name.lower():
            continue

        # mostly-numeric -> numeric dtype (non-parsable entries become NaN)
        if pd.to_numeric(values, errors="coerce").notna().mean() > 0.9:
            out[name] = pd.to_numeric(out[name], errors="coerce")
        # mostly-datetime -> datetime64 (format="mixed" avoids the
        # per-element inference warning on pandas >= 2.0)
        elif pd.to_datetime(values, errors="coerce", format="mixed").notna().mean() > 0.9:
            out[name] = pd.to_datetime(out[name], errors="coerce", format="mixed")
        # low-cardinality -> category
        elif out[name].nunique() < 50:
            out[name] = out[name].astype("category")

    return out
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def encode(df, config):
    """Encode categorical columns according to config["encoding"].

    The config maps column name -> "label" (integer category codes) or
    "onehot" (dummy columns prefixed with the original name; the source
    column is dropped). Columns absent from *df* are silently skipped.
    Returns a new DataFrame; raises ValueError for an unknown method.
    """
    result = df.copy()

    if "encoding" not in config:
        return result

    for column, scheme in config["encoding"].items():
        if column not in result.columns:
            continue

        if scheme == "label":
            # deterministic integer codes from the (sorted) category order
            result[column] = result[column].astype("category").cat.codes
        elif scheme == "onehot":
            onehot = pd.get_dummies(result[column], prefix=column)
            result = pd.concat([result, onehot], axis=1).drop(columns=[column])
        else:
            raise ValueError(f"Invalid encoding method: {scheme}")

    return result
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
def replace_nulls(df, config):
    """Impute null values column-by-column and return a new DataFrame.

    Sentinel strings ("", "NA", "null") are first normalized to real NaN.
    Imputation rules map column name -> "mean", "median" or "mode" and are
    read from config["iqr"] (the plan produced by outliers_handling.iqr)
    and/or config["missing"] (the hand-written key used by the documented
    config template); when both are present, "missing" entries win.

    Columns absent from *df* are skipped (previously a KeyError), and an
    all-null column under "mode" is left as-is (previously an IndexError).
    Raises ValueError for an unknown method.
    """
    df = df.copy()

    # normalize fake nulls to real NaN
    df = df.replace(["", "NA", "null"], np.nan)

    # accept the auto-detected plan ("iqr") and/or the manual one ("missing")
    rules = {}
    for key in ("iqr", "missing"):
        rules.update(config.get(key, {}))

    for col, method in rules.items():
        if col not in df.columns:
            continue

        print(f"{col} nulls before:", df[col].isnull().sum())

        method = method.lower()

        if method == "mean":
            fill = df[col].mean()
        elif method == "median":
            fill = df[col].median()
        elif method == "mode":
            modes = df[col].mode()
            # an all-null column has no mode; leave its nulls in place
            fill = modes.iloc[0] if not modes.empty else np.nan
        else:
            raise ValueError("Invalid method")

        df[col] = df[col].fillna(fill)

        print(f"{col} nulls after:", df[col].isnull().sum())

    return df
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from null_handling import replace_nulls
|
|
4
|
+
|
|
5
|
+
#Detect Outliers Method IQR(inter-quantile Range)
|
|
6
|
+
# Column-name tokens that mark id/date-like columns to exclude from imputation
names = ["id", "datetime", "date", "timestamp"]

def iqr(df):
    """Build an imputation plan using the IQR (inter-quartile range) test.

    Columns whose (lower-cased) name contains a token from `names` are
    skipped. Numeric columns are assigned "median" when 1.5*IQR outliers
    exist (the median is robust to them) and "mean" otherwise; all other
    columns get "mode". The dtype test now uses pandas' numeric-dtype
    predicate so int32/float32/nullable numerics are covered too (the old
    `in ["float64", "int64"]` check missed them); bool columns are still
    treated as categorical, as before.

    Returns {"iqr": {column: method}} — the format consumed by
    null_handling.replace_nulls.
    """
    df = df.copy()
    plan = {}

    # columns eligible for imputation (skip id/date-like names)
    candidates = [c for c in df.columns
                  if not any(token in c.lower() for token in names)]

    for col in candidates:
        series = df[col]

        if (pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)):
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            spread = q3 - q1  # renamed: the old local shadowed this function

            lower = q1 - 1.5 * spread
            upper = q3 + 1.5 * spread

            has_outliers = ((series < lower) | (series > upper)).any()
            plan[col] = "median" if has_outliers else "mean"
        else:
            # categorical / text columns
            plan[col] = "mode"

    return {"iqr": plan}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
#Z Scorce Detection
|
|
47
|
+
def zscore(df, config):
    """Handle per-column outliers using the z-score test.

    config["zscore"] maps column name -> {"threshold": z (default 3),
    "action": "cap" | "remove" | "nan" (default "cap")}. "cap" clamps
    values to mean ± threshold*std, "nan" blanks them, "remove" drops the
    offending rows after all columns are processed. Columns absent from
    *df* or with zero std are skipped. Returns a new DataFrame; raises
    ValueError for an unknown action.
    """
    df = df.copy()

    if "zscore" not in config:
        return df

    # rows to keep after all "remove" actions are accumulated
    keep = pd.Series(True, index=df.index)

    for col, rules in config["zscore"].items():
        if col not in df.columns:
            continue

        limit = rules.get("threshold", 3)
        mu = df[col].mean()
        sigma = df[col].std()

        # constant column: no spread, nothing to flag
        if sigma == 0:
            continue

        is_outlier = np.abs((df[col] - mu) / sigma) > limit
        action = rules.get("action", "cap")

        if action == "remove":
            keep &= ~is_outlier
        elif action == "cap":
            hi = mu + limit * sigma
            lo = mu - limit * sigma
            df[col] = np.where(df[col] > hi, hi, df[col])
            df[col] = np.where(df[col] < lo, lo, df[col])
        elif action == "nan":
            df.loc[is_outlier, col] = np.nan
        else:
            raise ValueError(f"Invalid action: {action}")

    return df[keep]
|
|
92
|
+
|
|
93
|
+
#Standard Scaler Purpose for Train ML models
|
|
94
|
+
def standard_scaler(df, config):
    """Standardize columns listed in config["scaling"] to zero mean, unit std.

    Each entry maps column name -> "standard" (the only supported method,
    case-insensitive). Values are coerced to numeric first (non-parsable
    entries become NaN). Missing columns and zero-std columns are skipped
    with a printed notice. Returns a new DataFrame; raises ValueError for
    an unknown method.
    """
    df = df.copy()

    if "scaling" not in config:
        return df

    for col, method in config["scaling"].items():
        if col not in df.columns:
            print(f"{col} not found")
            continue

        # coerce to numeric so mean/std are well defined
        df[col] = pd.to_numeric(df[col], errors="coerce")

        method = method.lower()
        if method != "standard":
            raise ValueError(f"Invalid scaling method: {method}")

        center = df[col].mean()
        spread = df[col].std()

        # a constant column cannot be standardized
        if spread == 0:
            print(f"{col} std is 0, skipped")
            continue

        df[col] = df[col].sub(center).div(spread)

    return df
|
|
126
|
+
|
|
127
|
+
from sklearn.ensemble import IsolationForest
|
|
128
|
+
|
|
129
|
+
def isolation_forest_outliers(df, config):
    """Detect multivariate outliers with sklearn's IsolationForest.

    config["isolation_forest"] provides:
      - "columns": feature columns to fit on (default []);
      - "contamination": expected outlier fraction (default 0.1);
      - "action": "remove" rows, set them to "nan" in the feature columns,
        or "flag" them in a new boolean "is_outlier" column (default "remove").

    Robustness fixes: configured columns missing from *df* are ignored
    (previously a KeyError), an empty feature set returns *df* unchanged
    (previously a sklearn error), and NaNs are median-imputed in the fit
    matrix only — *df* itself is never modified by the imputation
    (previously IsolationForest raised on NaN input).

    random_state is pinned so results are reproducible.
    Returns a new DataFrame; raises ValueError for an unknown action.
    """
    df = df.copy()

    if "isolation_forest" not in config:
        return df

    rules = config["isolation_forest"]

    cols = [c for c in rules.get("columns", []) if c in df.columns]
    contamination = rules.get("contamination", 0.1)
    action = rules.get("action", "remove")

    # nothing to fit on
    if not cols:
        return df

    # ensure numeric; impute medians for the fit matrix only
    X = df[cols].apply(pd.to_numeric, errors="coerce")
    X = X.fillna(X.median())

    model = IsolationForest(contamination=contamination, random_state=42)
    # fit_predict returns -1 for outliers, 1 for inliers
    outliers = model.fit_predict(X) == -1

    if action == "remove":
        df = df[~outliers]

    elif action == "nan":
        df.loc[outliers, cols] = None

    elif action == "flag":
        df["is_outlier"] = outliers

    else:
        raise ValueError("Invalid action")

    return df
|
|
163
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
def validate(df, config):
    """Apply per-column validation fixes described in config["fixes"].

    Each entry maps a column name to a rule dict:
      - {"method": "clip", "min": ..., "max": ...} bounds values into
        [min, max]; an omitted bound is left open;
      - {"method": "replace", "values": {...}} substitutes values via the
        given mapping.
    Columns absent from *df* are skipped. Returns a new DataFrame; raises
    ValueError for an unknown method.
    """
    df = df.copy()

    if "fixes" not in config:
        return df

    for col, rules in config["fixes"].items():
        if col not in df.columns:
            continue

        method = rules.get("method")

        if method == "clip":
            df[col] = df[col].clip(lower=rules.get("min", None),
                                   upper=rules.get("max", None))
        elif method == "replace":
            df[col] = df[col].replace(rules.get("values", {}))
        else:
            raise ValueError(f"Invalid method: {method}")

    return df
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Packaging metadata for datacleaner-vb: a small data-preprocessing toolkit
# (validation, null handling, outlier handling, scaling, encoding) exposed
# as the `preprocessing` package.
setup(
    name="datacleaner-vb",
    version="0.1.0",
    packages=find_packages(),  # auto-discovers the `preprocessing` package
    # runtime dependencies used across the preprocessing modules
    install_requires=[
        "pandas",
        "numpy",
        "scikit-learn",
    ],
    author="Bharathan",
    description="Custom data preprocessing library",
)
|