datacleaner-vb 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: datacleaner-vb
3
+ Version: 0.1.0
4
+ Summary: Custom data preprocessing library
5
+ Author: Bharathan
6
+ Requires-Dist: pandas
7
+ Requires-Dist: numpy
8
+ Requires-Dist: scikit-learn
9
+ Dynamic: author
10
+ Dynamic: requires-dist
11
+ Dynamic: summary
@@ -0,0 +1,11 @@
1
+ Metadata-Version: 2.4
2
+ Name: datacleaner-vb
3
+ Version: 0.1.0
4
+ Summary: Custom data preprocessing library
5
+ Author: Bharathan
6
+ Requires-Dist: pandas
7
+ Requires-Dist: numpy
8
+ Requires-Dist: scikit-learn
9
+ Dynamic: author
10
+ Dynamic: requires-dist
11
+ Dynamic: summary
@@ -0,0 +1,15 @@
1
+ pyproject.toml
2
+ setup.py
3
+ datacleaner_vb.egg-info/PKG-INFO
4
+ datacleaner_vb.egg-info/SOURCES.txt
5
+ datacleaner_vb.egg-info/dependency_links.txt
6
+ datacleaner_vb.egg-info/requires.txt
7
+ datacleaner_vb.egg-info/top_level.txt
8
+ preprocessing/API_Token.py
9
+ preprocessing/__init__.py
10
+ preprocessing/config.py
11
+ preprocessing/datatype.py
12
+ preprocessing/encoding.py
13
+ preprocessing/null_handling.py
14
+ preprocessing/outliers_handling.py
15
+ preprocessing/validation.py
@@ -0,0 +1,3 @@
1
+ pandas
2
+ numpy
3
+ scikit-learn
@@ -0,0 +1 @@
1
+ preprocessing
@@ -0,0 +1,2 @@
1
def passCode():
    """Print the PyPI upload token taken from the environment.

    SECURITY FIX: the original version hard-coded a live ``pypi-...``
    API token directly in this source file (and shipped it in the sdist).
    That token must be treated as compromised and revoked.  Supply a
    fresh token via the ``PYPI_TOKEN`` environment variable instead;
    prints an empty line when the variable is unset.
    """
    import os  # local import keeps the module dependency-free at import time
    print(os.environ.get("PYPI_TOKEN", ""))
File without changes
@@ -0,0 +1,63 @@
1
def structure():
    """Print the recommended pipeline order and an example config dict.

    FIX: the config template below is now valid Python.  The original
    version had mismatched braces in the "scaling" section (a stray
    ``{ "Quantity": "standard" }`` block) and an extra trailing ``}``,
    so copy-pasting the printed template raised a SyntaxError.
    """
    config = """config = {

    # 1. Validation
    "fixes": {
        "Unit_Price": {
            "method": "clip",
            "min": 0,
            "max": 10000
        },
        "Payment_Mode": {
            "method": "replace",
            "values": {"Crypto": "Cash"}
        }
    },

    # 2. Missing values
    "missing": {
        "Unit_Price": "median",
        "Quantity": "mean",
        "Payment_Mode": "mode"
    },

    # 3. Outliers (Z-score)
    "zscore": {
        "Unit_Price": {"threshold": 2.5, "action": "cap"},
        "Quantity": {"threshold": 2, "action": "remove"}
    },

    # 4. Isolation Forest
    "isolation_forest": {
        "columns": ["Unit_Price", "Quantity"],
        "contamination": 0.1,
        "action": "remove"  # remove / nan / flag
    },

    # 5. Scaling (StandardScaler)
    "scaling": {
        "Unit_Price": "standard",
        "Quantity": "standard"
    },

    # 6. Encoding
    "encoding": {
        "Payment_Mode": "onehot",
        "Customer_Feedback": "label"
    }
}
"""

    print("Validation → Fixes → Missing → Outliers → Scaling → Encoding")
    print(config)
56
+
57
def functionName():
    """Print the import statements needed to use every preprocessing module."""
    usage = """
    from preprocessing.outliers_handling import zscore, iqr, isolation_forest_outliers, standard_scaler
    from preprocessing.null_handling import replace_nulls
    from preprocessing.validation import validate
    from preprocessing.encoding import encode
    from preprocessing.datatype import dtypeconversion """
    print(usage)
@@ -0,0 +1,35 @@
1
+ import pandas as pd
2
+
3
def dtypeconversion(df):
    """Infer and apply better dtypes for each column of *df*.

    Heuristics, applied in order per column: skip all-null and id-like
    columns; convert to numeric when >90% of non-null values parse as
    numbers; otherwise to datetime when >90% parse as dates; otherwise
    fall back to ``category`` for low-cardinality (<50 unique) columns.
    Returns a converted copy; the input is not mutated.
    """
    out = df.copy()

    for column in out.columns:
        values = out[column].dropna()

        # Nothing can be inferred from an all-null column.
        if values.empty:
            continue

        # NOTE(review): substring match, so names like "paid" or
        # "holiday" are also skipped — presumably intentional; confirm
        # before tightening.
        if "id" in column.lower():
            continue

        # Mostly parseable as numbers -> numeric dtype.
        as_num = pd.to_numeric(values, errors="coerce")
        if as_num.notna().mean() > 0.9:
            out[column] = pd.to_numeric(out[column], errors="coerce")
            continue

        # Mostly parseable as dates -> datetime.  ``format="mixed"``
        # avoids the per-element inference warning (pandas >= 2.0).
        as_dt = pd.to_datetime(values, errors="coerce", format="mixed")
        if as_dt.notna().mean() > 0.9:
            out[column] = pd.to_datetime(out[column], errors="coerce", format="mixed")
            continue

        # Low-cardinality leftovers become categoricals.
        if out[column].nunique() < 50:
            out[column] = out[column].astype("category")

    return out
@@ -0,0 +1,25 @@
1
+ import pandas as pd
2
+
3
def encode(df, config):
    """Encode the categorical columns listed in ``config["encoding"]``.

    Supported methods: "label" replaces the column with integer category
    codes; "onehot" replaces it with one dummy column per category
    (prefixed with the column name).  Columns absent from *df* are
    skipped; any other method raises ValueError.  Returns an encoded
    copy; the input is not mutated.
    """
    df = df.copy()

    if "encoding" not in config:
        return df

    for col, how in config["encoding"].items():
        if col not in df.columns:
            continue

        if how == "label":
            df[col] = df[col].astype("category").cat.codes
        elif how == "onehot":
            onehot = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, onehot], axis=1).drop(columns=[col])
        else:
            raise ValueError(f"Invalid encoding method: {how}")

    return df
@@ -0,0 +1,29 @@
1
+ import numpy as np
2
def replace_nulls(df, config):
    """Fill null values according to a column -> strategy map.

    The plan may live under ``config["iqr"]`` (the key emitted by
    ``iqr()``) or ``config["missing"]`` (the key shown in the documented
    config template); both are honoured — FIX: the original silently
    ignored "missing".  Strategies are "mean", "median" or "mode"
    (case-insensitive); anything else raises ValueError.  Literal "",
    "NA" and "null" entries are first normalised to NaN.  Prints the
    null count before and after each treated column.  Returns a filled
    copy; the input is not mutated.
    """
    df = df.copy()

    # Normalise fake nulls to real NaN so fillna can see them.
    df = df.replace(["", "NA", "null"], np.nan)

    plan = {}
    plan.update(config.get("iqr", {}))
    plan.update(config.get("missing", {}))

    for col, method in plan.items():
        print(f"{col} nulls before:", df[col].isnull().sum())

        method = method.lower()

        if method == "mean":
            df[col] = df[col].fillna(df[col].mean())
        elif method == "median":
            df[col] = df[col].fillna(df[col].median())
        elif method == "mode":
            mode = df[col].mode()
            # FIX: mode() is empty on an all-null column; indexing [0]
            # used to raise IndexError.  Leave such a column as-is.
            if not mode.empty:
                df[col] = df[col].fillna(mode[0])
        else:
            raise ValueError("Invalid method")

        print(f"{col} nulls after:", df[col].isnull().sum())

    return df
@@ -0,0 +1,163 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ from null_handling import replace_nulls
4
+
5
# Detect outliers with the IQR (inter-quartile range) rule and derive a
# per-column imputation plan for replace_nulls().
# Columns whose (lowercased) name contains any of these keywords are
# treated as identifiers/timestamps and excluded from the plan.
names = ["id", "datetime", "date", "timestamp"]

def iqr(df):
    """Build a null-replacement plan keyed for ``replace_nulls``.

    Numeric columns containing values outside [Q1 - 1.5*IQR,
    Q3 + 1.5*IQR] get "median" (robust to outliers); clean numeric
    columns get "mean"; non-numeric columns get "mode".  Id/date-like
    columns (see ``names``) are skipped entirely.

    Returns {"iqr": {column: "mean" | "median" | "mode"}}.
    """
    df = df.copy()
    plan = {}

    # Keep only columns whose names do not look like ids/timestamps.
    candidates = [
        col for col in df.columns
        if not any(key in col.lower() for key in names)
    ]

    for col in candidates:
        # FIX: the original tested ``dtype in ["float64", "int64"]``,
        # silently treating int32/float32 (and every other numeric
        # dtype) as categorical.
        if pd.api.types.is_numeric_dtype(df[col]):
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            spread = q3 - q1  # FIX: no longer shadows the function name

            lower = q1 - 1.5 * spread
            upper = q3 + 1.5 * spread

            has_outliers = ((df[col] < lower) | (df[col] > upper)).any()
            plan[col] = "median" if has_outliers else "mean"
        else:
            plan[col] = "mode"

    return {"iqr": plan}
44
+
45
+
46
+ #Z Scorce Detection
47
+ def zscore(df, config):
48
+ df = df.copy()
49
+
50
+ if "zscore" not in config:
51
+ return df
52
+
53
+ mask = pd.Series(True, index=df.index)
54
+
55
+ for col, rules in config["zscore"].items():
56
+
57
+ if col not in df.columns:
58
+ continue
59
+
60
+ threshold = rules.get("threshold", 3)
61
+
62
+ mean = df[col].mean()
63
+ std = df[col].std()
64
+
65
+ if std == 0:
66
+ continue
67
+
68
+ z = (df[col] - mean) / std
69
+ outliers = np.abs(z) > threshold
70
+
71
+ action = rules.get("action", "cap")
72
+
73
+ if action == "remove":
74
+ mask &= ~outliers
75
+
76
+ elif action == "cap":
77
+ upper = mean + threshold * std
78
+ lower = mean - threshold * std
79
+ df[col] = np.where(df[col] > upper, upper, df[col])
80
+ df[col] = np.where(df[col] < lower, lower, df[col])
81
+
82
+ elif action == "nan":
83
+ df.loc[outliers, col] = np.nan
84
+
85
+ else:
86
+ raise ValueError(f"Invalid action: {action}")
87
+
88
+
89
+ df = df[mask]
90
+
91
+ return df
92
+
93
# Standard scaling (z-normalisation), e.g. before training ML models.
def standard_scaler(df, config):
    """Standard-scale the columns listed in ``config["scaling"]``.

    Only the "standard" method (case-insensitive) is supported:
    x -> (x - mean) / std.  Missing columns are reported and skipped;
    constant columns (std == 0) are skipped to avoid division by zero.
    Values are coerced to numeric first, so unparseable entries become
    NaN.  Returns a scaled copy of *df*.
    """
    df = df.copy()

    if "scaling" not in config:
        return df

    for col, how in config["scaling"].items():
        if col not in df.columns:
            print(f"{col} not found")
            continue

        # Coerce to numeric before computing statistics.
        df[col] = pd.to_numeric(df[col], errors="coerce")

        method = how.lower()
        if method != "standard":
            raise ValueError(f"Invalid scaling method: {method}")

        mu = df[col].mean()
        sigma = df[col].std()

        if sigma == 0:
            print(f"{col} std is 0, skipped")
            continue

        df[col] = df[col].sub(mu).div(sigma)

    return df
126
+
127
from sklearn.ensemble import IsolationForest

def isolation_forest_outliers(df, config):
    """Detect multivariate outliers with an Isolation Forest.

    ``config["isolation_forest"]`` keys:
      columns       - feature columns to fit on (missing ones ignored)
      contamination - expected outlier share (default 0.1)
      action        - "remove" drops outlier rows, "nan" blanks the
                      feature cells, "flag" adds a boolean "is_outlier"
                      column; anything else raises ValueError.

    Returns a treated copy of *df*; the input is not mutated.
    """
    df = df.copy()

    if "isolation_forest" not in config:
        return df

    rules = config["isolation_forest"]
    cols = rules.get("columns", [])
    contamination = rules.get("contamination", 0.1)
    action = rules.get("action", "remove")

    # FIX: requesting columns absent from df used to raise KeyError.
    cols = [c for c in cols if c in df.columns]
    if not cols:
        return df

    # Ensure a numeric feature matrix.
    X = df[cols].apply(pd.to_numeric, errors="coerce")

    # FIX: IsolationForest rejects NaN input; impute with the column
    # median for fitting only — df itself is left untouched.
    X = X.fillna(X.median(numeric_only=True))

    model = IsolationForest(contamination=contamination, random_state=42)
    outliers = model.fit_predict(X) == -1  # -1 marks an outlier

    if action == "remove":
        df = df[~outliers]
    elif action == "nan":
        df.loc[outliers, cols] = None
    elif action == "flag":
        df["is_outlier"] = outliers
    else:
        raise ValueError("Invalid action")

    return df
163
+
@@ -0,0 +1,29 @@
1
def validate(df, config):
    """Apply the per-column fix rules in ``config["fixes"]``.

    Supported methods: "clip" bounds values to [min, max] (either bound
    may be omitted) and "replace" maps values via the rule's "values"
    dict.  Columns absent from *df* are skipped; any other method raises
    ValueError.  Returns a fixed copy; the input is not mutated.
    """
    df = df.copy()

    if "fixes" not in config:
        return df

    for col, rule in config["fixes"].items():
        if col not in df.columns:
            continue

        how = rule.get("method")

        if how == "clip":
            df[col] = df[col].clip(
                lower=rule.get("min", None),
                upper=rule.get("max", None),
            )
        elif how == "replace":
            df[col] = df[col].replace(rule.get("values", {}))
        else:
            raise ValueError(f"Invalid method: {how}")

    return df
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools", "wheel"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,14 @@
1
from setuptools import setup, find_packages

# Build script for the datacleaner-vb distribution.  This mirrors the
# metadata that ends up in PKG-INFO; pyproject.toml only declares the
# setuptools build backend.
setup(
    name="datacleaner-vb",
    version="0.1.0",
    # Auto-discovers the "preprocessing" package (see top_level.txt).
    packages=find_packages(),
    # Runtime dependencies, also listed in requires.txt.
    install_requires=[
        "pandas",
        "numpy",
        "scikit-learn"
    ],
    author="Bharathan",
    description="Custom data preprocessing library",
)