datacleaner-vb 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacleaner_vb-0.1.0/PKG-INFO +11 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/PKG-INFO +11 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/SOURCES.txt +15 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/dependency_links.txt +1 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/requires.txt +3 -0
- datacleaner_vb-0.1.0/datacleaner_vb.egg-info/top_level.txt +1 -0
- datacleaner_vb-0.1.0/preprocessing/API_Token.py +2 -0
- datacleaner_vb-0.1.0/preprocessing/__init__.py +0 -0
- datacleaner_vb-0.1.0/preprocessing/config.py +63 -0
- datacleaner_vb-0.1.0/preprocessing/datatype.py +35 -0
- datacleaner_vb-0.1.0/preprocessing/encoding.py +25 -0
- datacleaner_vb-0.1.0/preprocessing/null_handling.py +29 -0
- datacleaner_vb-0.1.0/preprocessing/outliers_handling.py +163 -0
- datacleaner_vb-0.1.0/preprocessing/validation.py +29 -0
- datacleaner_vb-0.1.0/pyproject.toml +3 -0
- datacleaner_vb-0.1.0/setup.cfg +4 -0
- datacleaner_vb-0.1.0/setup.py +14 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacleaner-vb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Custom data preprocessing library
|
|
5
|
+
Author: Bharathan
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: scikit-learn
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: summary
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datacleaner-vb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Custom data preprocessing library
|
|
5
|
+
Author: Bharathan
|
|
6
|
+
Requires-Dist: pandas
|
|
7
|
+
Requires-Dist: numpy
|
|
8
|
+
Requires-Dist: scikit-learn
|
|
9
|
+
Dynamic: author
|
|
10
|
+
Dynamic: requires-dist
|
|
11
|
+
Dynamic: summary
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
setup.py
|
|
3
|
+
datacleaner_vb.egg-info/PKG-INFO
|
|
4
|
+
datacleaner_vb.egg-info/SOURCES.txt
|
|
5
|
+
datacleaner_vb.egg-info/dependency_links.txt
|
|
6
|
+
datacleaner_vb.egg-info/requires.txt
|
|
7
|
+
datacleaner_vb.egg-info/top_level.txt
|
|
8
|
+
preprocessing/API_Token.py
|
|
9
|
+
preprocessing/__init__.py
|
|
10
|
+
preprocessing/config.py
|
|
11
|
+
preprocessing/datatype.py
|
|
12
|
+
preprocessing/encoding.py
|
|
13
|
+
preprocessing/null_handling.py
|
|
14
|
+
preprocessing/outliers_handling.py
|
|
15
|
+
preprocessing/validation.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
preprocessing
|
|
File without changes
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
def structure():
    """Print the expected pipeline order and a ready-to-copy config template.

    The printed template is valid Python: pasting and exec'ing it yields the
    `config` dict consumed by validate/replace_nulls/zscore/
    isolation_forest_outliers/standard_scaler/encode.
    (The previous template had unbalanced braces in the "scaling" section and
    an extra trailing "}", so it could not be parsed — fixed here.)
    """
    config = '''config = {

    #1. Validation
    "fixes": {
        "Unit_Price": {
            "method": "clip",
            "min": 0,
            "max": 10000
        },
        "Payment_Mode": {
            "method": "replace",
            "values": {"Crypto": "Cash"}
        }
    },

    #2. MISSING
    "missing": {
        "Unit_Price": "median",
        "Quantity": "mean",
        "Payment_Mode": "mode"
    },

    # 3. OUTLIERS (Z-score)
    "zscore": {
        "Unit_Price": {"threshold": 2.5, "action": "cap"},
        "Quantity": {"threshold": 2, "action": "remove"}
    },

    # 4 Isolation Forest
    "isolation_forest": {
        "columns": ["Unit_Price", "Quantity"],
        "contamination": 0.1,
        "action": "remove"  # remove / nan / flag
    },

    # 5. SCALING (StandardScaler)
    "scaling": {
        "Unit_Price": "standard",
        "Quantity": "standard"
    },

    # 6. ENCODING
    "encoding": {
        "Payment_Mode": "onehot",
        "Customer_Feedback": "label"
    }
}'''

    print("Validation → Fixes → Missing → Outliers → Scaling → Encoding")
    print(config)
|
57
|
+
def functionName():
    """Print the import statements needed to use the preprocessing package."""
    usage_lines = (
        "",
        "from preprocessing.outliers_handling import zscore, iqr, isolation_forest_outliers, standard_scaler",
        "from preprocessing.null_handling import replace_nulls",
        "from preprocessing.validation import validate",
        "from preprocessing.encoding import encode",
        "from preprocessing.datatype import dtypeconversion ",
    )
    print("\n".join(usage_lines))
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def dtypeconversion(df):
    """Infer better dtypes for each column of *df* and return a converted copy.

    Heuristics, in priority order per column:
    - all-null or id-like (name contains "id") columns are left untouched;
    - >90% of non-null values parse as numbers  -> numeric dtype;
    - >90% of non-null values parse as datetimes -> datetime64;
    - fewer than 50 distinct values             -> pandas "category".
    Values that fail the winning parse are coerced to NaN/NaT.
    """
    out = df.copy()

    for name in out.columns:
        values = out[name].dropna()

        # nothing to infer from an all-null column
        if values.empty:
            continue

        # leave identifier-style columns untouched
        if "id" in name.lower():
            continue

        # mostly-numeric -> numeric dtype (non-parsable entries become NaN)
        if pd.to_numeric(values, errors="coerce").notna().mean() > 0.9:
            out[name] = pd.to_numeric(out[name], errors="coerce")
        # mostly-datetime -> datetime64 (format="mixed" avoids the
        # per-element inference warning on pandas >= 2.0)
        elif pd.to_datetime(values, errors="coerce", format="mixed").notna().mean() > 0.9:
            out[name] = pd.to_datetime(out[name], errors="coerce", format="mixed")
        # low-cardinality -> category
        elif out[name].nunique() < 50:
            out[name] = out[name].astype("category")

    return out
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
|
|
3
|
+
def encode(df, config):
    """Encode categorical columns according to config["encoding"].

    The config maps column name -> "label" (integer category codes) or
    "onehot" (dummy columns prefixed with the original name; the source
    column is dropped). Columns absent from *df* are silently skipped.
    Returns a new DataFrame; raises ValueError for an unknown method.
    """
    result = df.copy()

    if "encoding" not in config:
        return result

    for column, scheme in config["encoding"].items():
        if column not in result.columns:
            continue

        if scheme == "label":
            # deterministic integer codes from the (sorted) category order
            result[column] = result[column].astype("category").cat.codes
        elif scheme == "onehot":
            onehot = pd.get_dummies(result[column], prefix=column)
            result = pd.concat([result, onehot], axis=1).drop(columns=[column])
        else:
            raise ValueError(f"Invalid encoding method: {scheme}")

    return result
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
def replace_nulls(df, config):
    """Impute null values column-by-column and return a new DataFrame.

    Sentinel strings ("", "NA", "null") are first normalized to real NaN.
    Imputation rules map column name -> "mean", "median" or "mode" and are
    read from config["iqr"] (the plan produced by outliers_handling.iqr)
    and/or config["missing"] (the hand-written key used by the documented
    config template); when both are present, "missing" entries win.

    Columns absent from *df* are skipped (previously a KeyError), and an
    all-null column under "mode" is left as-is (previously an IndexError).
    Raises ValueError for an unknown method.
    """
    df = df.copy()

    # normalize fake nulls to real NaN
    df = df.replace(["", "NA", "null"], np.nan)

    # accept the auto-detected plan ("iqr") and/or the manual one ("missing")
    rules = {}
    for key in ("iqr", "missing"):
        rules.update(config.get(key, {}))

    for col, method in rules.items():
        if col not in df.columns:
            continue

        print(f"{col} nulls before:", df[col].isnull().sum())

        method = method.lower()

        if method == "mean":
            fill = df[col].mean()
        elif method == "median":
            fill = df[col].median()
        elif method == "mode":
            modes = df[col].mode()
            # an all-null column has no mode; leave its nulls in place
            fill = modes.iloc[0] if not modes.empty else np.nan
        else:
            raise ValueError("Invalid method")

        df[col] = df[col].fillna(fill)

        print(f"{col} nulls after:", df[col].isnull().sum())

    return df
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from null_handling import replace_nulls
|
|
4
|
+
|
|
5
|
+
#Detect Outliers Method IQR(inter-quantile Range)
|
|
6
|
+
# Column-name tokens that mark id/date-like columns to exclude from imputation
names = ["id", "datetime", "date", "timestamp"]

def iqr(df):
    """Build an imputation plan using the IQR (inter-quartile range) test.

    Columns whose (lower-cased) name contains a token from `names` are
    skipped. Numeric columns are assigned "median" when 1.5*IQR outliers
    exist (the median is robust to them) and "mean" otherwise; all other
    columns get "mode". The dtype test now uses pandas' numeric-dtype
    predicate so int32/float32/nullable numerics are covered too (the old
    `in ["float64", "int64"]` check missed them); bool columns are still
    treated as categorical, as before.

    Returns {"iqr": {column: method}} — the format consumed by
    null_handling.replace_nulls.
    """
    df = df.copy()
    plan = {}

    # columns eligible for imputation (skip id/date-like names)
    candidates = [c for c in df.columns
                  if not any(token in c.lower() for token in names)]

    for col in candidates:
        series = df[col]

        if (pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)):
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            spread = q3 - q1  # renamed: the old local shadowed this function

            lower = q1 - 1.5 * spread
            upper = q3 + 1.5 * spread

            has_outliers = ((series < lower) | (series > upper)).any()
            plan[col] = "median" if has_outliers else "mean"
        else:
            # categorical / text columns
            plan[col] = "mode"

    return {"iqr": plan}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
#Z Scorce Detection
|
|
47
|
+
def zscore(df, config):
    """Handle per-column outliers using the z-score test.

    config["zscore"] maps column name -> {"threshold": z (default 3),
    "action": "cap" | "remove" | "nan" (default "cap")}. "cap" clamps
    values to mean ± threshold*std, "nan" blanks them, "remove" drops the
    offending rows after all columns are processed. Columns absent from
    *df* or with zero std are skipped. Returns a new DataFrame; raises
    ValueError for an unknown action.
    """
    df = df.copy()

    if "zscore" not in config:
        return df

    # rows to keep after all "remove" actions are accumulated
    keep = pd.Series(True, index=df.index)

    for col, rules in config["zscore"].items():
        if col not in df.columns:
            continue

        limit = rules.get("threshold", 3)
        mu = df[col].mean()
        sigma = df[col].std()

        # constant column: no spread, nothing to flag
        if sigma == 0:
            continue

        is_outlier = np.abs((df[col] - mu) / sigma) > limit
        action = rules.get("action", "cap")

        if action == "remove":
            keep &= ~is_outlier
        elif action == "cap":
            hi = mu + limit * sigma
            lo = mu - limit * sigma
            df[col] = np.where(df[col] > hi, hi, df[col])
            df[col] = np.where(df[col] < lo, lo, df[col])
        elif action == "nan":
            df.loc[is_outlier, col] = np.nan
        else:
            raise ValueError(f"Invalid action: {action}")

    return df[keep]
|
|
92
|
+
|
|
93
|
+
#Standard Scaler Purpose for Train ML models
|
|
94
|
+
def standard_scaler(df, config):
    """Standardize columns listed in config["scaling"] to zero mean, unit std.

    Each entry maps column name -> "standard" (the only supported method,
    case-insensitive). Values are coerced to numeric first (non-parsable
    entries become NaN). Missing columns and zero-std columns are skipped
    with a printed notice. Returns a new DataFrame; raises ValueError for
    an unknown method.
    """
    df = df.copy()

    if "scaling" not in config:
        return df

    for col, method in config["scaling"].items():
        if col not in df.columns:
            print(f"{col} not found")
            continue

        # coerce to numeric so mean/std are well defined
        df[col] = pd.to_numeric(df[col], errors="coerce")

        method = method.lower()
        if method != "standard":
            raise ValueError(f"Invalid scaling method: {method}")

        center = df[col].mean()
        spread = df[col].std()

        # a constant column cannot be standardized
        if spread == 0:
            print(f"{col} std is 0, skipped")
            continue

        df[col] = df[col].sub(center).div(spread)

    return df
|
|
126
|
+
|
|
127
|
+
from sklearn.ensemble import IsolationForest
|
|
128
|
+
|
|
129
|
+
def isolation_forest_outliers(df, config):
    """Detect multivariate outliers with sklearn's IsolationForest.

    config["isolation_forest"] provides:
      - "columns": feature columns to fit on (default []);
      - "contamination": expected outlier fraction (default 0.1);
      - "action": "remove" rows, set them to "nan" in the feature columns,
        or "flag" them in a new boolean "is_outlier" column (default "remove").

    Robustness fixes: configured columns missing from *df* are ignored
    (previously a KeyError), an empty feature set returns *df* unchanged
    (previously a sklearn error), and NaNs are median-imputed in the fit
    matrix only — *df* itself is never modified by the imputation
    (previously IsolationForest raised on NaN input).

    random_state is pinned so results are reproducible.
    Returns a new DataFrame; raises ValueError for an unknown action.
    """
    df = df.copy()

    if "isolation_forest" not in config:
        return df

    rules = config["isolation_forest"]

    cols = [c for c in rules.get("columns", []) if c in df.columns]
    contamination = rules.get("contamination", 0.1)
    action = rules.get("action", "remove")

    # nothing to fit on
    if not cols:
        return df

    # ensure numeric; impute medians for the fit matrix only
    X = df[cols].apply(pd.to_numeric, errors="coerce")
    X = X.fillna(X.median())

    model = IsolationForest(contamination=contamination, random_state=42)
    # fit_predict returns -1 for outliers, 1 for inliers
    outliers = model.fit_predict(X) == -1

    if action == "remove":
        df = df[~outliers]

    elif action == "nan":
        df.loc[outliers, cols] = None

    elif action == "flag":
        df["is_outlier"] = outliers

    else:
        raise ValueError("Invalid action")

    return df
|
|
163
|
+
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
def validate(df, config):
    """Apply per-column validation fixes described in config["fixes"].

    Each entry maps a column name to a rule dict:
      - {"method": "clip", "min": ..., "max": ...} bounds values into
        [min, max]; an omitted bound is left open;
      - {"method": "replace", "values": {...}} substitutes values via the
        given mapping.
    Columns absent from *df* are skipped. Returns a new DataFrame; raises
    ValueError for an unknown method.
    """
    df = df.copy()

    if "fixes" not in config:
        return df

    for col, rules in config["fixes"].items():
        if col not in df.columns:
            continue

        method = rules.get("method")

        if method == "clip":
            df[col] = df[col].clip(lower=rules.get("min", None),
                                   upper=rules.get("max", None))
        elif method == "replace":
            df[col] = df[col].replace(rules.get("values", {}))
        else:
            raise ValueError(f"Invalid method: {method}")

    return df
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Packaging metadata for datacleaner-vb: a small data-preprocessing toolkit
# (validation, null handling, outlier handling, scaling, encoding) exposed
# as the `preprocessing` package.
setup(
    name="datacleaner-vb",
    version="0.1.0",
    packages=find_packages(),  # auto-discovers the `preprocessing` package
    # runtime dependencies used across the preprocessing modules
    install_requires=[
        "pandas",
        "numpy",
        "scikit-learn",
    ],
    author="Bharathan",
    description="Custom data preprocessing library",
)
|