PyPI - datawash-inspector - Versions diffs - 0.1.0__tar.gz - Mend

datawash-inspector 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

datawash_inspector-0.1.0/LICENSE +21 -0
datawash_inspector-0.1.0/MANIFEST.in +5 -0
datawash_inspector-0.1.0/PKG-INFO +76 -0
datawash_inspector-0.1.0/README.md +51 -0
datawash_inspector-0.1.0/datawash/__init__.py +51 -0
datawash_inspector-0.1.0/datawash/app.py +177 -0
datawash_inspector-0.1.0/datawash/autoprep.py +333 -0
datawash_inspector-0.1.0/datawash/cleaner.py +469 -0
datawash_inspector-0.1.0/datawash/eda_engine.py +611 -0
datawash_inspector-0.1.0/datawash/plot_factory.py +224 -0
datawash_inspector-0.1.0/datawash/stats_engine.py +374 -0
datawash_inspector-0.1.0/datawash_inspector.egg-info/PKG-INFO +76 -0
datawash_inspector-0.1.0/datawash_inspector.egg-info/SOURCES.txt +18 -0
datawash_inspector-0.1.0/datawash_inspector.egg-info/dependency_links.txt +1 -0
datawash_inspector-0.1.0/datawash_inspector.egg-info/requires.txt +6 -0
datawash_inspector-0.1.0/datawash_inspector.egg-info/top_level.txt +1 -0
datawash_inspector-0.1.0/pyproject.toml +34 -0
datawash_inspector-0.1.0/requirements.txt +6 -0
datawash_inspector-0.1.0/setup.cfg +4 -0
datawash_inspector-0.1.0/setup.py +33 -0

datawash_inspector-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 kishown
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

datawash_inspector-0.1.0/MANIFEST.in ADDED Viewed

@@ -0,0 +1,5 @@
+include README.md
+include LICENSE
+include pyproject.toml
+include setup.py
+include requirements.txt

datawash_inspector-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,76 @@
+Metadata-Version: 2.4
+Name: datawash-inspector
+Version: 0.1.0
+Summary: An automated end-to-end data cleaning, preprocessing, and EDA pipeline.
+Home-page: https://github.com/Kishown-e22194/datawash
+Author: kishown
+Author-email: kishown <kishown@example.com>
+Project-URL: Homepage, https://github.com/Kishown-e22194/datawash
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: pandas>=1.3.0
+Requires-Dist: numpy>=1.20.0
+Requires-Dist: scikit-learn>=1.0.0
+Requires-Dist: scipy>=1.7.0
+Requires-Dist: plotly>=5.0.0
+Requires-Dist: streamlit>=1.20.0
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+# DataWash 🔬
+**DataWash** is an automated, end-to-end Python data pipeline that cleans, prunes, prepares, and visualizes your datasets with zero manual effort. It includes a built-in interactive Streamlit dashboard for zero-code data exploration!
+## Installation
+Install the package from PyPI with pip (the distribution is named `datawash-inspector`; the import name is `datawash`):
+```bash
+pip install datawash-inspector
+```
+## Quick Start
+It only takes a few lines of code to completely sanitize, optimize, and encode your dataset for machine learning.
+```python
+from datawash import DataPipeline
+# 1. Initialize Pipeline
+pipe = DataPipeline()
+# 2. Load and Auto-Clean
+pipe.load_data('your_data.csv')
+pipe.sanitize_garbage().auto_type_correct()
+pipe.handle_missing_values(strategy='auto')
+# 3. Export Clean Data
+pipe.save_data('cleaned_data.csv')
+```
+## The Interactive Dashboard
+`datawash` comes with a fully automated EDA (Exploratory Data Analysis) dashboard powered by Streamlit. You can launch it directly from your script to visually explore your data!
+```python
+# Launch the dashboard to see Heatmaps, Bar Charts, and Health Reports!
+pipe.run_app()
+```
+## Features
+- **Memory Optimization:** Automatically downcasts data types to save memory (reductions of 50% or more are possible).
+- **Auto-Cleaning:** Automatically converts pure whitespace strings or garbage text (`"N/A"`, `"?"`, `"-"`) to `NaN` values.
+- **Smart Imputation:** Dynamically imputes missing numeric values with the median and categorical values with the mode.
+- **Statistical EDA:** Calculates an advanced unified correlation matrix supporting numerical, categorical, and mixed variable types (Cramér's V, Eta, Spearman).
+- **Feature Pruning:** Automatically identifies and drops highly redundant features (e.g., `> 90%` correlation) and high-cardinality IDs.
+## License
+This project is licensed under the MIT License.

datawash_inspector-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,51 @@
+# DataWash 🔬
+**DataWash** is an automated, end-to-end Python data pipeline that cleans, prunes, prepares, and visualizes your datasets with zero manual effort. It includes a built-in interactive Streamlit dashboard for zero-code data exploration!
+## Installation
+Install the package from PyPI with pip (the distribution is named `datawash-inspector`; the import name is `datawash`):
+```bash
+pip install datawash-inspector
+```
+## Quick Start
+It only takes a few lines of code to completely sanitize, optimize, and encode your dataset for machine learning.
+```python
+from datawash import DataPipeline
+# 1. Initialize Pipeline
+pipe = DataPipeline()
+# 2. Load and Auto-Clean
+pipe.load_data('your_data.csv')
+pipe.sanitize_garbage().auto_type_correct()
+pipe.handle_missing_values(strategy='auto')
+# 3. Export Clean Data
+pipe.save_data('cleaned_data.csv')
+```
+## The Interactive Dashboard
+`datawash` comes with a fully automated EDA (Exploratory Data Analysis) dashboard powered by Streamlit. You can launch it directly from your script to visually explore your data!
+```python
+# Launch the dashboard to see Heatmaps, Bar Charts, and Health Reports!
+pipe.run_app()
+```
+## Features
+- **Memory Optimization:** Automatically downcasts data types to save memory (reductions of 50% or more are possible).
+- **Auto-Cleaning:** Automatically converts pure whitespace strings or garbage text (`"N/A"`, `"?"`, `"-"`) to `NaN` values.
+- **Smart Imputation:** Dynamically imputes missing numeric values with the median and categorical values with the mode.
+- **Statistical EDA:** Calculates an advanced unified correlation matrix supporting numerical, categorical, and mixed variable types (Cramér's V, Eta, Spearman).
+- **Feature Pruning:** Automatically identifies and drops highly redundant features (e.g., `> 90%` correlation) and high-cardinality IDs.
+## License
+This project is licensed under the MIT License.

datawash_inspector-0.1.0/datawash/__init__.py ADDED Viewed

@@ -0,0 +1,51 @@
+"""
+datawash — A Python Data Cleaning & EDA Library
+=====================================================
+Provides a unified ``DataPipeline`` class for end-to-end data preparation,
+and a standalone ``PlottingMethods`` factory for generating embeddable HTML charts.
+Inheritance chain:
+    autoprep.DataPipeline          → Step 1: Ingestion, sanitization, type correction
+      └─ cleaner.DataPipeline      → Step 2: Summary, health report, imputation, outliers
+           └─ eda_engine.DataPipeline → Step 3+4: Feature engineering & visualisation
+                └─ stats_engine.InsightsEngine → Step 5: Unified statistical associations
+Usage::
+    from datawash import DataPipeline
+    pipe = DataPipeline()
+    # Step 1 — Ingest & Prep
+    pipe.load_data('data.csv').sanitize_garbage().auto_type_correct()
+    # Step 2 — Explore & Clean
+    pipe.data_health_report()
+    pipe.handle_missing_values(strategy='auto').remove_duplicates()
+    # Step 3 — Feature Engineering (leak-safe)
+    from sklearn.model_selection import train_test_split
+    train, test = train_test_split(pipe.df, test_size=0.2)
+    pipe.fit_transform_ml(train, num_strategy='standard', cat_strategy='onehot')
+    test_processed = pipe.transform_ml(test)
+    # Or quick (full-dataset) convenience wrapper:
+    # pipe.prepare_for_ml(num_strategy='standard', cat_strategy='onehot')
+    # Step 4 — Visualise
+    pipe.plot_correlation_matrix()
+    pipe.plot_missing_values()
+    pipe.univariate_all()
+    # Step 5 — Statistical Associations
+    pipe.plot_all_associations_heatmap()
+    # Export
+    pipe.save_data('cleaned.csv')
+"""
+from .stats_engine import InsightsEngine as DataPipeline
+from .plot_factory import PlottingMethods, compose_report
+__all__ = ['DataPipeline', 'PlottingMethods', 'compose_report']
+__version__ = '0.3.0'

datawash_inspector-0.1.0/datawash/app.py ADDED Viewed

@@ -0,0 +1,177 @@
+import streamlit as st
+import pandas as pd
+import sys
+import os
+import io
+# Point to the parent of datawash so the package's relative imports work
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
+from datawash import DataPipeline, PlottingMethods
+st.set_page_config(page_title="datawash Dashboard", layout="wide", page_icon="🔬")
+st.title("🔬 datawash: Automated Data Pipeline")
+st.markdown("Upload your dataset to automatically clean, prune, and explore your data without writing a single line of code.")
+# --- SIDEBAR: Controls ---
+preloaded_data_path = os.environ.get("DATAWASH_APP_DATA")
+if preloaded_data_path and os.path.exists(preloaded_data_path):
+    st.sidebar.success(f"📂 Programmatically loaded data from API.")
+    file_name = "Programmatic_Data"
+    file_to_load = preloaded_data_path
+else:
+    uploaded_file = st.sidebar.file_uploader("Upload your dataset (CSV)", type=['csv'])
+    file_name = uploaded_file.name if uploaded_file else None
+    file_to_load = uploaded_file
+if file_to_load is not None:
+    # Initialize the pipeline inside Streamlit's session state so it persists across button clicks
+    if 'pipe' not in st.session_state or st.session_state.get('last_uploaded') != file_name:
+        st.session_state.pipe = DataPipeline()
+        df = pd.read_csv(file_to_load)
+        st.session_state.pipe.df = df.copy()
+        st.session_state.pipe.df_original = df.copy()
+        st.session_state.last_uploaded = file_name
+    pipe = st.session_state.pipe
+    st.sidebar.success(f"Loaded {pipe.df.shape[0]} rows and {pipe.df.shape[1]} columns.")
+    st.sidebar.header("1. Memory Optimization")
+    if st.sidebar.button("Downcast Data Types"):
+        with st.spinner("Optimizing memory..."):
+            pipe.optimize_memory()
+            st.sidebar.success("Memory Optimized!")
+    st.sidebar.header("2. Auto-Cleaning")
+    imputation_strategy = st.sidebar.selectbox(
+        "Missing Value Strategy",
+        ['auto', 'mean', 'median', 'mode', 'drop']
+    )
+    do_text_standardization = st.sidebar.checkbox("Standardize Text (Lowercase & Strip)", value=True)
+    if st.sidebar.button("Sanitize & Impute"):
+        with st.spinner("Cleaning data..."):
+            pipe.sanitize_garbage().auto_type_correct()
+            if do_text_standardization:
+                pipe.standardize_text()
+            pipe.handle_missing_values(strategy=imputation_strategy)
+            st.sidebar.success("Data Cleaned & Imputed!")
+    st.sidebar.header("3. Feature Pruning")
+    threshold = st.sidebar.slider("Redundancy Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.05)
+    if st.sidebar.button("Drop Redundant Features"):
+        with st.spinner("Calculating associations and dropping..."):
+            pipe.drop_redundant_features(threshold=threshold)
+            st.sidebar.success(f"Redundant features (>{threshold}) dropped!")
+    st.sidebar.header("4. Export")
+    csv = pipe.df.to_csv(index=False).encode('utf-8')
+    st.sidebar.download_button(
+        label="Download Cleaned CSV",
+        data=csv,
+        file_name='cleaned_data.csv',
+        mime='text/csv',
+    )
+    # --- MAIN PANEL ---
+    tab1, tab2, tab3 = st.tabs(["Data Preview", "Health Report", "Visualizations"])
+    with tab1:
+        st.subheader("Dataset Preview")
+        st.dataframe(pipe.df.head(100))
+        col1, col2, col3 = st.columns(3)
+        col1.metric("Rows", pipe.df.shape[0])
+        col2.metric("Columns", pipe.df.shape[1])
+        mem_mb = pipe.df.memory_usage().sum() / 1024**2
+        col3.metric("Memory Usage (MB)", f"{mem_mb:.2f}")
+    with tab2:
+        st.subheader("Data Health Report")
+        health = pipe._compute_health_data()
+        # Missing values
+        if health['missing']:
+            st.warning(f"⚠️ {len(health['missing'])} columns have missing values:")
+            missing_df = pd.DataFrame([
+                {'Column': col, 'Missing Count': v['count'], 'Percentage': f"{v['pct']}%"}
+                for col, v in health['missing'].items()
+            ])
+            st.dataframe(missing_df, hide_index=True)
+        else:
+            st.success("✅ No missing values found.")
+        if health['constant_cols']:
+            st.warning(f"🟡 {len(health['constant_cols'])} Constant Columns (Zero Information):")
+            st.write(health['constant_cols'])
+        else:
+            st.success("✅ No constant columns found.")
+        if health['high_cardinality']:
+            st.warning(f"🟡 {len(health['high_cardinality'])} High-Cardinality Categoricals (Likely IDs):")
+            st.write([h['col'] for h in health['high_cardinality']])
+        else:
+            st.success("✅ No high-cardinality categorical columns detected.")
+    with tab3:
+        st.subheader("Exploratory Data Analysis")
+        plotter = PlottingMethods(df=pipe.df)
+        col1, col2 = st.columns(2)
+        with col1:
+            cat_cols = pipe.df.select_dtypes(exclude=['number']).columns.tolist()
+            if cat_cols:
+                selected_cat = st.selectbox("Select Categorical Feature for Bar Chart", cat_cols)
+                if selected_cat:
+                    html = plotter.get_bar_html(selected_cat)
+                    st.components.v1.html(html, height=500, scrolling=True)
+            else:
+                st.info("No categorical features available to plot.")
+        with col2:
+            num_cols = pipe.df.select_dtypes(include=['number']).columns.tolist()
+            if num_cols:
+                selected_num = st.selectbox("Select Numeric Feature for Histogram", num_cols)
+                if selected_num:
+                    html = plotter.get_histogram_html(selected_num)
+                    st.components.v1.html(html, height=500, scrolling=True)
+            else:
+                st.info("No numeric features available to plot.")
+        st.subheader("Bivariate Associations")
+        if len(pipe.df.columns) >= 2:
+            if preloaded_data_path or st.button("Generate Unified Heatmap (May take a few seconds)"):
+                with st.spinner("Calculating Cramér's V, Eta, and Spearman's rho..."):
+                    fig = pipe.plot_all_associations_heatmap(return_fig=True)
+                    if fig is not None and fig != pipe:
+                        st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.info("Need at least 2 columns to calculate associations.")
+        st.subheader("Advanced Charts")
+        col3, col4 = st.columns(2)
+        with col3:
+            st.write("**Scatter Plot**")
+            if len(pipe.df.columns) >= 2:
+                scat_x = st.selectbox("X-Axis", pipe.df.columns.tolist())
+                scat_y = st.selectbox("Y-Axis", pipe.df.columns.tolist()[::-1])
+                if scat_x and scat_y:
+                    scat_html = plotter.get_scatter_html(scat_x, scat_y)
+                    st.components.v1.html(scat_html, height=500, scrolling=True)
+        with col4:
+            st.write("**Pie Chart**")
+            if cat_cols:
+                pie_cat = st.selectbox("Category", cat_cols, key='pie_cat')
+                if pie_cat:
+                    pie_html = plotter.get_pie_html(names=pie_cat)
+                    st.components.v1.html(pie_html, height=500, scrolling=True)
+            else:
+                st.info("No categorical features available for Pie Chart.")
+else:
+    st.info("👈 Please upload a CSV file from the sidebar to begin.")