datawash-inspector 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 kishown
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,5 @@
1
+ include README.md
2
+ include LICENSE
3
+ include pyproject.toml
4
+ include setup.py
5
+ include requirements.txt
@@ -0,0 +1,76 @@
1
+ Metadata-Version: 2.4
2
+ Name: datawash-inspector
3
+ Version: 0.1.0
4
+ Summary: An automated end-to-end data cleaning, preprocessing, and EDA pipeline.
5
+ Home-page: https://github.com/Kishown-e22194/datawash
6
+ Author: kishown
7
+ Author-email: kishown <kishown@example.com>
8
+ Project-URL: Homepage, https://github.com/Kishown-e22194/datawash
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Requires-Python: >=3.8
13
+ Description-Content-Type: text/markdown
14
+ License-File: LICENSE
15
+ Requires-Dist: pandas>=1.3.0
16
+ Requires-Dist: numpy>=1.20.0
17
+ Requires-Dist: scikit-learn>=1.0.0
18
+ Requires-Dist: scipy>=1.7.0
19
+ Requires-Dist: plotly>=5.0.0
20
+ Requires-Dist: streamlit>=1.20.0
21
+ Dynamic: author
22
+ Dynamic: home-page
23
+ Dynamic: license-file
24
+ Dynamic: requires-python
25
+
26
+ # DataWash 🔬
27
+
28
+ **DataWash** is an automated, end-to-end Python data pipeline that cleans, prunes, prepares, and visualizes your datasets with zero manual effort. It includes a built-in interactive Streamlit dashboard for zero-code data exploration!
29
+
30
+ ## Installation
31
+
32
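+ You can install `datawash` via pip (the distribution is published on PyPI as `datawash-inspector`, while the import name is `datawash`):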
33
+
34
+ ```bash
35
+ pip install datawash-inspector
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ It only takes a few lines of code to load, sanitize, type-correct, and impute your dataset, then export the cleaned result.
41
+
42
+ ```python
43
+ from datawash import DataPipeline
44
+
45
+ # 1. Initialize Pipeline
46
+ pipe = DataPipeline()
47
+
48
+ # 2. Load and Auto-Clean
49
+ pipe.load_data('your_data.csv')
50
+ pipe.sanitize_garbage().auto_type_correct()
51
+ pipe.handle_missing_values(strategy='auto')
52
+
53
+ # 3. Export Clean Data
54
+ pipe.save_data('cleaned_data.csv')
55
+ ```
56
+
57
+ ## The Interactive Dashboard
58
+
59
+ `datawash` comes with a fully automated EDA (Exploratory Data Analysis) dashboard powered by Streamlit. You can launch it directly from your script to visually explore your data!
60
+
61
+ ```python
62
+ # Launch the dashboard to see Heatmaps, Bar Charts, and Health Reports!
63
+ pipe.run_app()
64
+ ```
65
+
66
+ ## Features
67
+
68
+ - **Memory Optimization:** Automatically downcasts large numbers to save memory (up to 50%+ reduction).
69
+ - **Auto-Cleaning:** Automatically converts pure whitespace strings or garbage text (`"N/A"`, `"?"`, `"-"`) to `NaN` values.
70
+ - **Smart Imputation:** Dynamically imputes missing numeric values with the median and categorical values with the mode.
71
+ - **Statistical EDA:** Calculates an advanced unified correlation matrix supporting numerical, categorical, and mixed variable types (Cramér's V, Eta, Spearman).
72
+ - **Feature Pruning:** Automatically identifies and drops highly redundant features (e.g., `> 90%` correlation) and high-cardinality IDs.
73
+
74
+ ## License
75
+
76
+ This project is licensed under the MIT License.
@@ -0,0 +1,51 @@
1
+ # DataWash 🔬
2
+
3
+ **DataWash** is an automated, end-to-end Python data pipeline that cleans, prunes, prepares, and visualizes your datasets with zero manual effort. It includes a built-in interactive Streamlit dashboard for zero-code data exploration!
4
+
5
+ ## Installation
6
+
7
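+ You can install `datawash` via pip (the distribution is published on PyPI as `datawash-inspector`, while the import name is `datawash`):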
8
+
9
+ ```bash
10
+ pip install datawash-inspector
11
+ ```
12
+
13
+ ## Quick Start
14
+
15
+ It only takes a few lines of code to load, sanitize, type-correct, and impute your dataset, then export the cleaned result.
16
+
17
+ ```python
18
+ from datawash import DataPipeline
19
+
20
+ # 1. Initialize Pipeline
21
+ pipe = DataPipeline()
22
+
23
+ # 2. Load and Auto-Clean
24
+ pipe.load_data('your_data.csv')
25
+ pipe.sanitize_garbage().auto_type_correct()
26
+ pipe.handle_missing_values(strategy='auto')
27
+
28
+ # 3. Export Clean Data
29
+ pipe.save_data('cleaned_data.csv')
30
+ ```
31
+
32
+ ## The Interactive Dashboard
33
+
34
+ `datawash` comes with a fully automated EDA (Exploratory Data Analysis) dashboard powered by Streamlit. You can launch it directly from your script to visually explore your data!
35
+
36
+ ```python
37
+ # Launch the dashboard to see Heatmaps, Bar Charts, and Health Reports!
38
+ pipe.run_app()
39
+ ```
40
+
41
+ ## Features
42
+
43
+ - **Memory Optimization:** Automatically downcasts large numbers to save memory (up to 50%+ reduction).
44
+ - **Auto-Cleaning:** Automatically converts pure whitespace strings or garbage text (`"N/A"`, `"?"`, `"-"`) to `NaN` values.
45
+ - **Smart Imputation:** Dynamically imputes missing numeric values with the median and categorical values with the mode.
46
+ - **Statistical EDA:** Calculates an advanced unified correlation matrix supporting numerical, categorical, and mixed variable types (Cramér's V, Eta, Spearman).
47
+ - **Feature Pruning:** Automatically identifies and drops highly redundant features (e.g., `> 90%` correlation) and high-cardinality IDs.
48
+
49
+ ## License
50
+
51
+ This project is licensed under the MIT License.
@@ -0,0 +1,51 @@
1
+ """
2
+ datawash — A Python Data Cleaning & EDA Library
3
+ =====================================================
4
+ Provides a unified ``DataPipeline`` class for end-to-end data preparation,
5
+ and a standalone ``PlottingMethods`` factory for generating embeddable HTML charts.
6
+
7
+ Inheritance chain:
8
+ autoprep.DataPipeline → Step 1: Ingestion, sanitization, type correction
9
+ └─ cleaner.DataPipeline → Step 2: Summary, health report, imputation, outliers
10
+ └─ eda_engine.DataPipeline → Step 3+4: Feature engineering & visualisation
11
+ └─ stats_engine.InsightsEngine → Step 5: Unified statistical associations
12
+
13
+ Usage::
14
+
15
+ from datawash import DataPipeline
16
+
17
+ pipe = DataPipeline()
18
+
19
+ # Step 1 — Ingest & Prep
20
+ pipe.load_data('data.csv').sanitize_garbage().auto_type_correct()
21
+
22
+ # Step 2 — Explore & Clean
23
+ pipe.data_health_report()
24
+ pipe.handle_missing_values(strategy='auto').remove_duplicates()
25
+
26
+ # Step 3 — Feature Engineering (leak-safe)
27
+ from sklearn.model_selection import train_test_split
28
+ train, test = train_test_split(pipe.df, test_size=0.2)
29
+ pipe.fit_transform_ml(train, num_strategy='standard', cat_strategy='onehot')
30
+ test_processed = pipe.transform_ml(test)
31
+
32
+ # Or quick (full-dataset) convenience wrapper:
33
+ # pipe.prepare_for_ml(num_strategy='standard', cat_strategy='onehot')
34
+
35
+ # Step 4 — Visualise
36
+ pipe.plot_correlation_matrix()
37
+ pipe.plot_missing_values()
38
+ pipe.univariate_all()
39
+
40
+ # Step 5 — Statistical Associations
41
+ pipe.plot_all_associations_heatmap()
42
+
43
+ # Export
44
+ pipe.save_data('cleaned.csv')
45
+ """
46
+
47
+ from .stats_engine import InsightsEngine as DataPipeline
48
+ from .plot_factory import PlottingMethods, compose_report
49
+
50
+ __all__ = ['DataPipeline', 'PlottingMethods', 'compose_report']
51
+ __version__ = '0.3.0'
@@ -0,0 +1,177 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import sys
4
+ import os
5
+ import io
6
+
7
+ # Point to the parent of datawash so the package's relative imports work
8
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
9
+ from datawash import DataPipeline, PlottingMethods
10
+
11
+ st.set_page_config(page_title="datawash Dashboard", layout="wide", page_icon="🔬")
12
+
13
+ st.title("🔬 datawash: Automated Data Pipeline")
14
+ st.markdown("Upload your dataset to automatically clean, prune, and explore your data without writing a single line of code.")
15
+
16
+ # --- SIDEBAR: Controls ---
17
+ preloaded_data_path = os.environ.get("DATAWASH_APP_DATA")
18
+
19
+ if preloaded_data_path and os.path.exists(preloaded_data_path):
20
+ st.sidebar.success(f"📂 Programmatically loaded data from API.")
21
+ file_name = "Programmatic_Data"
22
+ file_to_load = preloaded_data_path
23
+ else:
24
+ uploaded_file = st.sidebar.file_uploader("Upload your dataset (CSV)", type=['csv'])
25
+ file_name = uploaded_file.name if uploaded_file else None
26
+ file_to_load = uploaded_file
27
+
28
+ if file_to_load is not None:
29
+ # Initialize the pipeline inside Streamlit's session state so it persists across button clicks
30
+ if 'pipe' not in st.session_state or st.session_state.get('last_uploaded') != file_name:
31
+ st.session_state.pipe = DataPipeline()
32
+ df = pd.read_csv(file_to_load)
33
+ st.session_state.pipe.df = df.copy()
34
+ st.session_state.pipe.df_original = df.copy()
35
+ st.session_state.last_uploaded = file_name
36
+
37
+ pipe = st.session_state.pipe
38
+
39
+ st.sidebar.success(f"Loaded {pipe.df.shape[0]} rows and {pipe.df.shape[1]} columns.")
40
+
41
+ st.sidebar.header("1. Memory Optimization")
42
+ if st.sidebar.button("Downcast Data Types"):
43
+ with st.spinner("Optimizing memory..."):
44
+ pipe.optimize_memory()
45
+ st.sidebar.success("Memory Optimized!")
46
+
47
+ st.sidebar.header("2. Auto-Cleaning")
48
+ imputation_strategy = st.sidebar.selectbox(
49
+ "Missing Value Strategy",
50
+ ['auto', 'mean', 'median', 'mode', 'drop']
51
+ )
52
+ do_text_standardization = st.sidebar.checkbox("Standardize Text (Lowercase & Strip)", value=True)
53
+
54
+ if st.sidebar.button("Sanitize & Impute"):
55
+ with st.spinner("Cleaning data..."):
56
+ pipe.sanitize_garbage().auto_type_correct()
57
+ if do_text_standardization:
58
+ pipe.standardize_text()
59
+ pipe.handle_missing_values(strategy=imputation_strategy)
60
+ st.sidebar.success("Data Cleaned & Imputed!")
61
+
62
+ st.sidebar.header("3. Feature Pruning")
63
+ threshold = st.sidebar.slider("Redundancy Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.05)
64
+ if st.sidebar.button("Drop Redundant Features"):
65
+ with st.spinner("Calculating associations and dropping..."):
66
+ pipe.drop_redundant_features(threshold=threshold)
67
+ st.sidebar.success(f"Redundant features (>{threshold}) dropped!")
68
+
69
+ st.sidebar.header("4. Export")
70
+ csv = pipe.df.to_csv(index=False).encode('utf-8')
71
+ st.sidebar.download_button(
72
+ label="Download Cleaned CSV",
73
+ data=csv,
74
+ file_name='cleaned_data.csv',
75
+ mime='text/csv',
76
+ )
77
+
78
+ # --- MAIN PANEL ---
79
+ tab1, tab2, tab3 = st.tabs(["Data Preview", "Health Report", "Visualizations"])
80
+
81
+ with tab1:
82
+ st.subheader("Dataset Preview")
83
+ st.dataframe(pipe.df.head(100))
84
+
85
+ col1, col2, col3 = st.columns(3)
86
+ col1.metric("Rows", pipe.df.shape[0])
87
+ col2.metric("Columns", pipe.df.shape[1])
88
+ mem_mb = pipe.df.memory_usage().sum() / 1024**2
89
+ col3.metric("Memory Usage (MB)", f"{mem_mb:.2f}")
90
+
91
+ with tab2:
92
+ st.subheader("Data Health Report")
93
+
94
+ health = pipe._compute_health_data()  # the keys read below are 'missing', 'constant_cols' and 'high_cardinality'
95
+
96
+ # Missing values: health['missing'] is iterated as a mapping of column name -> entry with 'count' and 'pct' keys
97
+ if health['missing']:
98
+ st.warning(f"⚠️ {len(health['missing'])} columns have missing values:")
99
+ missing_df = pd.DataFrame([
100
+ {'Column': col, 'Missing Count': v['count'], 'Percentage': f"{v['pct']}%"}
101
+ for col, v in health['missing'].items()
102
+ ])
103
+ st.dataframe(missing_df, hide_index=True)
104
+ else:
105
+ st.success("✅ No missing values found.")
106
+
107
+ if health['constant_cols']:  # sized collection, written out as-is below
108
+ st.warning(f"🟡 {len(health['constant_cols'])} Constant Columns (Zero Information):")
109
+ st.write(health['constant_cols'])
110
+ else:
111
+ st.success("✅ No constant columns found.")
112
+
113
+ if health['high_cardinality']:  # each entry is indexed with 'col' to obtain the column name
114
+ st.warning(f"🟡 {len(health['high_cardinality'])} High-Cardinality Categoricals (Likely IDs):")
115
+ st.write([h['col'] for h in health['high_cardinality']])
116
+ else:
117
+ st.success("✅ No high-cardinality categorical columns detected.")
118
+
119
+ with tab3:
120
+ st.subheader("Exploratory Data Analysis")
121
+ plotter = PlottingMethods(df=pipe.df)
122
+
123
+ col1, col2 = st.columns(2)
124
+ with col1:
125
+ cat_cols = pipe.df.select_dtypes(exclude=['number']).columns.tolist()
126
+ if cat_cols:
127
+ selected_cat = st.selectbox("Select Categorical Feature for Bar Chart", cat_cols)
128
+ if selected_cat:
129
+ html = plotter.get_bar_html(selected_cat)
130
+ st.components.v1.html(html, height=500, scrolling=True)
131
+ else:
132
+ st.info("No categorical features available to plot.")
133
+
134
+ with col2:
135
+ num_cols = pipe.df.select_dtypes(include=['number']).columns.tolist()
136
+ if num_cols:
137
+ selected_num = st.selectbox("Select Numeric Feature for Histogram", num_cols)
138
+ if selected_num:
139
+ html = plotter.get_histogram_html(selected_num)
140
+ st.components.v1.html(html, height=500, scrolling=True)
141
+ else:
142
+ st.info("No numeric features available to plot.")
143
+
144
+ st.subheader("Bivariate Associations")
145
+ if len(pipe.df.columns) >= 2:
146
+ if preloaded_data_path or st.button("Generate Unified Heatmap (May take a few seconds)"):
147
+ with st.spinner("Calculating Cramér's V, Eta, and Spearman's rho..."):
148
+ fig = pipe.plot_all_associations_heatmap(return_fig=True)
149
+ if fig is not None and fig != pipe:
150
+ st.plotly_chart(fig, use_container_width=True)
151
+ else:
152
+ st.info("Need at least 2 columns to calculate associations.")
153
+
154
+ st.subheader("Advanced Charts")
155
+ col3, col4 = st.columns(2)
156
+
157
+ with col3:
158
+ st.write("**Scatter Plot**")
159
+ if len(pipe.df.columns) >= 2:
160
+ scat_x = st.selectbox("X-Axis", pipe.df.columns.tolist())
161
+ scat_y = st.selectbox("Y-Axis", pipe.df.columns.tolist()[::-1])
162
+ if scat_x and scat_y:
163
+ scat_html = plotter.get_scatter_html(scat_x, scat_y)
164
+ st.components.v1.html(scat_html, height=500, scrolling=True)
165
+
166
+ with col4:
167
+ st.write("**Pie Chart**")
168
+ if cat_cols:
169
+ pie_cat = st.selectbox("Category", cat_cols, key='pie_cat')
170
+ if pie_cat:
171
+ pie_html = plotter.get_pie_html(names=pie_cat)
172
+ st.components.v1.html(pie_html, height=500, scrolling=True)
173
+ else:
174
+ st.info("No categorical features available for Pie Chart.")
175
+
176
+ else:
177
+ st.info("👈 Please upload a CSV file from the sidebar to begin.")