datawash 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datawash-0.2.0/LICENSE +21 -0
- datawash-0.2.0/PKG-INFO +353 -0
- datawash-0.2.0/README.md +305 -0
- datawash-0.2.0/pyproject.toml +104 -0
- datawash-0.2.0/setup.cfg +4 -0
- datawash-0.2.0/src/datawash/__init__.py +9 -0
- datawash-0.2.0/src/datawash/adapters/__init__.py +12 -0
- datawash-0.2.0/src/datawash/adapters/base.py +66 -0
- datawash-0.2.0/src/datawash/adapters/csv_adapter.py +23 -0
- datawash-0.2.0/src/datawash/adapters/excel_adapter.py +36 -0
- datawash-0.2.0/src/datawash/adapters/json_adapter.py +21 -0
- datawash-0.2.0/src/datawash/adapters/parquet_adapter.py +34 -0
- datawash-0.2.0/src/datawash/cli/__init__.py +0 -0
- datawash-0.2.0/src/datawash/cli/formatters.py +110 -0
- datawash-0.2.0/src/datawash/cli/main.py +168 -0
- datawash-0.2.0/src/datawash/codegen/__init__.py +1 -0
- datawash-0.2.0/src/datawash/codegen/generator.py +72 -0
- datawash-0.2.0/src/datawash/core/__init__.py +1 -0
- datawash-0.2.0/src/datawash/core/cache.py +64 -0
- datawash-0.2.0/src/datawash/core/config.py +56 -0
- datawash-0.2.0/src/datawash/core/dtypes.py +24 -0
- datawash-0.2.0/src/datawash/core/exceptions.py +21 -0
- datawash-0.2.0/src/datawash/core/models.py +78 -0
- datawash-0.2.0/src/datawash/core/report.py +430 -0
- datawash-0.2.0/src/datawash/core/sampling.py +84 -0
- datawash-0.2.0/src/datawash/detectors/__init__.py +13 -0
- datawash-0.2.0/src/datawash/detectors/base.py +27 -0
- datawash-0.2.0/src/datawash/detectors/duplicate_detector.py +56 -0
- datawash-0.2.0/src/datawash/detectors/format_detector.py +130 -0
- datawash-0.2.0/src/datawash/detectors/missing_detector.py +78 -0
- datawash-0.2.0/src/datawash/detectors/outlier_detector.py +93 -0
- datawash-0.2.0/src/datawash/detectors/registry.py +64 -0
- datawash-0.2.0/src/datawash/detectors/similarity_detector.py +294 -0
- datawash-0.2.0/src/datawash/detectors/type_detector.py +100 -0
- datawash-0.2.0/src/datawash/profiler/__init__.py +1 -0
- datawash-0.2.0/src/datawash/profiler/engine.py +88 -0
- datawash-0.2.0/src/datawash/profiler/parallel.py +122 -0
- datawash-0.2.0/src/datawash/profiler/patterns.py +80 -0
- datawash-0.2.0/src/datawash/profiler/statistics.py +41 -0
- datawash-0.2.0/src/datawash/suggestors/__init__.py +1 -0
- datawash-0.2.0/src/datawash/suggestors/base.py +15 -0
- datawash-0.2.0/src/datawash/suggestors/engine.py +327 -0
- datawash-0.2.0/src/datawash/suggestors/prioritizer.py +23 -0
- datawash-0.2.0/src/datawash/transformers/__init__.py +13 -0
- datawash-0.2.0/src/datawash/transformers/base.py +27 -0
- datawash-0.2.0/src/datawash/transformers/categories.py +64 -0
- datawash-0.2.0/src/datawash/transformers/columns.py +72 -0
- datawash-0.2.0/src/datawash/transformers/duplicates.py +43 -0
- datawash-0.2.0/src/datawash/transformers/formats.py +95 -0
- datawash-0.2.0/src/datawash/transformers/missing.py +201 -0
- datawash-0.2.0/src/datawash/transformers/registry.py +30 -0
- datawash-0.2.0/src/datawash/transformers/types.py +95 -0
- datawash-0.2.0/src/datawash.egg-info/PKG-INFO +353 -0
- datawash-0.2.0/src/datawash.egg-info/SOURCES.txt +69 -0
- datawash-0.2.0/src/datawash.egg-info/dependency_links.txt +1 -0
- datawash-0.2.0/src/datawash.egg-info/entry_points.txt +2 -0
- datawash-0.2.0/src/datawash.egg-info/requires.txt +30 -0
- datawash-0.2.0/src/datawash.egg-info/top_level.txt +1 -0
- datawash-0.2.0/tests/test_adapters.py +35 -0
- datawash-0.2.0/tests/test_cache.py +90 -0
- datawash-0.2.0/tests/test_cli.py +89 -0
- datawash-0.2.0/tests/test_codegen.py +51 -0
- datawash-0.2.0/tests/test_detectors.py +212 -0
- datawash-0.2.0/tests/test_dtypes.py +66 -0
- datawash-0.2.0/tests/test_edge_cases.py +267 -0
- datawash-0.2.0/tests/test_integration.py +103 -0
- datawash-0.2.0/tests/test_parallel.py +96 -0
- datawash-0.2.0/tests/test_profiler.py +47 -0
- datawash-0.2.0/tests/test_sampling.py +68 -0
- datawash-0.2.0/tests/test_suggestors.py +210 -0
- datawash-0.2.0/tests/test_transformers.py +143 -0
datawash-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 DataWash Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
datawash-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datawash
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Intelligent data cleaning and quality analysis
|
|
5
|
+
Author: Sai Pranav Krovvidi
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Pranav1011/DataWash
|
|
8
|
+
Project-URL: Repository, https://github.com/Pranav1011/DataWash
|
|
9
|
+
Project-URL: Issues, https://github.com/Pranav1011/DataWash/issues
|
|
10
|
+
Keywords: data-cleaning,data-quality,pandas,etl,data-analysis
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: pandas>=1.5.0
|
|
22
|
+
Requires-Dist: numpy>=1.21.0
|
|
23
|
+
Requires-Dist: pydantic>=2.0.0
|
|
24
|
+
Requires-Dist: rich>=13.0.0
|
|
25
|
+
Requires-Dist: typer>=0.9.0
|
|
26
|
+
Provides-Extra: ml
|
|
27
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "ml"
|
|
28
|
+
Requires-Dist: datasketch>=1.5.0; extra == "ml"
|
|
29
|
+
Requires-Dist: scikit-learn>=1.0.0; extra == "ml"
|
|
30
|
+
Requires-Dist: python-Levenshtein>=0.21.0; extra == "ml"
|
|
31
|
+
Provides-Extra: formats
|
|
32
|
+
Requires-Dist: pyarrow>=10.0.0; extra == "formats"
|
|
33
|
+
Requires-Dist: openpyxl>=3.0.0; extra == "formats"
|
|
34
|
+
Provides-Extra: all
|
|
35
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == "all"
|
|
36
|
+
Requires-Dist: datasketch>=1.5.0; extra == "all"
|
|
37
|
+
Requires-Dist: scikit-learn>=1.0.0; extra == "all"
|
|
38
|
+
Requires-Dist: python-Levenshtein>=0.21.0; extra == "all"
|
|
39
|
+
Requires-Dist: pyarrow>=10.0.0; extra == "all"
|
|
40
|
+
Requires-Dist: openpyxl>=3.0.0; extra == "all"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
44
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
45
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
46
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
47
|
+
Dynamic: license-file
|
|
48
|
+
|
|
49
|
+
# DataWash
|
|
50
|
+
|
|
51
|
+
<p align="center">
|
|
52
|
+
<strong>Intelligent data cleaning and quality analysis for Python</strong>
|
|
53
|
+
</p>
|
|
54
|
+
|
|
55
|
+
<p align="center">
|
|
56
|
+
<a href="#installation">Installation</a> •
|
|
57
|
+
<a href="#quick-start">Quick Start</a> •
|
|
58
|
+
<a href="#features">Features</a> •
|
|
59
|
+
<a href="#documentation">Documentation</a> •
|
|
60
|
+
<a href="#examples">Examples</a>
|
|
61
|
+
</p>
|
|
62
|
+
|
|
63
|
+
<p align="center">
|
|
64
|
+
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue" alt="Python">
|
|
65
|
+
<img src="https://img.shields.io/badge/coverage-92%25-brightgreen" alt="Coverage">
|
|
66
|
+
<img src="https://img.shields.io/badge/tests-114%20passing-brightgreen" alt="Tests">
|
|
67
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="License">
|
|
68
|
+
</p>
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
DataWash analyzes your tabular data, detects quality issues, suggests prioritized fixes, and generates reproducible Python code — all in a few lines of code.
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from datawash import analyze
|
|
76
|
+
|
|
77
|
+
report = analyze("messy_data.csv")
|
|
78
|
+
print(f"Quality Score: {report.quality_score}/100")
|
|
79
|
+
clean_df = report.apply_all()
|
|
80
|
+
print(report.generate_code())
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Why DataWash?
|
|
84
|
+
|
|
85
|
+
| Problem | DataWash Solution |
|
|
86
|
+
|---------|-------------------|
|
|
87
|
+
| Missing values silently break ML models | Automatic detection + smart filling strategies |
|
|
88
|
+
| Inconsistent date formats cause parsing errors | Detects and standardizes to ISO format |
|
|
89
|
+
| Duplicate rows inflate statistics | Identifies and removes exact duplicates |
|
|
90
|
+
| Boolean values stored as "yes"/"no" strings | Converts to proper boolean type |
|
|
91
|
+
| Manual data cleaning is tedious and error-prone | Generates reproducible Python code |
|
|
92
|
+
|
|
93
|
+
## Installation
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
pip install datawash
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**Optional extras:**
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install datawash[formats] # Parquet + Excel support
|
|
103
|
+
pip install datawash[ml] # ML-powered detection (coming soon)
|
|
104
|
+
pip install datawash[all] # All optional dependencies
|
|
105
|
+
pip install datawash[dev] # Development tools
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Quick Start
|
|
109
|
+
|
|
110
|
+
### Python API
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from datawash import analyze
|
|
114
|
+
|
|
115
|
+
# 1. Analyze your data (sampling enabled by default for large datasets)
|
|
116
|
+
report = analyze("data.csv") # or pass a DataFrame
|
|
117
|
+
|
|
118
|
+
# 2. Check quality score
|
|
119
|
+
print(f"Quality Score: {report.quality_score}/100")
|
|
120
|
+
print(f"Issues Found: {len(report.issues)}")
|
|
121
|
+
|
|
122
|
+
# 3. Review suggestions
|
|
123
|
+
for s in report.suggestions:
|
|
124
|
+
print(f"[{s.id}] {s.action}")
|
|
125
|
+
|
|
126
|
+
# 4. Apply all fixes
|
|
127
|
+
clean_df = report.apply_all()
|
|
128
|
+
|
|
129
|
+
# 5. Or apply selectively
|
|
130
|
+
clean_df = report.apply([1, 3, 5]) # by suggestion ID
|
|
131
|
+
|
|
132
|
+
# 6. Generate reproducible code
|
|
133
|
+
print(report.generate_code())
|
|
134
|
+
|
|
135
|
+
# Disable sampling for exact results on large datasets
|
|
136
|
+
report = analyze("data.csv", sample=False)
|
|
137
|
+
|
|
138
|
+
# Disable parallel processing
|
|
139
|
+
report = analyze("data.csv", parallel=False)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Command Line
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# Analyze and see quality report
|
|
146
|
+
datawash analyze data.csv
|
|
147
|
+
|
|
148
|
+
# Get prioritized suggestions
|
|
149
|
+
datawash suggest data.csv --use-case ml
|
|
150
|
+
|
|
151
|
+
# Clean and export
|
|
152
|
+
datawash clean data.csv -o clean.csv --apply-all
|
|
153
|
+
|
|
154
|
+
# Generate Python code
|
|
155
|
+
datawash codegen data.csv --apply-all
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Features
|
|
159
|
+
|
|
160
|
+
### Data Quality Detection
|
|
161
|
+
|
|
162
|
+
| Detector | What It Finds |
|
|
163
|
+
|----------|---------------|
|
|
164
|
+
| **Missing** | Null values, empty strings, whitespace-only values |
|
|
165
|
+
| **Duplicates** | Exact duplicate rows |
|
|
166
|
+
| **Formats** | Mixed case, inconsistent dates, whitespace padding |
|
|
167
|
+
| **Outliers** | Statistical anomalies (IQR or Z-score) |
|
|
168
|
+
| **Types** | Numbers/booleans stored as strings |
|
|
169
|
+
| **Similarity** | Potentially duplicate columns |
|
|
170
|
+
|
|
171
|
+
### Smart Transformations
|
|
172
|
+
|
|
173
|
+
| Transformer | Operations |
|
|
174
|
+
|-------------|------------|
|
|
175
|
+
| **Missing** | Drop rows, fill with median/mode/value, clean empty strings |
|
|
176
|
+
| **Duplicates** | Remove exact duplicates |
|
|
177
|
+
| **Types** | Convert to numeric, boolean, datetime |
|
|
178
|
+
| **Formats** | Standardize case, dates, strip whitespace |
|
|
179
|
+
| **Columns** | Drop, rename, merge columns |
|
|
180
|
+
| **Categories** | Normalize categorical values |
|
|
181
|
+
|
|
182
|
+
### Intelligent Suggestion System
|
|
183
|
+
|
|
184
|
+
- **Conflict Resolution**: Automatically prevents conflicting transformations
|
|
185
|
+
- **Execution Ordering**: Applies fixes in optimal order (6 phases)
|
|
186
|
+
- **Use-Case Aware**: Priorities adjust for ML, analytics, or export workflows
|
|
187
|
+
- **Contextual Rationale**: Every suggestion explains why it's recommended
|
|
188
|
+
|
|
189
|
+
### Code Generation
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
# Generate a reusable cleaning function
|
|
193
|
+
code = report.generate_code(style="function")
|
|
194
|
+
|
|
195
|
+
# Or a standalone script
|
|
196
|
+
code = report.generate_code(style="script")
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Performance
|
|
200
|
+
|
|
201
|
+
DataWash v0.2.0 is optimized for large datasets:
|
|
202
|
+
|
|
203
|
+
| Dataset | Time | Throughput |
|
|
204
|
+
|---------|------|------------|
|
|
205
|
+
| 1M rows x 10 cols | 0.72s | 1.4M rows/sec |
|
|
206
|
+
| 100K rows x 50 cols | 2.13s | 47K rows/sec |
|
|
207
|
+
| 10K rows x 100 cols | 4.35s | 2.3K rows/sec |
|
|
208
|
+
| 1M rows x 50 cols | 3.24s | 309K rows/sec |
|
|
209
|
+
| 50K rows x 250 cols | 9.99s | 5K rows/sec |
|
|
210
|
+
|
|
211
|
+
**Optimizations include:**
|
|
212
|
+
- Smart sampling for datasets >=50K rows (10-20x speedup)
|
|
213
|
+
- Parallel column profiling and detection
|
|
214
|
+
- 31% memory reduction via dtype optimization
|
|
215
|
+
- O(n) similarity detection with MinHash + LSH
|
|
216
|
+
|
|
217
|
+
## Examples
|
|
218
|
+
|
|
219
|
+
We provide ready-to-run examples in the `examples/` directory:
|
|
220
|
+
|
|
221
|
+
| Example | Description |
|
|
222
|
+
|---------|-------------|
|
|
223
|
+
| [`quickstart.py`](examples/quickstart.py) | Basic workflow: analyze → suggest → apply → codegen |
|
|
224
|
+
| [`csv_cleaning.py`](examples/csv_cleaning.py) | Load CSV, clean, save with CLI equivalents |
|
|
225
|
+
| [`ml_preprocessing.py`](examples/ml_preprocessing.py) | ML-optimized cleaning workflow |
|
|
226
|
+
| [`jupyter_demo.ipynb`](examples/jupyter_demo.ipynb) | Interactive notebook with visualizations |
|
|
227
|
+
|
|
228
|
+
**Sample datasets** in `examples/sample_data/`:
|
|
229
|
+
- `customers_messy.csv` - Names, emails, phones with various issues
|
|
230
|
+
- `orders_messy.csv` - Dates, amounts, categories with inconsistencies
|
|
231
|
+
- `employees_messy.csv` - Mixed types, duplicates, outliers
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
# Run an example
|
|
235
|
+
python examples/quickstart.py
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Documentation
|
|
239
|
+
|
|
240
|
+
| Document | Description |
|
|
241
|
+
|----------|-------------|
|
|
242
|
+
| [Getting Started](docs/getting-started.md) | Installation and first steps |
|
|
243
|
+
| [User Guide](docs/user-guide.md) | Complete feature walkthrough |
|
|
244
|
+
| [API Reference](docs/api-reference.md) | Detailed API documentation |
|
|
245
|
+
| [CLI Reference](docs/cli-reference.md) | Command-line interface guide |
|
|
246
|
+
| [Configuration](docs/configuration.md) | Customization options |
|
|
247
|
+
| [Contributing](docs/contributing.md) | How to contribute |
|
|
248
|
+
|
|
249
|
+
## Use Cases
|
|
250
|
+
|
|
251
|
+
Choose a use case to get optimized suggestions:
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
report = analyze(df, use_case="ml") # or "general", "analytics", "export"
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
| Use Case | Prioritizes |
|
|
258
|
+
|----------|-------------|
|
|
259
|
+
| `general` | Balanced approach for exploration |
|
|
260
|
+
| `ml` | Duplicates, missing values, type conversions |
|
|
261
|
+
| `analytics` | Consistency, date formats, outliers |
|
|
262
|
+
| `export` | Format standardization, clean values |
|
|
263
|
+
|
|
264
|
+
## Configuration
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
report = analyze(
|
|
268
|
+
"data.csv",
|
|
269
|
+
use_case="ml",
|
|
270
|
+
config={
|
|
271
|
+
"detectors": {
|
|
272
|
+
"outlier_method": "zscore", # or "iqr"
|
|
273
|
+
"outlier_threshold": 2.5,
|
|
274
|
+
"min_similarity": 0.8,
|
|
275
|
+
},
|
|
276
|
+
"suggestions": {
|
|
277
|
+
"max_suggestions": 20,
|
|
278
|
+
},
|
|
279
|
+
},
|
|
280
|
+
)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Project Status
|
|
284
|
+
|
|
285
|
+
| Metric | Value |
|
|
286
|
+
|--------|-------|
|
|
287
|
+
| Source Code | ~2,900 lines |
|
|
288
|
+
| Test Code | ~1,270 lines |
|
|
289
|
+
| Tests | 114 passing |
|
|
290
|
+
| Coverage | ~92% |
|
|
291
|
+
| Python | 3.10, 3.11, 3.12 |
|
|
292
|
+
| Platforms | Linux, macOS, Windows |
|
|
293
|
+
|
|
294
|
+
### What's Working
|
|
295
|
+
|
|
296
|
+
- ✅ Multi-format loading (CSV, JSON, Parquet, Excel)
|
|
297
|
+
- ✅ Comprehensive profiling and statistics
|
|
298
|
+
- ✅ 6 detectors for common data quality issues
|
|
299
|
+
- ✅ 6 transformers with multiple operations each
|
|
300
|
+
- ✅ Smart suggestion system with conflict resolution
|
|
301
|
+
- ✅ Reproducible Python code generation
|
|
302
|
+
- ✅ Rich CLI with colored output
|
|
303
|
+
- ✅ Jupyter notebook support
|
|
304
|
+
|
|
305
|
+
### What's Next
|
|
306
|
+
|
|
307
|
+
- ML-powered semantic similarity detection
|
|
308
|
+
- Fuzzy duplicate detection for near-duplicate rows
|
|
309
|
+
- Advanced imputation (KNN, MICE)
|
|
310
|
+
- Cloud storage connectors (S3, BigQuery)
|
|
311
|
+
- PII detection for sensitive data
|
|
312
|
+
- Schema validation for expected column types and constraints
|
|
313
|
+
|
|
314
|
+
## Requirements
|
|
315
|
+
|
|
316
|
+
- **Python** >= 3.10
|
|
317
|
+
- **Core**: pandas, numpy, pydantic, rich, typer (scikit-learn is optional, via the `ml` extra)
|
|
318
|
+
- **Optional**: pyarrow (Parquet), openpyxl (Excel)
|
|
319
|
+
|
|
320
|
+
## Development
|
|
321
|
+
|
|
322
|
+
```bash
|
|
323
|
+
# Clone and install
|
|
324
|
+
git clone https://github.com/Pranav1011/DataWash.git
|
|
325
|
+
cd DataWash
|
|
326
|
+
pip install -e ".[dev,all]"
|
|
327
|
+
|
|
328
|
+
# Run tests
|
|
329
|
+
pytest
|
|
330
|
+
|
|
331
|
+
# Format code
|
|
332
|
+
black src tests
|
|
333
|
+
ruff check src tests
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
## Contributing
|
|
337
|
+
|
|
338
|
+
Contributions welcome! See [CONTRIBUTING.md](docs/contributing.md) for guidelines.
|
|
339
|
+
|
|
340
|
+
**Areas where help is needed:**
|
|
341
|
+
- ML module implementation (sentence-transformers)
|
|
342
|
+
- Additional detectors (PII, schema validation)
|
|
343
|
+
- Performance optimization
|
|
344
|
+
- Documentation and examples
|
|
345
|
+
- Cloud connectors
|
|
346
|
+
|
|
347
|
+
## License
|
|
348
|
+
|
|
349
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
350
|
+
|
|
351
|
+
## Acknowledgments
|
|
352
|
+
|
|
353
|
+
Built with [pandas](https://pandas.pydata.org/), [pydantic](https://pydantic-docs.helpmanual.io/), [rich](https://rich.readthedocs.io/), [typer](https://typer.tiangolo.com/), and [scikit-learn](https://scikit-learn.org/).
|
datawash-0.2.0/README.md
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# DataWash
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<strong>Intelligent data cleaning and quality analysis for Python</strong>
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="#installation">Installation</a> •
|
|
9
|
+
<a href="#quick-start">Quick Start</a> •
|
|
10
|
+
<a href="#features">Features</a> •
|
|
11
|
+
<a href="#documentation">Documentation</a> •
|
|
12
|
+
<a href="#examples">Examples</a>
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue" alt="Python">
|
|
17
|
+
<img src="https://img.shields.io/badge/coverage-92%25-brightgreen" alt="Coverage">
|
|
18
|
+
<img src="https://img.shields.io/badge/tests-114%20passing-brightgreen" alt="Tests">
|
|
19
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="License">
|
|
20
|
+
</p>
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
DataWash analyzes your tabular data, detects quality issues, suggests prioritized fixes, and generates reproducible Python code — all in a few lines of code.
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from datawash import analyze
|
|
28
|
+
|
|
29
|
+
report = analyze("messy_data.csv")
|
|
30
|
+
print(f"Quality Score: {report.quality_score}/100")
|
|
31
|
+
clean_df = report.apply_all()
|
|
32
|
+
print(report.generate_code())
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Why DataWash?
|
|
36
|
+
|
|
37
|
+
| Problem | DataWash Solution |
|
|
38
|
+
|---------|-------------------|
|
|
39
|
+
| Missing values silently break ML models | Automatic detection + smart filling strategies |
|
|
40
|
+
| Inconsistent date formats cause parsing errors | Detects and standardizes to ISO format |
|
|
41
|
+
| Duplicate rows inflate statistics | Identifies and removes exact duplicates |
|
|
42
|
+
| Boolean values stored as "yes"/"no" strings | Converts to proper boolean type |
|
|
43
|
+
| Manual data cleaning is tedious and error-prone | Generates reproducible Python code |
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install datawash
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
**Optional extras:**
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install datawash[formats] # Parquet + Excel support
|
|
55
|
+
pip install datawash[ml] # ML-powered detection (coming soon)
|
|
56
|
+
pip install datawash[all] # All optional dependencies
|
|
57
|
+
pip install datawash[dev] # Development tools
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
### Python API
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from datawash import analyze
|
|
66
|
+
|
|
67
|
+
# 1. Analyze your data (sampling enabled by default for large datasets)
|
|
68
|
+
report = analyze("data.csv") # or pass a DataFrame
|
|
69
|
+
|
|
70
|
+
# 2. Check quality score
|
|
71
|
+
print(f"Quality Score: {report.quality_score}/100")
|
|
72
|
+
print(f"Issues Found: {len(report.issues)}")
|
|
73
|
+
|
|
74
|
+
# 3. Review suggestions
|
|
75
|
+
for s in report.suggestions:
|
|
76
|
+
print(f"[{s.id}] {s.action}")
|
|
77
|
+
|
|
78
|
+
# 4. Apply all fixes
|
|
79
|
+
clean_df = report.apply_all()
|
|
80
|
+
|
|
81
|
+
# 5. Or apply selectively
|
|
82
|
+
clean_df = report.apply([1, 3, 5]) # by suggestion ID
|
|
83
|
+
|
|
84
|
+
# 6. Generate reproducible code
|
|
85
|
+
print(report.generate_code())
|
|
86
|
+
|
|
87
|
+
# Disable sampling for exact results on large datasets
|
|
88
|
+
report = analyze("data.csv", sample=False)
|
|
89
|
+
|
|
90
|
+
# Disable parallel processing
|
|
91
|
+
report = analyze("data.csv", parallel=False)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Command Line
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
# Analyze and see quality report
|
|
98
|
+
datawash analyze data.csv
|
|
99
|
+
|
|
100
|
+
# Get prioritized suggestions
|
|
101
|
+
datawash suggest data.csv --use-case ml
|
|
102
|
+
|
|
103
|
+
# Clean and export
|
|
104
|
+
datawash clean data.csv -o clean.csv --apply-all
|
|
105
|
+
|
|
106
|
+
# Generate Python code
|
|
107
|
+
datawash codegen data.csv --apply-all
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Features
|
|
111
|
+
|
|
112
|
+
### Data Quality Detection
|
|
113
|
+
|
|
114
|
+
| Detector | What It Finds |
|
|
115
|
+
|----------|---------------|
|
|
116
|
+
| **Missing** | Null values, empty strings, whitespace-only values |
|
|
117
|
+
| **Duplicates** | Exact duplicate rows |
|
|
118
|
+
| **Formats** | Mixed case, inconsistent dates, whitespace padding |
|
|
119
|
+
| **Outliers** | Statistical anomalies (IQR or Z-score) |
|
|
120
|
+
| **Types** | Numbers/booleans stored as strings |
|
|
121
|
+
| **Similarity** | Potentially duplicate columns |
|
|
122
|
+
|
|
123
|
+
### Smart Transformations
|
|
124
|
+
|
|
125
|
+
| Transformer | Operations |
|
|
126
|
+
|-------------|------------|
|
|
127
|
+
| **Missing** | Drop rows, fill with median/mode/value, clean empty strings |
|
|
128
|
+
| **Duplicates** | Remove exact duplicates |
|
|
129
|
+
| **Types** | Convert to numeric, boolean, datetime |
|
|
130
|
+
| **Formats** | Standardize case, dates, strip whitespace |
|
|
131
|
+
| **Columns** | Drop, rename, merge columns |
|
|
132
|
+
| **Categories** | Normalize categorical values |
|
|
133
|
+
|
|
134
|
+
### Intelligent Suggestion System
|
|
135
|
+
|
|
136
|
+
- **Conflict Resolution**: Automatically prevents conflicting transformations
|
|
137
|
+
- **Execution Ordering**: Applies fixes in optimal order (6 phases)
|
|
138
|
+
- **Use-Case Aware**: Priorities adjust for ML, analytics, or export workflows
|
|
139
|
+
- **Contextual Rationale**: Every suggestion explains why it's recommended
|
|
140
|
+
|
|
141
|
+
### Code Generation
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
# Generate a reusable cleaning function
|
|
145
|
+
code = report.generate_code(style="function")
|
|
146
|
+
|
|
147
|
+
# Or a standalone script
|
|
148
|
+
code = report.generate_code(style="script")
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
## Performance
|
|
152
|
+
|
|
153
|
+
DataWash v0.2.0 is optimized for large datasets:
|
|
154
|
+
|
|
155
|
+
| Dataset | Time | Throughput |
|
|
156
|
+
|---------|------|------------|
|
|
157
|
+
| 1M rows x 10 cols | 0.72s | 1.4M rows/sec |
|
|
158
|
+
| 100K rows x 50 cols | 2.13s | 47K rows/sec |
|
|
159
|
+
| 10K rows x 100 cols | 4.35s | 2.3K rows/sec |
|
|
160
|
+
| 1M rows x 50 cols | 3.24s | 309K rows/sec |
|
|
161
|
+
| 50K rows x 250 cols | 9.99s | 5K rows/sec |
|
|
162
|
+
|
|
163
|
+
**Optimizations include:**
|
|
164
|
+
- Smart sampling for datasets >=50K rows (10-20x speedup)
|
|
165
|
+
- Parallel column profiling and detection
|
|
166
|
+
- 31% memory reduction via dtype optimization
|
|
167
|
+
- O(n) similarity detection with MinHash + LSH
|
|
168
|
+
|
|
169
|
+
## Examples
|
|
170
|
+
|
|
171
|
+
We provide ready-to-run examples in the `examples/` directory:
|
|
172
|
+
|
|
173
|
+
| Example | Description |
|
|
174
|
+
|---------|-------------|
|
|
175
|
+
| [`quickstart.py`](examples/quickstart.py) | Basic workflow: analyze → suggest → apply → codegen |
|
|
176
|
+
| [`csv_cleaning.py`](examples/csv_cleaning.py) | Load CSV, clean, save with CLI equivalents |
|
|
177
|
+
| [`ml_preprocessing.py`](examples/ml_preprocessing.py) | ML-optimized cleaning workflow |
|
|
178
|
+
| [`jupyter_demo.ipynb`](examples/jupyter_demo.ipynb) | Interactive notebook with visualizations |
|
|
179
|
+
|
|
180
|
+
**Sample datasets** in `examples/sample_data/`:
|
|
181
|
+
- `customers_messy.csv` - Names, emails, phones with various issues
|
|
182
|
+
- `orders_messy.csv` - Dates, amounts, categories with inconsistencies
|
|
183
|
+
- `employees_messy.csv` - Mixed types, duplicates, outliers
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
# Run an example
|
|
187
|
+
python examples/quickstart.py
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Documentation
|
|
191
|
+
|
|
192
|
+
| Document | Description |
|
|
193
|
+
|----------|-------------|
|
|
194
|
+
| [Getting Started](docs/getting-started.md) | Installation and first steps |
|
|
195
|
+
| [User Guide](docs/user-guide.md) | Complete feature walkthrough |
|
|
196
|
+
| [API Reference](docs/api-reference.md) | Detailed API documentation |
|
|
197
|
+
| [CLI Reference](docs/cli-reference.md) | Command-line interface guide |
|
|
198
|
+
| [Configuration](docs/configuration.md) | Customization options |
|
|
199
|
+
| [Contributing](docs/contributing.md) | How to contribute |
|
|
200
|
+
|
|
201
|
+
## Use Cases
|
|
202
|
+
|
|
203
|
+
Choose a use case to get optimized suggestions:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
report = analyze(df, use_case="ml") # or "general", "analytics", "export"
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
| Use Case | Prioritizes |
|
|
210
|
+
|----------|-------------|
|
|
211
|
+
| `general` | Balanced approach for exploration |
|
|
212
|
+
| `ml` | Duplicates, missing values, type conversions |
|
|
213
|
+
| `analytics` | Consistency, date formats, outliers |
|
|
214
|
+
| `export` | Format standardization, clean values |
|
|
215
|
+
|
|
216
|
+
## Configuration
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
report = analyze(
|
|
220
|
+
"data.csv",
|
|
221
|
+
use_case="ml",
|
|
222
|
+
config={
|
|
223
|
+
"detectors": {
|
|
224
|
+
"outlier_method": "zscore", # or "iqr"
|
|
225
|
+
"outlier_threshold": 2.5,
|
|
226
|
+
"min_similarity": 0.8,
|
|
227
|
+
},
|
|
228
|
+
"suggestions": {
|
|
229
|
+
"max_suggestions": 20,
|
|
230
|
+
},
|
|
231
|
+
},
|
|
232
|
+
)
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Project Status
|
|
236
|
+
|
|
237
|
+
| Metric | Value |
|
|
238
|
+
|--------|-------|
|
|
239
|
+
| Source Code | ~2,900 lines |
|
|
240
|
+
| Test Code | ~1,270 lines |
|
|
241
|
+
| Tests | 114 passing |
|
|
242
|
+
| Coverage | ~92% |
|
|
243
|
+
| Python | 3.10, 3.11, 3.12 |
|
|
244
|
+
| Platforms | Linux, macOS, Windows |
|
|
245
|
+
|
|
246
|
+
### What's Working
|
|
247
|
+
|
|
248
|
+
- ✅ Multi-format loading (CSV, JSON, Parquet, Excel)
|
|
249
|
+
- ✅ Comprehensive profiling and statistics
|
|
250
|
+
- ✅ 6 detectors for common data quality issues
|
|
251
|
+
- ✅ 6 transformers with multiple operations each
|
|
252
|
+
- ✅ Smart suggestion system with conflict resolution
|
|
253
|
+
- ✅ Reproducible Python code generation
|
|
254
|
+
- ✅ Rich CLI with colored output
|
|
255
|
+
- ✅ Jupyter notebook support
|
|
256
|
+
|
|
257
|
+
### What's Next
|
|
258
|
+
|
|
259
|
+
- ML-powered semantic similarity detection
|
|
260
|
+
- Fuzzy duplicate detection for near-duplicate rows
|
|
261
|
+
- Advanced imputation (KNN, MICE)
|
|
262
|
+
- Cloud storage connectors (S3, BigQuery)
|
|
263
|
+
- PII detection for sensitive data
|
|
264
|
+
- Schema validation for expected column types and constraints
|
|
265
|
+
|
|
266
|
+
## Requirements
|
|
267
|
+
|
|
268
|
+
- **Python** >= 3.10
|
|
269
|
+
- **Core**: pandas, numpy, pydantic, rich, typer (scikit-learn is optional, via the `ml` extra)
|
|
270
|
+
- **Optional**: pyarrow (Parquet), openpyxl (Excel)
|
|
271
|
+
|
|
272
|
+
## Development
|
|
273
|
+
|
|
274
|
+
```bash
|
|
275
|
+
# Clone and install
|
|
276
|
+
git clone https://github.com/Pranav1011/DataWash.git
|
|
277
|
+
cd DataWash
|
|
278
|
+
pip install -e ".[dev,all]"
|
|
279
|
+
|
|
280
|
+
# Run tests
|
|
281
|
+
pytest
|
|
282
|
+
|
|
283
|
+
# Format code
|
|
284
|
+
black src tests
|
|
285
|
+
ruff check src tests
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Contributing
|
|
289
|
+
|
|
290
|
+
Contributions welcome! See [CONTRIBUTING.md](docs/contributing.md) for guidelines.
|
|
291
|
+
|
|
292
|
+
**Areas where help is needed:**
|
|
293
|
+
- ML module implementation (sentence-transformers)
|
|
294
|
+
- Additional detectors (PII, schema validation)
|
|
295
|
+
- Performance optimization
|
|
296
|
+
- Documentation and examples
|
|
297
|
+
- Cloud connectors
|
|
298
|
+
|
|
299
|
+
## License
|
|
300
|
+
|
|
301
|
+
MIT License - see [LICENSE](LICENSE) for details.
|
|
302
|
+
|
|
303
|
+
## Acknowledgments
|
|
304
|
+
|
|
305
|
+
Built with [pandas](https://pandas.pydata.org/), [pydantic](https://pydantic-docs.helpmanual.io/), [rich](https://rich.readthedocs.io/), [typer](https://typer.tiangolo.com/), and [scikit-learn](https://scikit-learn.org/).
|