datawash 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. datawash-0.2.0/LICENSE +21 -0
  2. datawash-0.2.0/PKG-INFO +353 -0
  3. datawash-0.2.0/README.md +305 -0
  4. datawash-0.2.0/pyproject.toml +104 -0
  5. datawash-0.2.0/setup.cfg +4 -0
  6. datawash-0.2.0/src/datawash/__init__.py +9 -0
  7. datawash-0.2.0/src/datawash/adapters/__init__.py +12 -0
  8. datawash-0.2.0/src/datawash/adapters/base.py +66 -0
  9. datawash-0.2.0/src/datawash/adapters/csv_adapter.py +23 -0
  10. datawash-0.2.0/src/datawash/adapters/excel_adapter.py +36 -0
  11. datawash-0.2.0/src/datawash/adapters/json_adapter.py +21 -0
  12. datawash-0.2.0/src/datawash/adapters/parquet_adapter.py +34 -0
  13. datawash-0.2.0/src/datawash/cli/__init__.py +0 -0
  14. datawash-0.2.0/src/datawash/cli/formatters.py +110 -0
  15. datawash-0.2.0/src/datawash/cli/main.py +168 -0
  16. datawash-0.2.0/src/datawash/codegen/__init__.py +1 -0
  17. datawash-0.2.0/src/datawash/codegen/generator.py +72 -0
  18. datawash-0.2.0/src/datawash/core/__init__.py +1 -0
  19. datawash-0.2.0/src/datawash/core/cache.py +64 -0
  20. datawash-0.2.0/src/datawash/core/config.py +56 -0
  21. datawash-0.2.0/src/datawash/core/dtypes.py +24 -0
  22. datawash-0.2.0/src/datawash/core/exceptions.py +21 -0
  23. datawash-0.2.0/src/datawash/core/models.py +78 -0
  24. datawash-0.2.0/src/datawash/core/report.py +430 -0
  25. datawash-0.2.0/src/datawash/core/sampling.py +84 -0
  26. datawash-0.2.0/src/datawash/detectors/__init__.py +13 -0
  27. datawash-0.2.0/src/datawash/detectors/base.py +27 -0
  28. datawash-0.2.0/src/datawash/detectors/duplicate_detector.py +56 -0
  29. datawash-0.2.0/src/datawash/detectors/format_detector.py +130 -0
  30. datawash-0.2.0/src/datawash/detectors/missing_detector.py +78 -0
  31. datawash-0.2.0/src/datawash/detectors/outlier_detector.py +93 -0
  32. datawash-0.2.0/src/datawash/detectors/registry.py +64 -0
  33. datawash-0.2.0/src/datawash/detectors/similarity_detector.py +294 -0
  34. datawash-0.2.0/src/datawash/detectors/type_detector.py +100 -0
  35. datawash-0.2.0/src/datawash/profiler/__init__.py +1 -0
  36. datawash-0.2.0/src/datawash/profiler/engine.py +88 -0
  37. datawash-0.2.0/src/datawash/profiler/parallel.py +122 -0
  38. datawash-0.2.0/src/datawash/profiler/patterns.py +80 -0
  39. datawash-0.2.0/src/datawash/profiler/statistics.py +41 -0
  40. datawash-0.2.0/src/datawash/suggestors/__init__.py +1 -0
  41. datawash-0.2.0/src/datawash/suggestors/base.py +15 -0
  42. datawash-0.2.0/src/datawash/suggestors/engine.py +327 -0
  43. datawash-0.2.0/src/datawash/suggestors/prioritizer.py +23 -0
  44. datawash-0.2.0/src/datawash/transformers/__init__.py +13 -0
  45. datawash-0.2.0/src/datawash/transformers/base.py +27 -0
  46. datawash-0.2.0/src/datawash/transformers/categories.py +64 -0
  47. datawash-0.2.0/src/datawash/transformers/columns.py +72 -0
  48. datawash-0.2.0/src/datawash/transformers/duplicates.py +43 -0
  49. datawash-0.2.0/src/datawash/transformers/formats.py +95 -0
  50. datawash-0.2.0/src/datawash/transformers/missing.py +201 -0
  51. datawash-0.2.0/src/datawash/transformers/registry.py +30 -0
  52. datawash-0.2.0/src/datawash/transformers/types.py +95 -0
  53. datawash-0.2.0/src/datawash.egg-info/PKG-INFO +353 -0
  54. datawash-0.2.0/src/datawash.egg-info/SOURCES.txt +69 -0
  55. datawash-0.2.0/src/datawash.egg-info/dependency_links.txt +1 -0
  56. datawash-0.2.0/src/datawash.egg-info/entry_points.txt +2 -0
  57. datawash-0.2.0/src/datawash.egg-info/requires.txt +30 -0
  58. datawash-0.2.0/src/datawash.egg-info/top_level.txt +1 -0
  59. datawash-0.2.0/tests/test_adapters.py +35 -0
  60. datawash-0.2.0/tests/test_cache.py +90 -0
  61. datawash-0.2.0/tests/test_cli.py +89 -0
  62. datawash-0.2.0/tests/test_codegen.py +51 -0
  63. datawash-0.2.0/tests/test_detectors.py +212 -0
  64. datawash-0.2.0/tests/test_dtypes.py +66 -0
  65. datawash-0.2.0/tests/test_edge_cases.py +267 -0
  66. datawash-0.2.0/tests/test_integration.py +103 -0
  67. datawash-0.2.0/tests/test_parallel.py +96 -0
  68. datawash-0.2.0/tests/test_profiler.py +47 -0
  69. datawash-0.2.0/tests/test_sampling.py +68 -0
  70. datawash-0.2.0/tests/test_suggestors.py +210 -0
  71. datawash-0.2.0/tests/test_transformers.py +143 -0
datawash-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 DataWash Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,353 @@
1
+ Metadata-Version: 2.4
2
+ Name: datawash
3
+ Version: 0.2.0
4
+ Summary: Intelligent data cleaning and quality analysis
5
+ Author: Sai Pranav Krovvidi
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Pranav1011/DataWash
8
+ Project-URL: Repository, https://github.com/Pranav1011/DataWash
9
+ Project-URL: Issues, https://github.com/Pranav1011/DataWash/issues
10
+ Keywords: data-cleaning,data-quality,pandas,etl,data-analysis
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: pandas>=1.5.0
22
+ Requires-Dist: numpy>=1.21.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Requires-Dist: rich>=13.0.0
25
+ Requires-Dist: typer>=0.9.0
26
+ Provides-Extra: ml
27
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "ml"
28
+ Requires-Dist: datasketch>=1.5.0; extra == "ml"
29
+ Requires-Dist: scikit-learn>=1.0.0; extra == "ml"
30
+ Requires-Dist: python-Levenshtein>=0.21.0; extra == "ml"
31
+ Provides-Extra: formats
32
+ Requires-Dist: pyarrow>=10.0.0; extra == "formats"
33
+ Requires-Dist: openpyxl>=3.0.0; extra == "formats"
34
+ Provides-Extra: all
35
+ Requires-Dist: sentence-transformers>=2.2.0; extra == "all"
36
+ Requires-Dist: datasketch>=1.5.0; extra == "all"
37
+ Requires-Dist: scikit-learn>=1.0.0; extra == "all"
38
+ Requires-Dist: python-Levenshtein>=0.21.0; extra == "all"
39
+ Requires-Dist: pyarrow>=10.0.0; extra == "all"
40
+ Requires-Dist: openpyxl>=3.0.0; extra == "all"
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
43
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
44
+ Requires-Dist: black>=23.0.0; extra == "dev"
45
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
46
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
47
+ Dynamic: license-file
48
+
49
+ # DataWash
50
+
51
+ <p align="center">
52
+ <strong>Intelligent data cleaning and quality analysis for Python</strong>
53
+ </p>
54
+
55
+ <p align="center">
56
+ <a href="#installation">Installation</a> •
57
+ <a href="#quick-start">Quick Start</a> •
58
+ <a href="#features">Features</a> •
59
+ <a href="#documentation">Documentation</a> •
60
+ <a href="#examples">Examples</a>
61
+ </p>
62
+
63
+ <p align="center">
64
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue" alt="Python">
65
+ <img src="https://img.shields.io/badge/coverage-92%25-brightgreen" alt="Coverage">
66
+ <img src="https://img.shields.io/badge/tests-114%20passing-brightgreen" alt="Tests">
67
+ <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
68
+ </p>
69
+
70
+ ---
71
+
72
+ DataWash analyzes your tabular data, detects quality issues, suggests prioritized fixes, and generates reproducible Python code — all in a few lines of code.
73
+
74
+ ```python
75
+ from datawash import analyze
76
+
77
+ report = analyze("messy_data.csv")
78
+ print(f"Quality Score: {report.quality_score}/100")
79
+ clean_df = report.apply_all()
80
+ print(report.generate_code())
81
+ ```
82
+
83
+ ## Why DataWash?
84
+
85
+ | Problem | DataWash Solution |
86
+ |---------|-------------------|
87
+ | Missing values silently break ML models | Automatic detection + smart filling strategies |
88
+ | Inconsistent date formats cause parsing errors | Detects and standardizes to ISO format |
89
+ | Duplicate rows inflate statistics | Identifies and removes exact duplicates |
90
+ | Boolean values stored as "yes"/"no" strings | Converts to proper boolean type |
91
+ | Manual data cleaning is tedious and error-prone | Generates reproducible Python code |
92
+
93
+ ## Installation
94
+
95
+ ```bash
96
+ pip install datawash
97
+ ```
98
+
99
+ **Optional extras:**
100
+
101
+ ```bash
102
+ pip install datawash[formats] # Parquet + Excel support
103
+ pip install datawash[ml] # ML-powered detection (coming soon)
104
+ pip install datawash[all] # All optional dependencies
105
+ pip install datawash[dev] # Development tools
106
+ ```
107
+
108
+ ## Quick Start
109
+
110
+ ### Python API
111
+
112
+ ```python
113
+ from datawash import analyze
114
+
115
+ # 1. Analyze your data (sampling enabled by default for large datasets)
116
+ report = analyze("data.csv") # or pass a DataFrame
117
+
118
+ # 2. Check quality score
119
+ print(f"Quality Score: {report.quality_score}/100")
120
+ print(f"Issues Found: {len(report.issues)}")
121
+
122
+ # 3. Review suggestions
123
+ for s in report.suggestions:
124
+ print(f"[{s.id}] {s.action}")
125
+
126
+ # 4. Apply all fixes
127
+ clean_df = report.apply_all()
128
+
129
+ # 5. Or apply selectively
130
+ clean_df = report.apply([1, 3, 5]) # by suggestion ID
131
+
132
+ # 6. Generate reproducible code
133
+ print(report.generate_code())
134
+
135
+ # Disable sampling for exact results on large datasets
136
+ report = analyze("data.csv", sample=False)
137
+
138
+ # Disable parallel processing
139
+ report = analyze("data.csv", parallel=False)
140
+ ```
141
+
142
+ ### Command Line
143
+
144
+ ```bash
145
+ # Analyze and see quality report
146
+ datawash analyze data.csv
147
+
148
+ # Get prioritized suggestions
149
+ datawash suggest data.csv --use-case ml
150
+
151
+ # Clean and export
152
+ datawash clean data.csv -o clean.csv --apply-all
153
+
154
+ # Generate Python code
155
+ datawash codegen data.csv --apply-all
156
+ ```
157
+
158
+ ## Features
159
+
160
+ ### Data Quality Detection
161
+
162
+ | Detector | What It Finds |
163
+ |----------|---------------|
164
+ | **Missing** | Null values, empty strings, whitespace-only values |
165
+ | **Duplicates** | Exact duplicate rows |
166
+ | **Formats** | Mixed case, inconsistent dates, whitespace padding |
167
+ | **Outliers** | Statistical anomalies (IQR or Z-score) |
168
+ | **Types** | Numbers/booleans stored as strings |
169
+ | **Similarity** | Potentially duplicate columns |
170
+
171
+ ### Smart Transformations
172
+
173
+ | Transformer | Operations |
174
+ |-------------|------------|
175
+ | **Missing** | Drop rows, fill with median/mode/value, clean empty strings |
176
+ | **Duplicates** | Remove exact duplicates |
177
+ | **Types** | Convert to numeric, boolean, datetime |
178
+ | **Formats** | Standardize case, dates, strip whitespace |
179
+ | **Columns** | Drop, rename, merge columns |
180
+ | **Categories** | Normalize categorical values |
181
+
182
+ ### Intelligent Suggestion System
183
+
184
+ - **Conflict Resolution**: Automatically prevents conflicting transformations
185
+ - **Execution Ordering**: Applies fixes in optimal order (6 phases)
186
+ - **Use-Case Aware**: Priorities adjust for ML, analytics, or export workflows
187
+ - **Contextual Rationale**: Every suggestion explains why it's recommended
188
+
189
+ ### Code Generation
190
+
191
+ ```python
192
+ # Generate a reusable cleaning function
193
+ code = report.generate_code(style="function")
194
+
195
+ # Or a standalone script
196
+ code = report.generate_code(style="script")
197
+ ```
198
+
199
+ ## Performance
200
+
201
+ DataWash v0.2.0 is optimized for large datasets:
202
+
203
+ | Dataset | Time | Throughput |
204
+ |---------|------|------------|
205
+ | 1M rows x 10 cols | 0.72s | 1.4M rows/sec |
206
+ | 100K rows x 50 cols | 2.13s | 47K rows/sec |
207
+ | 10K rows x 100 cols | 4.35s | 2.3K rows/sec |
208
+ | 1M rows x 50 cols | 3.24s | 309K rows/sec |
209
+ | 50K rows x 250 cols | 9.99s | 5K rows/sec |
210
+
211
+ **Optimizations include:**
212
+ - Smart sampling for datasets >=50K rows (10-20x speedup)
213
+ - Parallel column profiling and detection
214
+ - 31% memory reduction via dtype optimization
215
+ - O(n) similarity detection with MinHash + LSH
216
+
217
+ ## Examples
218
+
219
+ We provide ready-to-run examples in the `examples/` directory:
220
+
221
+ | Example | Description |
222
+ |---------|-------------|
223
+ | [`quickstart.py`](examples/quickstart.py) | Basic workflow: analyze → suggest → apply → codegen |
224
+ | [`csv_cleaning.py`](examples/csv_cleaning.py) | Load CSV, clean, save with CLI equivalents |
225
+ | [`ml_preprocessing.py`](examples/ml_preprocessing.py) | ML-optimized cleaning workflow |
226
+ | [`jupyter_demo.ipynb`](examples/jupyter_demo.ipynb) | Interactive notebook with visualizations |
227
+
228
+ **Sample datasets** in `examples/sample_data/`:
229
+ - `customers_messy.csv` - Names, emails, phones with various issues
230
+ - `orders_messy.csv` - Dates, amounts, categories with inconsistencies
231
+ - `employees_messy.csv` - Mixed types, duplicates, outliers
232
+
233
+ ```bash
234
+ # Run an example
235
+ python examples/quickstart.py
236
+ ```
237
+
238
+ ## Documentation
239
+
240
+ | Document | Description |
241
+ |----------|-------------|
242
+ | [Getting Started](docs/getting-started.md) | Installation and first steps |
243
+ | [User Guide](docs/user-guide.md) | Complete feature walkthrough |
244
+ | [API Reference](docs/api-reference.md) | Detailed API documentation |
245
+ | [CLI Reference](docs/cli-reference.md) | Command-line interface guide |
246
+ | [Configuration](docs/configuration.md) | Customization options |
247
+ | [Contributing](docs/contributing.md) | How to contribute |
248
+
249
+ ## Use Cases
250
+
251
+ Choose a use case to get optimized suggestions:
252
+
253
+ ```python
254
+ report = analyze(df, use_case="ml") # or "general", "analytics", "export"
255
+ ```
256
+
257
+ | Use Case | Prioritizes |
258
+ |----------|-------------|
259
+ | `general` | Balanced approach for exploration |
260
+ | `ml` | Duplicates, missing values, type conversions |
261
+ | `analytics` | Consistency, date formats, outliers |
262
+ | `export` | Format standardization, clean values |
263
+
264
+ ## Configuration
265
+
266
+ ```python
267
+ report = analyze(
268
+ "data.csv",
269
+ use_case="ml",
270
+ config={
271
+ "detectors": {
272
+ "outlier_method": "zscore", # or "iqr"
273
+ "outlier_threshold": 2.5,
274
+ "min_similarity": 0.8,
275
+ },
276
+ "suggestions": {
277
+ "max_suggestions": 20,
278
+ },
279
+ },
280
+ )
281
+ ```
282
+
283
+ ## Project Status
284
+
285
+ | Metric | Value |
286
+ |--------|-------|
287
+ | Source Code | ~2,900 lines |
288
+ | Test Code | ~1,270 lines |
289
+ | Tests | 114 passing |
290
+ | Coverage | ~92% |
291
+ | Python | 3.10, 3.11, 3.12 |
292
+ | Platforms | Linux, macOS, Windows |
293
+
294
+ ### What's Working
295
+
296
+ - ✅ Multi-format loading (CSV, JSON, Parquet, Excel)
297
+ - ✅ Comprehensive profiling and statistics
298
+ - ✅ 6 detectors for common data quality issues
299
+ - ✅ 6 transformers with multiple operations each
300
+ - ✅ Smart suggestion system with conflict resolution
301
+ - ✅ Reproducible Python code generation
302
+ - ✅ Rich CLI with colored output
303
+ - ✅ Jupyter notebook support
304
+
305
+ ### What's Next
306
+
307
+ - ML-powered semantic similarity detection
308
+ - Fuzzy duplicate detection for near-duplicate rows
309
+ - Advanced imputation (KNN, MICE)
310
+ - Cloud storage connectors (S3, BigQuery)
311
+ - PII detection for sensitive data
312
+ - Schema validation for expected column types and constraints
313
+
314
+ ## Requirements
315
+
316
+ - **Python** >= 3.10
317
+ - **Core**: pandas, numpy, pydantic, rich, typer
318
+ - **Optional**: pyarrow (Parquet), openpyxl (Excel)
319
+
320
+ ## Development
321
+
322
+ ```bash
323
+ # Clone and install
324
+ git clone https://github.com/Pranav1011/DataWash.git
325
+ cd DataWash
326
+ pip install -e ".[dev,all]"
327
+
328
+ # Run tests
329
+ pytest
330
+
331
+ # Format code
332
+ black src tests
333
+ ruff check src tests
334
+ ```
335
+
336
+ ## Contributing
337
+
338
+ Contributions welcome! See [CONTRIBUTING.md](docs/contributing.md) for guidelines.
339
+
340
+ **Areas where help is needed:**
341
+ - ML module implementation (sentence-transformers)
342
+ - Additional detectors (PII, schema validation)
343
+ - Performance optimization
344
+ - Documentation and examples
345
+ - Cloud connectors
346
+
347
+ ## License
348
+
349
+ MIT License - see [LICENSE](LICENSE) for details.
350
+
351
+ ## Acknowledgments
352
+
353
+ Built with [pandas](https://pandas.pydata.org/), [pydantic](https://pydantic-docs.helpmanual.io/), [rich](https://rich.readthedocs.io/), [typer](https://typer.tiangolo.com/), and [scikit-learn](https://scikit-learn.org/).
@@ -0,0 +1,305 @@
1
+ # DataWash
2
+
3
+ <p align="center">
4
+ <strong>Intelligent data cleaning and quality analysis for Python</strong>
5
+ </p>
6
+
7
+ <p align="center">
8
+ <a href="#installation">Installation</a> •
9
+ <a href="#quick-start">Quick Start</a> •
10
+ <a href="#features">Features</a> •
11
+ <a href="#documentation">Documentation</a> •
12
+ <a href="#examples">Examples</a>
13
+ </p>
14
+
15
+ <p align="center">
16
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue" alt="Python">
17
+ <img src="https://img.shields.io/badge/coverage-92%25-brightgreen" alt="Coverage">
18
+ <img src="https://img.shields.io/badge/tests-114%20passing-brightgreen" alt="Tests">
19
+ <img src="https://img.shields.io/badge/license-MIT-green" alt="License">
20
+ </p>
21
+
22
+ ---
23
+
24
+ DataWash analyzes your tabular data, detects quality issues, suggests prioritized fixes, and generates reproducible Python code — all in a few lines of code.
25
+
26
+ ```python
27
+ from datawash import analyze
28
+
29
+ report = analyze("messy_data.csv")
30
+ print(f"Quality Score: {report.quality_score}/100")
31
+ clean_df = report.apply_all()
32
+ print(report.generate_code())
33
+ ```
34
+
35
+ ## Why DataWash?
36
+
37
+ | Problem | DataWash Solution |
38
+ |---------|-------------------|
39
+ | Missing values silently break ML models | Automatic detection + smart filling strategies |
40
+ | Inconsistent date formats cause parsing errors | Detects and standardizes to ISO format |
41
+ | Duplicate rows inflate statistics | Identifies and removes exact duplicates |
42
+ | Boolean values stored as "yes"/"no" strings | Converts to proper boolean type |
43
+ | Manual data cleaning is tedious and error-prone | Generates reproducible Python code |
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install datawash
49
+ ```
50
+
51
+ **Optional extras:**
52
+
53
+ ```bash
54
+ pip install datawash[formats] # Parquet + Excel support
55
+ pip install datawash[ml] # ML-powered detection (coming soon)
56
+ pip install datawash[all] # All optional dependencies
57
+ pip install datawash[dev] # Development tools
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ### Python API
63
+
64
+ ```python
65
+ from datawash import analyze
66
+
67
+ # 1. Analyze your data (sampling enabled by default for large datasets)
68
+ report = analyze("data.csv") # or pass a DataFrame
69
+
70
+ # 2. Check quality score
71
+ print(f"Quality Score: {report.quality_score}/100")
72
+ print(f"Issues Found: {len(report.issues)}")
73
+
74
+ # 3. Review suggestions
75
+ for s in report.suggestions:
76
+ print(f"[{s.id}] {s.action}")
77
+
78
+ # 4. Apply all fixes
79
+ clean_df = report.apply_all()
80
+
81
+ # 5. Or apply selectively
82
+ clean_df = report.apply([1, 3, 5]) # by suggestion ID
83
+
84
+ # 6. Generate reproducible code
85
+ print(report.generate_code())
86
+
87
+ # Disable sampling for exact results on large datasets
88
+ report = analyze("data.csv", sample=False)
89
+
90
+ # Disable parallel processing
91
+ report = analyze("data.csv", parallel=False)
92
+ ```
93
+
94
+ ### Command Line
95
+
96
+ ```bash
97
+ # Analyze and see quality report
98
+ datawash analyze data.csv
99
+
100
+ # Get prioritized suggestions
101
+ datawash suggest data.csv --use-case ml
102
+
103
+ # Clean and export
104
+ datawash clean data.csv -o clean.csv --apply-all
105
+
106
+ # Generate Python code
107
+ datawash codegen data.csv --apply-all
108
+ ```
109
+
110
+ ## Features
111
+
112
+ ### Data Quality Detection
113
+
114
+ | Detector | What It Finds |
115
+ |----------|---------------|
116
+ | **Missing** | Null values, empty strings, whitespace-only values |
117
+ | **Duplicates** | Exact duplicate rows |
118
+ | **Formats** | Mixed case, inconsistent dates, whitespace padding |
119
+ | **Outliers** | Statistical anomalies (IQR or Z-score) |
120
+ | **Types** | Numbers/booleans stored as strings |
121
+ | **Similarity** | Potentially duplicate columns |
122
+
123
+ ### Smart Transformations
124
+
125
+ | Transformer | Operations |
126
+ |-------------|------------|
127
+ | **Missing** | Drop rows, fill with median/mode/value, clean empty strings |
128
+ | **Duplicates** | Remove exact duplicates |
129
+ | **Types** | Convert to numeric, boolean, datetime |
130
+ | **Formats** | Standardize case, dates, strip whitespace |
131
+ | **Columns** | Drop, rename, merge columns |
132
+ | **Categories** | Normalize categorical values |
133
+
134
+ ### Intelligent Suggestion System
135
+
136
+ - **Conflict Resolution**: Automatically prevents conflicting transformations
137
+ - **Execution Ordering**: Applies fixes in optimal order (6 phases)
138
+ - **Use-Case Aware**: Priorities adjust for ML, analytics, or export workflows
139
+ - **Contextual Rationale**: Every suggestion explains why it's recommended
140
+
141
+ ### Code Generation
142
+
143
+ ```python
144
+ # Generate a reusable cleaning function
145
+ code = report.generate_code(style="function")
146
+
147
+ # Or a standalone script
148
+ code = report.generate_code(style="script")
149
+ ```
150
+
151
+ ## Performance
152
+
153
+ DataWash v0.2.0 is optimized for large datasets:
154
+
155
+ | Dataset | Time | Throughput |
156
+ |---------|------|------------|
157
+ | 1M rows x 10 cols | 0.72s | 1.4M rows/sec |
158
+ | 100K rows x 50 cols | 2.13s | 47K rows/sec |
159
+ | 10K rows x 100 cols | 4.35s | 2.3K rows/sec |
160
+ | 1M rows x 50 cols | 3.24s | 309K rows/sec |
161
+ | 50K rows x 250 cols | 9.99s | 5K rows/sec |
162
+
163
+ **Optimizations include:**
164
+ - Smart sampling for datasets >=50K rows (10-20x speedup)
165
+ - Parallel column profiling and detection
166
+ - 31% memory reduction via dtype optimization
167
+ - O(n) similarity detection with MinHash + LSH
168
+
169
+ ## Examples
170
+
171
+ We provide ready-to-run examples in the `examples/` directory:
172
+
173
+ | Example | Description |
174
+ |---------|-------------|
175
+ | [`quickstart.py`](examples/quickstart.py) | Basic workflow: analyze → suggest → apply → codegen |
176
+ | [`csv_cleaning.py`](examples/csv_cleaning.py) | Load CSV, clean, save with CLI equivalents |
177
+ | [`ml_preprocessing.py`](examples/ml_preprocessing.py) | ML-optimized cleaning workflow |
178
+ | [`jupyter_demo.ipynb`](examples/jupyter_demo.ipynb) | Interactive notebook with visualizations |
179
+
180
+ **Sample datasets** in `examples/sample_data/`:
181
+ - `customers_messy.csv` - Names, emails, phones with various issues
182
+ - `orders_messy.csv` - Dates, amounts, categories with inconsistencies
183
+ - `employees_messy.csv` - Mixed types, duplicates, outliers
184
+
185
+ ```bash
186
+ # Run an example
187
+ python examples/quickstart.py
188
+ ```
189
+
190
+ ## Documentation
191
+
192
+ | Document | Description |
193
+ |----------|-------------|
194
+ | [Getting Started](docs/getting-started.md) | Installation and first steps |
195
+ | [User Guide](docs/user-guide.md) | Complete feature walkthrough |
196
+ | [API Reference](docs/api-reference.md) | Detailed API documentation |
197
+ | [CLI Reference](docs/cli-reference.md) | Command-line interface guide |
198
+ | [Configuration](docs/configuration.md) | Customization options |
199
+ | [Contributing](docs/contributing.md) | How to contribute |
200
+
201
+ ## Use Cases
202
+
203
+ Choose a use case to get optimized suggestions:
204
+
205
+ ```python
206
+ report = analyze(df, use_case="ml") # or "general", "analytics", "export"
207
+ ```
208
+
209
+ | Use Case | Prioritizes |
210
+ |----------|-------------|
211
+ | `general` | Balanced approach for exploration |
212
+ | `ml` | Duplicates, missing values, type conversions |
213
+ | `analytics` | Consistency, date formats, outliers |
214
+ | `export` | Format standardization, clean values |
215
+
216
+ ## Configuration
217
+
218
+ ```python
219
+ report = analyze(
220
+ "data.csv",
221
+ use_case="ml",
222
+ config={
223
+ "detectors": {
224
+ "outlier_method": "zscore", # or "iqr"
225
+ "outlier_threshold": 2.5,
226
+ "min_similarity": 0.8,
227
+ },
228
+ "suggestions": {
229
+ "max_suggestions": 20,
230
+ },
231
+ },
232
+ )
233
+ ```
234
+
235
+ ## Project Status
236
+
237
+ | Metric | Value |
238
+ |--------|-------|
239
+ | Source Code | ~2,900 lines |
240
+ | Test Code | ~1,270 lines |
241
+ | Tests | 114 passing |
242
+ | Coverage | ~92% |
243
+ | Python | 3.10, 3.11, 3.12 |
244
+ | Platforms | Linux, macOS, Windows |
245
+
246
+ ### What's Working
247
+
248
+ - ✅ Multi-format loading (CSV, JSON, Parquet, Excel)
249
+ - ✅ Comprehensive profiling and statistics
250
+ - ✅ 6 detectors for common data quality issues
251
+ - ✅ 6 transformers with multiple operations each
252
+ - ✅ Smart suggestion system with conflict resolution
253
+ - ✅ Reproducible Python code generation
254
+ - ✅ Rich CLI with colored output
255
+ - ✅ Jupyter notebook support
256
+
257
+ ### What's Next
258
+
259
+ - ML-powered semantic similarity detection
260
+ - Fuzzy duplicate detection for near-duplicate rows
261
+ - Advanced imputation (KNN, MICE)
262
+ - Cloud storage connectors (S3, BigQuery)
263
+ - PII detection for sensitive data
264
+ - Schema validation for expected column types and constraints
265
+
266
+ ## Requirements
267
+
268
+ - **Python** >= 3.10
269
+ - **Core**: pandas, numpy, pydantic, rich, typer
270
+ - **Optional**: pyarrow (Parquet), openpyxl (Excel)
271
+
272
+ ## Development
273
+
274
+ ```bash
275
+ # Clone and install
276
+ git clone https://github.com/Pranav1011/DataWash.git
277
+ cd DataWash
278
+ pip install -e ".[dev,all]"
279
+
280
+ # Run tests
281
+ pytest
282
+
283
+ # Format code
284
+ black src tests
285
+ ruff check src tests
286
+ ```
287
+
288
+ ## Contributing
289
+
290
+ Contributions welcome! See [CONTRIBUTING.md](docs/contributing.md) for guidelines.
291
+
292
+ **Areas where help is needed:**
293
+ - ML module implementation (sentence-transformers)
294
+ - Additional detectors (PII, schema validation)
295
+ - Performance optimization
296
+ - Documentation and examples
297
+ - Cloud connectors
298
+
299
+ ## License
300
+
301
+ MIT License - see [LICENSE](LICENSE) for details.
302
+
303
+ ## Acknowledgments
304
+
305
+ Built with [pandas](https://pandas.pydata.org/), [pydantic](https://pydantic-docs.helpmanual.io/), [rich](https://rich.readthedocs.io/), [typer](https://typer.tiangolo.com/), and [scikit-learn](https://scikit-learn.org/).