datawatcher-ml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datawatcher-ml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Production-grade dataset auditing and ML readiness scoring library
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/ranjeet3102/datawatcher
|
|
7
|
+
Project-URL: Repository, https://github.com/ranjeet3102/datawatcher
|
|
8
|
+
Project-URL: Issues, https://github.com/ranjeet3102/datawatcher/issues
|
|
9
|
+
Keywords: data quality,machine learning,data auditing,dataset validation,ML readiness,data profiling
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: pandas>=1.5.0
|
|
23
|
+
Requires-Dist: typer>=0.9.0
|
|
24
|
+
Requires-Dist: rich>=13.0.0
|
|
25
|
+
Provides-Extra: pdf
|
|
26
|
+
Requires-Dist: reportlab>=4.0.0; extra == "pdf"
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
30
|
+
|
|
31
|
+
# DataWatcher
|
|
32
|
+
|
|
33
|
+
**Production-grade dataset auditing and ML readiness scoring library.**
|
|
34
|
+
|
|
35
|
+
[Python 3.9+](https://www.python.org/)
|
|
36
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
37
|
+
|
|
38
|
+
DataWatcher runs a comprehensive battery of **22+ audits** across your dataset — checking structure, data quality, statistical properties, categorical features, and ML-specific risks — then produces an overall **ML Readiness Score (0–100)** and a prioritized **Risk Summary**.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install datawatcher-ml
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
For PDF report export support:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install "datawatcher-ml[pdf]"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
### Python API
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
import datawatcher
|
|
62
|
+
|
|
63
|
+
# Audit a CSV file
|
|
64
|
+
results = datawatcher.audit_csv("train.csv", target="survived")
|
|
65
|
+
|
|
66
|
+
print(results["ml_readiness"])
|
|
67
|
+
# {'score': 84, 'grade': 'GOOD', 'total_penalty': 16.0, ...}
|
|
68
|
+
|
|
69
|
+
print(results["risk_summary"])
|
|
70
|
+
# {'risk_level': 'LOW', 'top_risks': ['missing_value_audit'], ...}
|
|
71
|
+
|
|
72
|
+
# Access individual audit results
|
|
73
|
+
for audit in results["audit_results"]:
|
|
74
|
+
    print(audit.audit_name, audit.severity, audit.passed)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Audit an in-memory DataFrame
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import pandas as pd
|
|
81
|
+
import datawatcher
|
|
82
|
+
|
|
83
|
+
df = pd.read_csv("transactions.csv")
|
|
84
|
+
|
|
85
|
+
results = datawatcher.audit_dataframe(
|
|
86
|
+
df,
|
|
87
|
+
target="churn",
|
|
88
|
+
domain="finance" # activates finance-specific audits
|
|
89
|
+
)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Domain-specific auditing
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# Healthcare domain adds: age range, BMI, blood pressure,
|
|
96
|
+
# heart rate, lab results, missing diagnosis, medication consistency
|
|
97
|
+
results = datawatcher.audit_csv(
|
|
98
|
+
"patients.csv",
|
|
99
|
+
target="readmitted",
|
|
100
|
+
domain="healthcare"
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
# Finance domain adds: negative values, currency consistency,
|
|
104
|
+
# interest rate validity, balance consistency
|
|
105
|
+
results = datawatcher.audit_csv(
|
|
106
|
+
"loans.csv",
|
|
107
|
+
target="default",
|
|
108
|
+
domain="finance"
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
# Time series domain adds: duplicate timestamp detection
|
|
112
|
+
results = datawatcher.audit_csv(
|
|
113
|
+
"sensor_data.csv",
|
|
114
|
+
domain="timeseries"
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## CLI Usage
|
|
121
|
+
|
|
122
|
+
After installation, the `datawatcher` command is available on your PATH in the environment where the package was installed:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
# Basic audit
|
|
126
|
+
datawatcher audit run data.csv
|
|
127
|
+
|
|
128
|
+
# With target column
|
|
129
|
+
datawatcher audit run data.csv --target label
|
|
130
|
+
|
|
131
|
+
# With domain plugin
|
|
132
|
+
datawatcher audit run data.csv --target label --domain healthcare
|
|
133
|
+
|
|
134
|
+
# Export reports
|
|
135
|
+
datawatcher audit run data.csv --target label --export-html --export-pdf --export-json
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Audit Catalog
|
|
141
|
+
|
|
142
|
+
### Structural (4 audits)
|
|
143
|
+
| Audit | Checks |
|
|
144
|
+
|---|---|
|
|
145
|
+
| `shape_audit` | Row and column counts |
|
|
146
|
+
| `dtype_audit` | Data type summary per column |
|
|
147
|
+
| `memory_usage_audit` | Dataset memory footprint |
|
|
148
|
+
| `schema_consistency_audit` | Mixed types within columns |
|
|
149
|
+
|
|
150
|
+
### Quality (5 audits)
|
|
151
|
+
| Audit | Threshold | Source |
|
|
152
|
+
|---|---|---|
|
|
153
|
+
| `missing_value_audit` | LOW >3%, MEDIUM >15% | Google TFDV |
|
|
154
|
+
| `duplicate_audit` | LOW >0.5%, MEDIUM >5% | AWS Deequ |
|
|
155
|
+
| `constant_feature_audit` | Any constant column | — |
|
|
156
|
+
| `near_constant_audit` | >95% single value | scikit-learn |
|
|
157
|
+
| `invalid_value_audit` | Inf/NaN/unrealistic values | — |
|
|
158
|
+
|
|
159
|
+
### Statistical (5 audits)
|
|
160
|
+
| Audit | Threshold | Source |
|
|
161
|
+
|---|---|---|
|
|
162
|
+
| `descriptive_stats_audit` | Observational (no penalty) | — |
|
|
163
|
+
| `variance_audit` | Variance < 0.001 | scikit-learn VarianceThreshold |
|
|
164
|
+
| `skewness_audit` | \|skew\| ≥ 1.0 | Hair et al. (2010) |
|
|
165
|
+
| `kurtosis_audit` | Excess kurtosis > 7 | DeCarlo (1997) |
|
|
166
|
+
| `outlier_audit` | LOW >0.5% rows, MEDIUM >2% rows | IBM Research / TFDV |
|
|
167
|
+
|
|
168
|
+
### Categorical (3 audits)
|
|
169
|
+
| Audit | Threshold |
|
|
170
|
+
|---|---|
|
|
171
|
+
| `category_frequency_audit` | Observational |
|
|
172
|
+
| `rare_category_audit` | Category < 0.5% frequency |
|
|
173
|
+
| `category_imbalance_audit` | Dominant category > 70% |
|
|
174
|
+
|
|
175
|
+
### ML (5 audits)
|
|
176
|
+
| Audit | Threshold | Source |
|
|
177
|
+
|---|---|---|
|
|
178
|
+
| `cardinality_audit` | > 30% unique values | Industry ML best practice |
|
|
179
|
+
| `identifier_risk_audit` | > 90% unique values | GDPR / ML risk |
|
|
180
|
+
| `target_validation_audit` | Target column validity | — |
|
|
181
|
+
| `class_imbalance_audit` | Majority class > 75% | Japkowicz & Stephen (2002) |
|
|
182
|
+
| `leakage_audit` | \|Pearson r\| > 0.90 with target | Industry standard |
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## ML Readiness Score
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
Score = 100 − Σ(severity_weight × audit_weight)
|
|
190
|
+
|
|
191
|
+
Severity weights: INFO=0, LOW=3, MEDIUM=7, HIGH=15, CRITICAL=25
|
|
192
|
+
Audit weights (examples): leakage=3.0, target_validation=3.0, invalid_values=2.0
|
|
193
|
+
|
|
194
|
+
Grades:
|
|
195
|
+
≥ 90 → EXCELLENT
|
|
196
|
+
≥ 75 → GOOD
|
|
197
|
+
≥ 60 → FAIR
|
|
198
|
+
< 60 → POOR
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Extending with Custom Audits
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from datawatcher import BaseAudit, AuditResult, AuditRegistry, AuditEngine
|
|
207
|
+
from datawatcher import audit_dataframe
|
|
208
|
+
|
|
209
|
+
class MyCustomAudit(BaseAudit):
|
|
210
|
+
    audit_name = "my_custom_audit"
|
|
211
|
+
    category = "custom"
|
|
212
|
+
|
|
213
|
+
    def run(self, dataset, context=None):
|
|
214
|
+
        df = dataset.df
|
|
215
|
+
        # ... your logic ...
|
|
216
|
+
        return AuditResult(
|
|
217
|
+
            audit_name=self.audit_name,
|
|
218
|
+
            category=self.category,
|
|
219
|
+
            passed=True,
|
|
220
|
+
            severity="INFO",
|
|
221
|
+
            findings={"message": "All good"},
|
|
222
|
+
            recommendations=[]
|
|
223
|
+
        )
|
|
224
|
+
|
|
225
|
+
# Use programmatically
|
|
226
|
+
registry = AuditRegistry()
|
|
227
|
+
registry.register(MyCustomAudit())
|
|
228
|
+
|
|
229
|
+
from datawatcher.core.audit_engine import AuditEngine
|
|
230
|
+
from datawatcher.loaders.factory import load_dataset
|
|
231
|
+
|
|
232
|
+
dataset = load_dataset("data.csv")
|
|
233
|
+
engine = AuditEngine(registry)
|
|
234
|
+
results = engine.run(dataset, context={"target": "label"})
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Return Value Structure
|
|
240
|
+
|
|
241
|
+
`audit_csv()` and `audit_dataframe()` return:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
{
|
|
245
|
+
"audit_results": [AuditResult, ...], # list of all audit results
|
|
246
|
+
"ml_readiness": {
|
|
247
|
+
"score": 84, # 0-100
|
|
248
|
+
"grade": "GOOD", # EXCELLENT/GOOD/FAIR/POOR
|
|
249
|
+
"total_penalty": 16.0,
|
|
250
|
+
"severity_breakdown": {...}
|
|
251
|
+
},
|
|
252
|
+
"risk_summary": {
|
|
253
|
+
"risk_level": "LOW", # LOW/MEDIUM/HIGH
|
|
254
|
+
"top_risks": ["audit_name", ...],
|
|
255
|
+
"high_risk_audits": [...],
|
|
256
|
+
"medium_risk_audits": [...]
|
|
257
|
+
},
|
|
258
|
+
"metadata": {
|
|
259
|
+
"rows": 10000,
|
|
260
|
+
"columns": 25,
|
|
261
|
+
"memory_usage_mb": 4.2
|
|
262
|
+
},
|
|
263
|
+
"semantic_types": {
|
|
264
|
+
"column_name": "numeric", # numeric/categorical/datetime/...
|
|
265
|
+
...
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## License
|
|
273
|
+
|
|
274
|
+
MIT © Ranjeet
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
datawatcher_ml-0.1.0.dist-info/METADATA,sha256=-C7CJQPEoiYC66dIbiNVy8KcLkwV2Rutd-gNpvrHLPU,7845
|
|
2
|
+
datawatcher_ml-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
3
|
+
datawatcher_ml-0.1.0.dist-info/entry_points.txt,sha256=N7b2DeAhghZGMVf8jU01BN-6gkj9jcpF9sF6oUyjWa8,56
|
|
4
|
+
datawatcher_ml-0.1.0.dist-info/top_level.txt,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
5
|
+
datawatcher_ml-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|