datra 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datra-0.1.0/LICENSE +21 -0
- datra-0.1.0/PKG-INFO +338 -0
- datra-0.1.0/README.md +287 -0
- datra-0.1.0/datra/__init__.py +4 -0
- datra-0.1.0/datra/audit.py +60 -0
- datra-0.1.0/datra/checks/__init__.py +0 -0
- datra-0.1.0/datra/checks/completeness.py +19 -0
- datra-0.1.0/datra/checks/outliers.py +35 -0
- datra-0.1.0/datra/checks/profile.py +49 -0
- datra-0.1.0/datra/checks/uniqueness.py +18 -0
- datra-0.1.0/datra/checks/validate.py +151 -0
- datra-0.1.0/datra/cleaner.py +74 -0
- datra-0.1.0/datra/cleaning/__init__.py +0 -0
- datra-0.1.0/datra/cleaning/columns.py +35 -0
- datra-0.1.0/datra/cleaning/duplicates.py +6 -0
- datra-0.1.0/datra/cleaning/missing.py +56 -0
- datra-0.1.0/datra/defaults.py +12 -0
- datra-0.1.0/datra/io/__init__.py +4 -0
- datra-0.1.0/datra/io/load.py +34 -0
- datra-0.1.0/datra/io/save.py +46 -0
- datra-0.1.0/datra/reports/__init__.py +7 -0
- datra-0.1.0/datra/reports/builder.py +35 -0
- datra-0.1.0/datra/reports/exporter.py +35 -0
- datra-0.1.0/datra/reports/templates.py +26 -0
- datra-0.1.0/datra/scoring/__init__.py +0 -0
- datra-0.1.0/datra/scoring/score.py +51 -0
- datra-0.1.0/datra.egg-info/PKG-INFO +338 -0
- datra-0.1.0/datra.egg-info/SOURCES.txt +43 -0
- datra-0.1.0/datra.egg-info/dependency_links.txt +1 -0
- datra-0.1.0/datra.egg-info/requires.txt +7 -0
- datra-0.1.0/datra.egg-info/top_level.txt +1 -0
- datra-0.1.0/pyproject.toml +55 -0
- datra-0.1.0/setup.cfg +4 -0
- datra-0.1.0/tests/test_audit.py +86 -0
- datra-0.1.0/tests/test_builder.py +72 -0
- datra-0.1.0/tests/test_checks.py +0 -0
- datra-0.1.0/tests/test_clean.py +93 -0
- datra-0.1.0/tests/test_completeness.py +43 -0
- datra-0.1.0/tests/test_exporter.py +72 -0
- datra-0.1.0/tests/test_load.py +52 -0
- datra-0.1.0/tests/test_outliers.py +45 -0
- datra-0.1.0/tests/test_profile.py +40 -0
- datra-0.1.0/tests/test_save.py +42 -0
- datra-0.1.0/tests/test_score.py +86 -0
- datra-0.1.0/tests/test_uniqueness.py +48 -0
datra-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Raphael James
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
datra-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datra
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python library for cleaning, auditing, and validating tabular data.
|
|
5
|
+
Author-email: Raphael <raphaeljames897@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Raphael James
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/Raphaelj1/datra
|
|
29
|
+
Project-URL: Repository, https://github.com/Raphaelj1/datra
|
|
30
|
+
Project-URL: Issues, https://github.com/Raphaelj1/datra/issues
|
|
31
|
+
Keywords: data-cleaning,data-quality,validation,pandas,data-analysis
|
|
32
|
+
Classifier: Development Status :: 3 - Alpha
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
36
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Programming Language :: Python :: 3
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
41
|
+
Requires-Python: >=3.10
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
License-File: LICENSE
|
|
44
|
+
Requires-Dist: pandas<3.0,>=2.0
|
|
45
|
+
Requires-Dist: openpyxl>=3.1
|
|
46
|
+
Provides-Extra: dev
|
|
47
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
48
|
+
Requires-Dist: build; extra == "dev"
|
|
49
|
+
Requires-Dist: twine; extra == "dev"
|
|
50
|
+
Dynamic: license-file
|
|
51
|
+
|
|
52
|
+
# Datra
|
|
53
|
+
|
|
54
|
+
A lightweight Python library for cleaning, auditing, and validating tabular data. It helps data scientists, analysts, and engineers quickly identify data quality issues, clean datasets using simple rules, and generate reports.
|
|
55
|
+
|
|
56
|
+
## Why Datra?
|
|
57
|
+
|
|
58
|
+
Data quality problems often consume more time than the analysis itself. Missing values, duplicate records, inconsistent column names, and invalid entries can silently affect downstream models and business decisions.
|
|
59
|
+
|
|
60
|
+
Datra provides a simple workflow for understanding and improving dataset quality before analysis or machine learning.
|
|
61
|
+
|
|
62
|
+
With Datra, you can:
|
|
63
|
+
|
|
64
|
+
- Audit datasets to identify quality issues.
|
|
65
|
+
- Clean data using configurable rules.
|
|
66
|
+
- Validate datasets against business rules.
|
|
67
|
+
- Generate JSON and HTML quality reports.
|
|
68
|
+
- Work directly with Pandas DataFrames or CSV and Excel files.
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
- Dataset profiling
|
|
73
|
+
- Missing value analysis
|
|
74
|
+
- Duplicate detection
|
|
75
|
+
- Outlier detection (IQR-based)
|
|
76
|
+
- Rule-based data validation
|
|
77
|
+
- Automated data quality scoring
|
|
78
|
+
- Configurable data cleaning
|
|
79
|
+
- Column name standardization
|
|
80
|
+
- Support for Pandas DataFrames
|
|
81
|
+
- CSV and Excel file support
|
|
82
|
+
- JSON and HTML report generation
|
|
83
|
+
- Save cleaned datasets directly to disk
|
|
84
|
+
|
|
85
|
+
## Installation
|
|
86
|
+
|
|
87
|
+
Install Datra from PyPI:
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install datra
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Or install the latest development version:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git clone https://github.com/Raphaelj1/datra.git
|
|
97
|
+
|
|
98
|
+
cd datra
|
|
99
|
+
|
|
100
|
+
pip install -e .
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Quick Start
|
|
104
|
+
|
|
105
|
+
### Clean a dataset
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from datra import clean
|
|
109
|
+
|
|
110
|
+
cleaned = clean(
|
|
111
|
+
"patients.csv",
|
|
112
|
+
drop_duplicates=True,
|
|
113
|
+
fill_numeric="median",
|
|
114
|
+
fill_categorical="mode",
|
|
115
|
+
standardize_columns=True,
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Audit a dataset
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from datra import Audit
|
|
123
|
+
|
|
124
|
+
audit = Audit("patients.csv")
|
|
125
|
+
|
|
126
|
+
print(audit.profile)
|
|
127
|
+
print(audit.score)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Validate a dataset
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
rules = {
|
|
134
|
+
"Age": {
|
|
135
|
+
"min": 0,
|
|
136
|
+
"max": 120,
|
|
137
|
+
},
|
|
138
|
+
"Gender": {
|
|
139
|
+
"allowed": [
|
|
140
|
+
"Male",
|
|
141
|
+
"Female",
|
|
142
|
+
],
|
|
143
|
+
},
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
report = audit.validate(rules)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Cleaning Data
|
|
150
|
+
|
|
151
|
+
The `clean()` function applies one or more cleaning operations to a dataset and returns a new DataFrame. It accepts either a Pandas DataFrame or the path to a CSV or Excel file.
|
|
152
|
+
|
|
153
|
+
### Using keyword arguments
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from datra import clean
|
|
157
|
+
|
|
158
|
+
cleaned = clean(
|
|
159
|
+
"patients.csv",
|
|
160
|
+
drop_duplicates=True,
|
|
161
|
+
fill_numeric="median",
|
|
162
|
+
fill_categorical="mode",
|
|
163
|
+
standardize_columns=True,
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Using cleaning rules
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
rules = {
|
|
171
|
+
"duplicates": {
|
|
172
|
+
"drop": True,
|
|
173
|
+
},
|
|
174
|
+
"missing": {
|
|
175
|
+
"numeric": "median",
|
|
176
|
+
"categorical": "mode",
|
|
177
|
+
},
|
|
178
|
+
"columns": {
|
|
179
|
+
"standardize": True,
|
|
180
|
+
},
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
cleaned = clean("patients.csv", rules=rules)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
### Save the cleaned dataset
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
clean(
|
|
190
|
+
"patients.csv",
|
|
191
|
+
drop_duplicates=True,
|
|
192
|
+
output="cleaned_patients.xlsx",
|
|
193
|
+
)
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Auditing Data
|
|
197
|
+
|
|
198
|
+
Create an audit object to inspect dataset quality.
|
|
199
|
+
|
|
200
|
+
```python
|
|
201
|
+
from datra import Audit
|
|
202
|
+
|
|
203
|
+
audit = Audit("patients.csv")
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
Retrieve individual quality checks.
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
audit.profile
|
|
210
|
+
|
|
211
|
+
audit.completeness
|
|
212
|
+
|
|
213
|
+
audit.uniqueness
|
|
214
|
+
|
|
215
|
+
audit.outliers
|
|
216
|
+
|
|
217
|
+
audit.score
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
Or access all audit results at once.
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
audit.results
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Validation
|
|
227
|
+
|
|
228
|
+
Validate datasets against custom business rules.
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
rules = {
|
|
232
|
+
"Age": {
|
|
233
|
+
"min": 0,
|
|
234
|
+
"max": 120,
|
|
235
|
+
},
|
|
236
|
+
"Patient ID": {
|
|
237
|
+
"unique": True,
|
|
238
|
+
},
|
|
239
|
+
"Gender": {
|
|
240
|
+
"allowed": [
|
|
241
|
+
"Male",
|
|
242
|
+
"Female",
|
|
243
|
+
],
|
|
244
|
+
},
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
report = audit.validate(rules)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
Validation returns a structured report describing which checks passed, which failed, and the number of violations for each rule.
|
|
251
|
+
|
|
252
|
+
## Reports
|
|
253
|
+
|
|
254
|
+
Build a data quality report as a Python dictionary.
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
from datra import Audit
|
|
258
|
+
|
|
259
|
+
audit = Audit("patients.csv")
|
|
260
|
+
|
|
261
|
+
report = audit.build_report()
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Save the report as JSON.
|
|
265
|
+
|
|
266
|
+
```python
|
|
267
|
+
audit.save_report(
|
|
268
|
+
format="json",
|
|
269
|
+
)
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
Or save it as an HTML report.
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
audit.save_report(
|
|
276
|
+
format="html",
|
|
277
|
+
)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
## Supported File Formats
|
|
281
|
+
|
|
282
|
+
Datra supports both Pandas DataFrames and common tabular file formats.
|
|
283
|
+
|
|
284
|
+
| Input | Supported |
|
|
285
|
+
| ---------------- | --------- |
|
|
286
|
+
| Pandas DataFrame | ✅ |
|
|
287
|
+
| CSV | ✅ |
|
|
288
|
+
| Excel (.xlsx) | ✅ |
|
|
289
|
+
| Excel (.xls) | ✅ |
|
|
290
|
+
|
|
291
|
+
### Report Formats
|
|
292
|
+
|
|
293
|
+
| Format | Supported |
|
|
294
|
+
| ------ | ---------- |
|
|
295
|
+
| JSON | ✅ |
|
|
296
|
+
| HTML | ✅ |
|
|
297
|
+
| PDF | 🚧 Planned |
|
|
298
|
+
|
|
299
|
+
## Project Structure
|
|
300
|
+
|
|
301
|
+
```
|
|
302
|
+
datra/
|
|
303
|
+
├── datra/ # Library source code
|
|
304
|
+
├── examples/ # Example usage
|
|
305
|
+
├── tests/
|
|
306
|
+
├── pyproject.toml
|
|
307
|
+
├── README.md
|
|
308
|
+
└── LICENSE
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
## Roadmap
|
|
312
|
+
|
|
313
|
+
Planned improvements include:
|
|
314
|
+
|
|
315
|
+
- PDF report generation
|
|
316
|
+
- Command-line interface (CLI)
|
|
317
|
+
- Additional cleaning operations
|
|
318
|
+
- Additional validation rules
|
|
319
|
+
- More data quality checks
|
|
320
|
+
- Interactive HTML reports
|
|
321
|
+
- Support for additional file formats
|
|
322
|
+
|
|
323
|
+
## Contributing
|
|
324
|
+
|
|
325
|
+
Contributions, feature requests, and bug reports are welcome.
|
|
326
|
+
|
|
327
|
+
If you would like to contribute:
|
|
328
|
+
|
|
329
|
+
1. Fork the repository.
|
|
330
|
+
2. Create a new feature branch.
|
|
331
|
+
3. Commit your changes.
|
|
332
|
+
4. Open a pull request.
|
|
333
|
+
|
|
334
|
+
Please ensure all tests pass before submitting a pull request.
|
|
335
|
+
|
|
336
|
+
## License
|
|
337
|
+
|
|
338
|
+
This project is licensed under the MIT License.
|
datra-0.1.0/README.md
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
# Datra
|
|
2
|
+
|
|
3
|
+
A lightweight Python library for cleaning, auditing, and validating tabular data. It helps data scientists, analysts, and engineers quickly identify data quality issues, clean datasets using simple rules, and generate reports.
|
|
4
|
+
|
|
5
|
+
## Why Datra?
|
|
6
|
+
|
|
7
|
+
Data quality problems often consume more time than the analysis itself. Missing values, duplicate records, inconsistent column names, and invalid entries can silently affect downstream models and business decisions.
|
|
8
|
+
|
|
9
|
+
Datra provides a simple workflow for understanding and improving dataset quality before analysis or machine learning.
|
|
10
|
+
|
|
11
|
+
With Datra, you can:
|
|
12
|
+
|
|
13
|
+
- Audit datasets to identify quality issues.
|
|
14
|
+
- Clean data using configurable rules.
|
|
15
|
+
- Validate datasets against business rules.
|
|
16
|
+
- Generate JSON and HTML quality reports.
|
|
17
|
+
- Work directly with Pandas DataFrames or CSV and Excel files.
|
|
18
|
+
|
|
19
|
+
## Features
|
|
20
|
+
|
|
21
|
+
- Dataset profiling
|
|
22
|
+
- Missing value analysis
|
|
23
|
+
- Duplicate detection
|
|
24
|
+
- Outlier detection (IQR-based)
|
|
25
|
+
- Rule-based data validation
|
|
26
|
+
- Automated data quality scoring
|
|
27
|
+
- Configurable data cleaning
|
|
28
|
+
- Column name standardization
|
|
29
|
+
- Support for Pandas DataFrames
|
|
30
|
+
- CSV and Excel file support
|
|
31
|
+
- JSON and HTML report generation
|
|
32
|
+
- Save cleaned datasets directly to disk
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
Install Datra from PyPI:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install datra
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or install the latest development version:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
git clone https://github.com/Raphaelj1/datra.git
|
|
46
|
+
|
|
47
|
+
cd datra
|
|
48
|
+
|
|
49
|
+
pip install -e .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
### Clean a dataset
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from datra import clean
|
|
58
|
+
|
|
59
|
+
cleaned = clean(
|
|
60
|
+
"patients.csv",
|
|
61
|
+
drop_duplicates=True,
|
|
62
|
+
fill_numeric="median",
|
|
63
|
+
fill_categorical="mode",
|
|
64
|
+
standardize_columns=True,
|
|
65
|
+
)
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Audit a dataset
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from datra import Audit
|
|
72
|
+
|
|
73
|
+
audit = Audit("patients.csv")
|
|
74
|
+
|
|
75
|
+
print(audit.profile)
|
|
76
|
+
print(audit.score)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Validate a dataset
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
rules = {
|
|
83
|
+
"Age": {
|
|
84
|
+
"min": 0,
|
|
85
|
+
"max": 120,
|
|
86
|
+
},
|
|
87
|
+
"Gender": {
|
|
88
|
+
"allowed": [
|
|
89
|
+
"Male",
|
|
90
|
+
"Female",
|
|
91
|
+
],
|
|
92
|
+
},
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
report = audit.validate(rules)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Cleaning Data
|
|
99
|
+
|
|
100
|
+
The `clean()` function applies one or more cleaning operations to a dataset and returns a new DataFrame. It accepts either a Pandas DataFrame or the path to a CSV or Excel file.
|
|
101
|
+
|
|
102
|
+
### Using keyword arguments
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from datra import clean
|
|
106
|
+
|
|
107
|
+
cleaned = clean(
|
|
108
|
+
"patients.csv",
|
|
109
|
+
drop_duplicates=True,
|
|
110
|
+
fill_numeric="median",
|
|
111
|
+
fill_categorical="mode",
|
|
112
|
+
standardize_columns=True,
|
|
113
|
+
)
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Using cleaning rules
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
rules = {
|
|
120
|
+
"duplicates": {
|
|
121
|
+
"drop": True,
|
|
122
|
+
},
|
|
123
|
+
"missing": {
|
|
124
|
+
"numeric": "median",
|
|
125
|
+
"categorical": "mode",
|
|
126
|
+
},
|
|
127
|
+
"columns": {
|
|
128
|
+
"standardize": True,
|
|
129
|
+
},
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
cleaned = clean("patients.csv", rules=rules)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
### Save the cleaned dataset
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
clean(
|
|
139
|
+
"patients.csv",
|
|
140
|
+
drop_duplicates=True,
|
|
141
|
+
output="cleaned_patients.xlsx",
|
|
142
|
+
)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Auditing Data
|
|
146
|
+
|
|
147
|
+
Create an audit object to inspect dataset quality.
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from datra import Audit
|
|
151
|
+
|
|
152
|
+
audit = Audit("patients.csv")
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Retrieve individual quality checks.
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
audit.profile
|
|
159
|
+
|
|
160
|
+
audit.completeness
|
|
161
|
+
|
|
162
|
+
audit.uniqueness
|
|
163
|
+
|
|
164
|
+
audit.outliers
|
|
165
|
+
|
|
166
|
+
audit.score
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
Or access all audit results at once.
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
audit.results
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Validation
|
|
176
|
+
|
|
177
|
+
Validate datasets against custom business rules.
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
rules = {
|
|
181
|
+
"Age": {
|
|
182
|
+
"min": 0,
|
|
183
|
+
"max": 120,
|
|
184
|
+
},
|
|
185
|
+
"Patient ID": {
|
|
186
|
+
"unique": True,
|
|
187
|
+
},
|
|
188
|
+
"Gender": {
|
|
189
|
+
"allowed": [
|
|
190
|
+
"Male",
|
|
191
|
+
"Female",
|
|
192
|
+
],
|
|
193
|
+
},
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
report = audit.validate(rules)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Validation returns a structured report describing which checks passed, which failed, and the number of violations for each rule.
|
|
200
|
+
|
|
201
|
+
## Reports
|
|
202
|
+
|
|
203
|
+
Build a data quality report as a Python dictionary.
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
from datra import Audit
|
|
207
|
+
|
|
208
|
+
audit = Audit("patients.csv")
|
|
209
|
+
|
|
210
|
+
report = audit.build_report()
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
Save the report as JSON.
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
audit.save_report(
|
|
217
|
+
format="json",
|
|
218
|
+
)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Or save it as an HTML report.
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
audit.save_report(
|
|
225
|
+
format="html",
|
|
226
|
+
)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
## Supported File Formats
|
|
230
|
+
|
|
231
|
+
Datra supports both Pandas DataFrames and common tabular file formats.
|
|
232
|
+
|
|
233
|
+
| Input | Supported |
|
|
234
|
+
| ---------------- | --------- |
|
|
235
|
+
| Pandas DataFrame | ✅ |
|
|
236
|
+
| CSV | ✅ |
|
|
237
|
+
| Excel (.xlsx) | ✅ |
|
|
238
|
+
| Excel (.xls) | ✅ |
|
|
239
|
+
|
|
240
|
+
### Report Formats
|
|
241
|
+
|
|
242
|
+
| Format | Supported |
|
|
243
|
+
| ------ | ---------- |
|
|
244
|
+
| JSON | ✅ |
|
|
245
|
+
| HTML | ✅ |
|
|
246
|
+
| PDF | 🚧 Planned |
|
|
247
|
+
|
|
248
|
+
## Project Structure
|
|
249
|
+
|
|
250
|
+
```
|
|
251
|
+
datra/
|
|
252
|
+
├── datra/ # Library source code
|
|
253
|
+
├── examples/ # Example usage
|
|
254
|
+
├── tests/
|
|
255
|
+
├── pyproject.toml
|
|
256
|
+
├── README.md
|
|
257
|
+
└── LICENSE
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Roadmap
|
|
261
|
+
|
|
262
|
+
Planned improvements include:
|
|
263
|
+
|
|
264
|
+
- PDF report generation
|
|
265
|
+
- Command-line interface (CLI)
|
|
266
|
+
- Additional cleaning operations
|
|
267
|
+
- Additional validation rules
|
|
268
|
+
- More data quality checks
|
|
269
|
+
- Interactive HTML reports
|
|
270
|
+
- Support for additional file formats
|
|
271
|
+
|
|
272
|
+
## Contributing
|
|
273
|
+
|
|
274
|
+
Contributions, feature requests, and bug reports are welcome.
|
|
275
|
+
|
|
276
|
+
If you would like to contribute:
|
|
277
|
+
|
|
278
|
+
1. Fork the repository.
|
|
279
|
+
2. Create a new feature branch.
|
|
280
|
+
3. Commit your changes.
|
|
281
|
+
4. Open a pull request.
|
|
282
|
+
|
|
283
|
+
Please ensure all tests pass before submitting a pull request.
|
|
284
|
+
|
|
285
|
+
## License
|
|
286
|
+
|
|
287
|
+
This project is licensed under the MIT License.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from functools import cached_property
|
|
2
|
+
|
|
3
|
+
from datra.io import load
|
|
4
|
+
from datra.checks.validate import validate as validate_df
|
|
5
|
+
from datra.checks.completeness import completeness as check_completeness
|
|
6
|
+
from datra.checks.uniqueness import uniqueness as check_uniqueness
|
|
7
|
+
from datra.checks.outliers import outliers as check_outliers
|
|
8
|
+
from datra.checks.profile import profile as check_profile
|
|
9
|
+
from datra.scoring.score import calculate_score
|
|
10
|
+
from datra.reports import build_report, save_report
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Audit:
    """Run data-quality checks over one loaded dataset.

    Each individual check is exposed as a lazily evaluated attribute: it is
    computed on first access and the value is then cached on the instance.
    ``results`` gathers all of them into a single dictionary.
    """

    # Attribute names handed to the scorer, in evaluation order.
    _SCORE_INPUTS = ("completeness", "uniqueness", "outliers")
    # Attribute names collected by ``results``, in output order.
    _RESULT_FIELDS = (
        "profile",
        "completeness",
        "uniqueness",
        "outliers",
        "score",
    )

    def __init__(self, input_data):
        """Pass ``input_data`` through ``load`` and keep the result as ``self.df``."""
        self.df = load(input_data)

    @cached_property
    def completeness(self):
        """Result of the completeness check on ``self.df`` (cached)."""
        return check_completeness(self.df)

    @cached_property
    def uniqueness(self):
        """Result of the uniqueness check on ``self.df`` (cached)."""
        return check_uniqueness(self.df)

    @cached_property
    def outliers(self):
        """Result of the outlier check on ``self.df`` (cached)."""
        return check_outliers(self.df)

    @cached_property
    def profile(self):
        """Result of the profiling check on ``self.df`` (cached)."""
        return check_profile(self.df)

    @cached_property
    def score(self):
        """Score computed from the completeness, uniqueness and outlier results (cached)."""
        scorer_input = {
            check_name: getattr(self, check_name)
            for check_name in self._SCORE_INPUTS
        }
        return calculate_score(scorer_input)

    @property
    def results(self):
        """Return a new dict holding every check result plus the score.

        The dict itself is rebuilt on each access; the values inside it come
        from the cached attributes.
        """
        return {
            field_name: getattr(self, field_name)
            for field_name in self._RESULT_FIELDS
        }

    def validate(self, rules: dict):
        """Validate ``self.df`` against ``rules`` and return the validator's report."""
        return validate_df(self.df, rules)

    def build_report(self):
        """Build a report from ``self.results`` and return it."""
        return build_report(self.results)

    def save_report(self, path="outputs", format="json"):
        """Build the report and hand it to the report saver.

        ``path`` is forwarded as the saver's ``output`` argument and
        ``format`` is forwarded unchanged; the saver's return value is
        returned as-is.
        """
        return save_report(self.build_report(), output=path, format=format)
|
|
File without changes
|