cleanalytix 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cleanalytix-0.1.0/LICENSE +21 -0
- cleanalytix-0.1.0/PKG-INFO +235 -0
- cleanalytix-0.1.0/README.md +180 -0
- cleanalytix-0.1.0/cleanalytix/__init__.py +39 -0
- cleanalytix-0.1.0/cleanalytix/adjust_prod_meta.py +135 -0
- cleanalytix-0.1.0/cleanalytix/cleaning_recommendations.py +107 -0
- cleanalytix-0.1.0/cleanalytix/compute_dq_score.py +235 -0
- cleanalytix-0.1.0/cleanalytix/generate_meta.py +551 -0
- cleanalytix-0.1.0/cleanalytix/get_cleaned_data.py +466 -0
- cleanalytix-0.1.0/cleanalytix/get_table_for_DQ_computation.py +65 -0
- cleanalytix-0.1.0/cleanalytix/pipeline.py +188 -0
- cleanalytix-0.1.0/cleanalytix/preprocess_types.py +43 -0
- cleanalytix-0.1.0/cleanalytix/summarize_dataset_health.py +19 -0
- cleanalytix-0.1.0/cleanalytix/version.py +2 -0
- cleanalytix-0.1.0/cleanalytix.egg-info/PKG-INFO +235 -0
- cleanalytix-0.1.0/cleanalytix.egg-info/SOURCES.txt +20 -0
- cleanalytix-0.1.0/cleanalytix.egg-info/dependency_links.txt +1 -0
- cleanalytix-0.1.0/cleanalytix.egg-info/requires.txt +9 -0
- cleanalytix-0.1.0/cleanalytix.egg-info/top_level.txt +1 -0
- cleanalytix-0.1.0/pyproject.toml +64 -0
- cleanalytix-0.1.0/setup.cfg +4 -0
- cleanalytix-0.1.0/tests/test_pipeline.py +118 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Probot-DATA contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cleanalytix
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cleanalytix is a modular Python library for profiling, scoring, and cleaning tabular datasets.
|
|
5
|
+
Author: Probot-DATA contributors
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Probot-DATA contributors
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/Probot-DATA/Cleanalytix_Repo
|
|
29
|
+
Project-URL: Repository, https://github.com/Probot-DATA/Cleanalytix_Repo
|
|
30
|
+
Project-URL: Issues, https://github.com/Probot-DATA/Cleanalytix_Repo/issues
|
|
31
|
+
Keywords: data quality,data cleaning,data profiling,EDA,machine learning,pandas
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: Intended Audience :: Science/Research
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
42
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
43
|
+
Requires-Python: >=3.9
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
License-File: LICENSE
|
|
46
|
+
Requires-Dist: pandas>=1.5.0
|
|
47
|
+
Requires-Dist: numpy>=1.23.0
|
|
48
|
+
Requires-Dist: scikit-learn>=1.1.0
|
|
49
|
+
Requires-Dist: nltk>=3.8.0
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
52
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
53
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
54
|
+
Dynamic: license-file
|
|
55
|
+
|
|
56
|
+
# Cleanalytix
|
|
57
|
+
|
|
58
|
+
`Cleanalytix` is a Python library for profiling, scoring, cleaning, and monitoring the quality of tabular datasets with a single pipeline.
|
|
59
|
+
|
|
60
|
+
It is designed for pandas-first workflows and supports:
|
|
61
|
+
|
|
62
|
+
- baseline dataset profiling and scoring
|
|
63
|
+
- optional cleaning recommendations and automatic cleaning
|
|
64
|
+
- optional production/new-dataset monitoring
|
|
65
|
+
- optional business rules, thresholds, weights, and type inference for new data
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
From a source checkout:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
git clone https://github.com/Probot-DATA/Cleanalytix_Repo
|
|
73
|
+
cd Cleanalytix_Repo
|
|
74
|
+
pip install -e ".[dev]"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
To install the released package from PyPI:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install cleanalytix
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Runtime requirements:
|
|
84
|
+
|
|
85
|
+
- Python 3.9+
|
|
86
|
+
- pandas
|
|
87
|
+
- numpy
|
|
88
|
+
- scikit-learn
|
|
89
|
+
- nltk
|
|
90
|
+
|
|
91
|
+
## Quick Start
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
import pandas as pd
|
|
95
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
96
|
+
|
|
97
|
+
df = pd.read_csv("my_data.csv")
|
|
98
|
+
|
|
99
|
+
result = Run_DQ_Pipeline(
|
|
100
|
+
dataset_names=["my_dataset"],
|
|
101
|
+
dataset_list=[df],
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
print(result["base_data"]["dirty_scores"])
|
|
105
|
+
print(result["base_data"]["meta_before_cleaning"])
|
|
106
|
+
print(result["base_data"]["recommendations"])
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Production / Monitoring Example
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import pandas as pd
|
|
113
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
114
|
+
|
|
115
|
+
train_df = pd.read_csv("train.csv")
|
|
116
|
+
prod_df = pd.read_csv("production.csv")
|
|
117
|
+
|
|
118
|
+
rules = {
|
|
119
|
+
"age": lambda value: pd.isna(value) or 0 <= float(value) <= 120,
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
result = Run_DQ_Pipeline(
|
|
123
|
+
dataset_names=["customers"],
|
|
124
|
+
dataset_list=[train_df],
|
|
125
|
+
new_dataset_list=[prod_df],
|
|
126
|
+
rules=rules,
|
|
127
|
+
cleaning=True,
|
|
128
|
+
interactive=False,
|
|
129
|
+
score_mode="exponential",
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
print(result["base_data"]["dirty_scores"])
|
|
133
|
+
print(result["base_data"]["cleaned_scores"])
|
|
134
|
+
print(result["prod_data"]["dirty_scores"])
|
|
135
|
+
print(result["prod_data"]["change_log"])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
## Public API
|
|
139
|
+
|
|
140
|
+
The primary entrypoint is:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Additional building blocks are also exported:
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from cleanalytix import (
|
|
150
|
+
Compute_DQ_Score,
|
|
151
|
+
DEFAULT_THRESHOLDS,
|
|
152
|
+
generate_meta,
|
|
153
|
+
cleaning_recommendations,
|
|
154
|
+
get_cleaned_data,
|
|
155
|
+
get_table_for_DQ_computation,
|
|
156
|
+
summarize_dataset_health,
|
|
157
|
+
learn_reference_profile,
|
|
158
|
+
adjust_prod_meta_with_reference,
|
|
159
|
+
infer_and_fix_types,
|
|
160
|
+
)
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Pipeline Output Structure
|
|
164
|
+
|
|
165
|
+
`Run_DQ_Pipeline` returns:
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
{
|
|
169
|
+
"base_data": {...},
|
|
170
|
+
"prod_data": {...},
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
Each block preserves the same keys:
|
|
175
|
+
|
|
176
|
+
- `dirty_scores`
|
|
177
|
+
- `cleaned_scores`
|
|
178
|
+
- `cleaned_datasets`
|
|
179
|
+
- `meta_before_cleaning`
|
|
180
|
+
- `meta_after_cleaning`
|
|
181
|
+
- `recommendations`
|
|
182
|
+
- `change_log`
|
|
183
|
+
- `summarized_before`
|
|
184
|
+
- `summarized_after`
|
|
185
|
+
- `main_metrics_before`
|
|
186
|
+
- `main_metrics_after`
|
|
187
|
+
|
|
188
|
+
## Examples
|
|
189
|
+
|
|
190
|
+
Runnable examples live in [examples](./examples):
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
python examples/simple_usage.py
|
|
194
|
+
python examples/production_usage.py
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
These examples assume the package has already been installed in the active environment.
|
|
198
|
+
|
|
199
|
+
## Validation
|
|
200
|
+
|
|
201
|
+
The [validation](./validation) folder contains a portable real-world validation workflow.
|
|
202
|
+
|
|
203
|
+
- Large raw datasets are intentionally not committed to the repository.
|
|
204
|
+
- Put the expected files under `validation/datasets/` by following
|
|
205
|
+
[validation/datasets/README.md](./validation/datasets/README.md).
|
|
206
|
+
- Run the validation script:
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
python validation/run_validation.py
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
- The script saves non-empty outputs to `validation/outputs/<dataset_name>/`.
|
|
213
|
+
- The notebook [validation/main.ipynb](./validation/main.ipynb) uses the same relative-path workflow.
|
|
214
|
+
|
|
215
|
+
## Repository Layout
|
|
216
|
+
|
|
217
|
+
- `cleanalytix/` - installable library package
|
|
218
|
+
- `examples/` - small runnable examples
|
|
219
|
+
- `tests/` - smoke tests and lightweight sample fixtures
|
|
220
|
+
- `validation/` - public-friendly validation workflow and output folder
|
|
221
|
+
- `archive/legacy/` - historical prototype notebook/code kept for reference, not for active use
|
|
222
|
+
|
|
223
|
+
## Known Limitations
|
|
224
|
+
|
|
225
|
+
- Validation datasets are not bundled with the repository.
|
|
226
|
+
- The yellow taxi validation workflow samples the first `20,000` rows from each configured monthly file to match the original project workflow and to keep validation practical.
|
|
227
|
+
- Interactive cleaning is intended for notebook/CLI use and will prompt for input when `interactive=True`.
|
|
228
|
+
|
|
229
|
+
## Contributing
|
|
230
|
+
|
|
231
|
+
See [CONTRIBUTING.md](./CONTRIBUTING.md).
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
[MIT](./LICENSE) (c) 2026 Probot-DATA contributors
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# Cleanalytix
|
|
2
|
+
|
|
3
|
+
`Cleanalytix` is a Python library for profiling, scoring, cleaning, and monitoring the quality of tabular datasets with a single pipeline.
|
|
4
|
+
|
|
5
|
+
It is designed for pandas-first workflows and supports:
|
|
6
|
+
|
|
7
|
+
- baseline dataset profiling and scoring
|
|
8
|
+
- optional cleaning recommendations and automatic cleaning
|
|
9
|
+
- optional production/new-dataset monitoring
|
|
10
|
+
- optional business rules, thresholds, weights, and type inference for new data
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
From a source checkout:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
git clone https://github.com/Probot-DATA/Cleanalytix_Repo
|
|
18
|
+
cd Cleanalytix_Repo
|
|
19
|
+
pip install -e ".[dev]"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
To install the released package from PyPI:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install cleanalytix
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Runtime requirements:
|
|
29
|
+
|
|
30
|
+
- Python 3.9+
|
|
31
|
+
- pandas
|
|
32
|
+
- numpy
|
|
33
|
+
- scikit-learn
|
|
34
|
+
- nltk
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
import pandas as pd
|
|
40
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
41
|
+
|
|
42
|
+
df = pd.read_csv("my_data.csv")
|
|
43
|
+
|
|
44
|
+
result = Run_DQ_Pipeline(
|
|
45
|
+
dataset_names=["my_dataset"],
|
|
46
|
+
dataset_list=[df],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
print(result["base_data"]["dirty_scores"])
|
|
50
|
+
print(result["base_data"]["meta_before_cleaning"])
|
|
51
|
+
print(result["base_data"]["recommendations"])
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Production / Monitoring Example
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
import pandas as pd
|
|
58
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
59
|
+
|
|
60
|
+
train_df = pd.read_csv("train.csv")
|
|
61
|
+
prod_df = pd.read_csv("production.csv")
|
|
62
|
+
|
|
63
|
+
rules = {
|
|
64
|
+
"age": lambda value: pd.isna(value) or 0 <= float(value) <= 120,
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
result = Run_DQ_Pipeline(
|
|
68
|
+
dataset_names=["customers"],
|
|
69
|
+
dataset_list=[train_df],
|
|
70
|
+
new_dataset_list=[prod_df],
|
|
71
|
+
rules=rules,
|
|
72
|
+
cleaning=True,
|
|
73
|
+
interactive=False,
|
|
74
|
+
score_mode="exponential",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
print(result["base_data"]["dirty_scores"])
|
|
78
|
+
print(result["base_data"]["cleaned_scores"])
|
|
79
|
+
print(result["prod_data"]["dirty_scores"])
|
|
80
|
+
print(result["prod_data"]["change_log"])
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Public API
|
|
84
|
+
|
|
85
|
+
The primary entrypoint is:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from cleanalytix import Run_DQ_Pipeline
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Additional building blocks are also exported:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from cleanalytix import (
|
|
95
|
+
Compute_DQ_Score,
|
|
96
|
+
DEFAULT_THRESHOLDS,
|
|
97
|
+
generate_meta,
|
|
98
|
+
cleaning_recommendations,
|
|
99
|
+
get_cleaned_data,
|
|
100
|
+
get_table_for_DQ_computation,
|
|
101
|
+
summarize_dataset_health,
|
|
102
|
+
learn_reference_profile,
|
|
103
|
+
adjust_prod_meta_with_reference,
|
|
104
|
+
infer_and_fix_types,
|
|
105
|
+
)
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Pipeline Output Structure
|
|
109
|
+
|
|
110
|
+
`Run_DQ_Pipeline` returns:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
{
|
|
114
|
+
"base_data": {...},
|
|
115
|
+
"prod_data": {...},
|
|
116
|
+
}
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Each block preserves the same keys:
|
|
120
|
+
|
|
121
|
+
- `dirty_scores`
|
|
122
|
+
- `cleaned_scores`
|
|
123
|
+
- `cleaned_datasets`
|
|
124
|
+
- `meta_before_cleaning`
|
|
125
|
+
- `meta_after_cleaning`
|
|
126
|
+
- `recommendations`
|
|
127
|
+
- `change_log`
|
|
128
|
+
- `summarized_before`
|
|
129
|
+
- `summarized_after`
|
|
130
|
+
- `main_metrics_before`
|
|
131
|
+
- `main_metrics_after`
|
|
132
|
+
|
|
133
|
+
## Examples
|
|
134
|
+
|
|
135
|
+
Runnable examples live in [examples](./examples):
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
python examples/simple_usage.py
|
|
139
|
+
python examples/production_usage.py
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
These examples assume the package has already been installed in the active environment.
|
|
143
|
+
|
|
144
|
+
## Validation
|
|
145
|
+
|
|
146
|
+
The [validation](./validation) folder contains a portable real-world validation workflow.
|
|
147
|
+
|
|
148
|
+
- Large raw datasets are intentionally not committed to the repository.
|
|
149
|
+
- Put the expected files under `validation/datasets/` by following
|
|
150
|
+
[validation/datasets/README.md](./validation/datasets/README.md).
|
|
151
|
+
- Run the validation script:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
python validation/run_validation.py
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
- The script saves non-empty outputs to `validation/outputs/<dataset_name>/`.
|
|
158
|
+
- The notebook [validation/main.ipynb](./validation/main.ipynb) uses the same relative-path workflow.
|
|
159
|
+
|
|
160
|
+
## Repository Layout
|
|
161
|
+
|
|
162
|
+
- `cleanalytix/` - installable library package
|
|
163
|
+
- `examples/` - small runnable examples
|
|
164
|
+
- `tests/` - smoke tests and lightweight sample fixtures
|
|
165
|
+
- `validation/` - public-friendly validation workflow and output folder
|
|
166
|
+
- `archive/legacy/` - historical prototype notebook/code kept for reference, not for active use
|
|
167
|
+
|
|
168
|
+
## Known Limitations
|
|
169
|
+
|
|
170
|
+
- Validation datasets are not bundled with the repository.
|
|
171
|
+
- The yellow taxi validation workflow samples the first `20,000` rows from each configured monthly file to match the original project workflow and to keep validation practical.
|
|
172
|
+
- Interactive cleaning is intended for notebook/CLI use and will prompt for input when `interactive=True`.
|
|
173
|
+
|
|
174
|
+
## Contributing
|
|
175
|
+
|
|
176
|
+
See [CONTRIBUTING.md](./CONTRIBUTING.md).
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
[MIT](./LICENSE) (c) 2026 Probot-DATA contributors
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Cleanalytix - Data Quality Framework.
|
|
3
|
+
|
|
4
|
+
Quick start
|
|
5
|
+
-----------
|
|
6
|
+
>>> from cleanalytix import Run_DQ_Pipeline
|
|
7
|
+
>>> result = Run_DQ_Pipeline(["my_dataset"], [df])
|
|
8
|
+
>>> print(result["base_data"]["dirty_scores"])
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .version import __author__, __version__
|
|
12
|
+
from .pipeline import Run_DQ_Pipeline
|
|
13
|
+
from .compute_dq_score import Compute_DQ_Score, DEFAULT_THRESHOLDS
|
|
14
|
+
from .generate_meta import generate_meta
|
|
15
|
+
from .cleaning_recommendations import cleaning_recommendations
|
|
16
|
+
from .get_cleaned_data import get_cleaned_data
|
|
17
|
+
from .get_table_for_DQ_computation import get_table_for_DQ_computation
|
|
18
|
+
from .summarize_dataset_health import summarize_dataset_health
|
|
19
|
+
from .adjust_prod_meta import learn_reference_profile, adjust_prod_meta_with_reference
|
|
20
|
+
from .preprocess_types import infer_and_fix_types
|
|
21
|
+
|
|
22
|
+
# Lowercase alias so the pipeline entry point can also be imported as ``run_dq_pipeline``.
run_dq_pipeline = Run_DQ_Pipeline
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"Run_DQ_Pipeline",
|
|
26
|
+
"run_dq_pipeline",
|
|
27
|
+
"Compute_DQ_Score",
|
|
28
|
+
"DEFAULT_THRESHOLDS",
|
|
29
|
+
"__author__",
|
|
30
|
+
"__version__",
|
|
31
|
+
"generate_meta",
|
|
32
|
+
"cleaning_recommendations",
|
|
33
|
+
"get_cleaned_data",
|
|
34
|
+
"get_table_for_DQ_computation",
|
|
35
|
+
"summarize_dataset_health",
|
|
36
|
+
"learn_reference_profile",
|
|
37
|
+
"adjust_prod_meta_with_reference",
|
|
38
|
+
"infer_and_fix_types",
|
|
39
|
+
]
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# cleanalytix/adjust_prod_meta.py
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
EPS = 1e-9
|
|
6
|
+
|
|
7
|
+
def _reference_bounds(ser, skew):
    """Return ``(lower, upper, method)`` outlier bounds for a non-empty numeric series.

    The rule is selected from the absolute skewness:

    - ``"z"``   (|skew| < 0.5): mean +/- 3 * population std (``ddof=0``)
    - ``"iqr"`` (|skew| < 1):   Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
    - ``"mad"`` (otherwise):    median +/- (3.5 / 0.6745) * MAD

    A NaN skewness fails both comparisons and therefore uses the MAD rule.
    When the spread statistic (std, IQR or MAD) is zero or NaN no meaningful
    bound exists and ``(-inf, +inf)`` is returned.
    """
    abs_skew = abs(skew)
    if abs_skew < 0.5:
        method = "z"
        low_anchor = high_anchor = ser.mean()
        spread = ser.std(ddof=0) if ser.size > 1 else 0.0
        reach = 3.0 * spread
    elif abs_skew < 1:
        method = "iqr"
        low_anchor, high_anchor = ser.quantile(0.25), ser.quantile(0.75)
        spread = high_anchor - low_anchor
        reach = 1.5 * spread
    else:
        method = "mad"
        low_anchor = high_anchor = ser.median()
        spread = np.median(np.abs(ser - low_anchor))
        reach = (3.5 / 0.6745) * spread

    # A degenerate spread would collapse the interval and flag every
    # differing value, so treat the column as unbounded instead.
    if spread == 0 or np.isnan(spread):
        return -np.inf, np.inf, method
    return low_anchor - reach, high_anchor + reach, method


def learn_reference_profile(dataset_names, dataset_list, rare_pct_threshold=5.0):
    """
    Learn per-column numeric bounds and popular categories from reference datasets.

    Parameters
    ----------
    dataset_names : sequence
        Keys under which each dataset's profile is stored.
    dataset_list : sequence of pd.DataFrame
        Reference datasets, paired positionally with ``dataset_names`` via
        ``zip`` (surplus entries in the longer sequence are ignored).
    rare_pct_threshold : float, default 5.0
        A category is "popular" when its share of the non-null values, in
        percent, is strictly greater than this threshold.

    Returns
    -------
    dict
        ``profile[dataset_name][column]`` is one of:

        - ``{"type": "empty"}`` when the column has no non-null values;
        - ``{"type": "numeric", "lower", "upper", "method", "skewness"}`` for
          numeric dtypes, where ``method`` is ``"z"``, ``"iqr"`` or ``"mad"``
          and non-finite bounds are stored as NaN;
        - ``{"type": "categorical", "popular"}`` for everything else, where
          ``popular`` lists the string-cast popular values.
    """
    profile = {}
    for ds_name, df in zip(dataset_names, dataset_list):
        columns = {}
        profile[ds_name] = columns
        for col in df.columns:
            ser = df[col].dropna()
            if ser.empty:
                columns[col] = {"type": "empty"}
                continue

            if pd.api.types.is_numeric_dtype(ser):
                # Skewness is only computed for more than two values; shorter
                # series are treated as symmetric.
                skew = ser.skew() if len(ser) > 2 else 0.0
                lower, upper, method = _reference_bounds(ser, skew)
                columns[col] = {
                    "type": "numeric",
                    # Infinite bounds mean "no usable bound" and are stored as NaN.
                    "lower": float(lower) if np.isfinite(lower) else np.nan,
                    "upper": float(upper) if np.isfinite(upper) else np.nan,
                    "method": method,
                    "skewness": float(skew),
                }
            else:
                # Compare categories as strings so mixed-type values are
                # counted consistently.
                freq = ser.astype(str).value_counts(normalize=True) * 100
                columns[col] = {
                    "type": "categorical",
                    "popular": freq[freq > rare_pct_threshold].index.tolist(),
                }
    return profile
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _current_meta_value(meta, mask, column):
    """Return the first value of ``column`` in the rows selected by ``mask``, with NaN read as 0."""
    return meta.loc[mask, column].fillna(0).values[0]


def adjust_prod_meta_with_reference(profile, reference_meta, prod_meta, prod_dataset_names, prod_dataset_list):
    """
    Adjust production meta statistics using the learned reference profile.

    Returns a deep copy of ``prod_meta`` with two fields overwritten:

    - ``outlier_count``: recomputed as the number of non-null production
      values outside the numeric bounds stored in the reference profile, so
      production outliers are judged against the reference distribution
      rather than their own.
    - ``rare_category_percent``: recomputed as the percentage of non-null
      production values (compared as strings) that are not in the reference
      profile's popular category list.

    The value already present in ``prod_meta`` is kept whenever it cannot be
    recomputed: the profile has no finite bounds for the column, the
    production values cannot be compared with the bounds (for example the
    column arrived as strings), or the profile lists no popular categories.
    Columns that are missing from the profile, have no matching meta row, or
    contain only nulls are left untouched.

    Parameters
    ----------
    profile : dict
        Output of ``learn_reference_profile``.
    reference_meta : pd.DataFrame
        Meta table for the reference (training) split -- accepted but not
        currently consumed; reserved for future drift-metric alignment.
    prod_meta : pd.DataFrame
        Meta table for the production dataset (output of ``generate_meta``).
        Must contain ``dataset_name`` and ``column_name`` columns.
    prod_dataset_names : list of str
        Names used both to look up ``profile`` and to match
        ``prod_meta["dataset_name"]``.
    prod_dataset_list : list of pd.DataFrame
        Production datasets, paired positionally with ``prod_dataset_names``.

    Returns
    -------
    pd.DataFrame
        The adjusted copy; the ``prod_meta`` argument itself is not modified.
    """
    prod_meta = prod_meta.copy(deep=True)

    for ds_name, df in zip(prod_dataset_names, prod_dataset_list):
        ds_profile = profile.get(ds_name, {})
        for col in df.columns:
            mask = (prod_meta["dataset_name"] == ds_name) & (prod_meta["column_name"] == col)
            if not mask.any():
                continue

            info = ds_profile.get(col)
            series = df[col].dropna()
            if not info or len(series) == 0:
                continue

            kind = info.get("type")
            if kind == "numeric":
                # Count production values outside the reference bounds.
                lb, ub = info.get("lower", np.nan), info.get("upper", np.nan)
                outliers = None
                if pd.notna(lb) and pd.notna(ub):
                    try:
                        outliers = int(((series < lb) | (series > ub)).sum())
                    except (TypeError, ValueError):
                        # Production values are not comparable with numeric
                        # bounds (e.g. type drift to strings); fall through
                        # to the existing value instead of reporting a
                        # misleading zero.
                        outliers = None
                if outliers is None:
                    # No usable bounds or comparison impossible: keep the
                    # count already present in prod_meta.
                    outliers = int(_current_meta_value(prod_meta, mask, "outlier_count"))

                prod_meta.loc[mask, "outlier_count"] = outliers

            elif kind == "categorical":
                # Rare percent relative to the reference popular categories.
                popular = info.get("popular", [])
                if popular:
                    rare_percent = float((~series.astype(str).isin(popular)).mean() * 100)
                else:
                    # No popular categories were learned: keep the existing value.
                    rare_percent = float(_current_meta_value(prod_meta, mask, "rare_category_percent"))
                prod_meta.loc[mask, "rare_category_percent"] = rare_percent

    # Normalise missing-value markers: NaT is replaced with NaN.
    prod_meta.replace({pd.NaT: np.nan}, inplace=True)
    return prod_meta
|