cleanalytix 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Probot-DATA contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,235 @@
1
+ Metadata-Version: 2.4
2
+ Name: cleanalytix
3
+ Version: 0.1.0
4
+ Summary: Cleanalytix is a modular Python library for profiling, scoring, and cleaning tabular datasets.
5
+ Author: Probot-DATA contributors
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Probot-DATA contributors
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/Probot-DATA/Cleanalytix_Repo
29
+ Project-URL: Repository, https://github.com/Probot-DATA/Cleanalytix_Repo
30
+ Project-URL: Issues, https://github.com/Probot-DATA/Cleanalytix_Repo/issues
31
+ Keywords: data quality,data cleaning,data profiling,EDA,machine learning,pandas
32
+ Classifier: Development Status :: 4 - Beta
33
+ Classifier: Intended Audience :: Developers
34
+ Classifier: Intended Audience :: Science/Research
35
+ Classifier: License :: OSI Approved :: MIT License
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.9
38
+ Classifier: Programming Language :: Python :: 3.10
39
+ Classifier: Programming Language :: Python :: 3.11
40
+ Classifier: Programming Language :: Python :: 3.12
41
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
42
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
43
+ Requires-Python: >=3.9
44
+ Description-Content-Type: text/markdown
45
+ License-File: LICENSE
46
+ Requires-Dist: pandas>=1.5.0
47
+ Requires-Dist: numpy>=1.23.0
48
+ Requires-Dist: scikit-learn>=1.1.0
49
+ Requires-Dist: nltk>=3.8.0
50
+ Provides-Extra: dev
51
+ Requires-Dist: pytest>=7.0; extra == "dev"
52
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
53
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
54
+ Dynamic: license-file
55
+
56
+ # Cleanalytix
57
+
58
+ `Cleanalytix` is a Python library for profiling, scoring, cleaning, and monitoring the quality of tabular datasets with a single pipeline.
59
+
60
+ It is designed for pandas-first workflows and supports:
61
+
62
+ - baseline dataset profiling and scoring
63
+ - optional cleaning recommendations and automatic cleaning
64
+ - optional production/new-dataset monitoring
65
+ - optional business rules, thresholds, weights, and type inference for new data
66
+
67
+ ## Installation
68
+
69
+ From a source checkout:
70
+
71
+ ```bash
72
+ git clone https://github.com/Probot-DATA/Cleanalytix_Repo
73
+ cd Cleanalytix_Repo
74
+ pip install -e ".[dev]"
75
+ ```
76
+
77
+ To install the released package from PyPI:
78
+
79
+ ```bash
80
+ pip install cleanalytix
81
+ ```
82
+
83
+ Runtime requirements:
84
+
85
+ - Python 3.9+
86
+ - pandas
87
+ - numpy
88
+ - scikit-learn
89
+ - nltk
90
+
91
+ ## Quick Start
92
+
93
+ ```python
94
+ import pandas as pd
95
+ from cleanalytix import Run_DQ_Pipeline
96
+
97
+ df = pd.read_csv("my_data.csv")
98
+
99
+ result = Run_DQ_Pipeline(
100
+ dataset_names=["my_dataset"],
101
+ dataset_list=[df],
102
+ )
103
+
104
+ print(result["base_data"]["dirty_scores"])
105
+ print(result["base_data"]["meta_before_cleaning"])
106
+ print(result["base_data"]["recommendations"])
107
+ ```
108
+
109
+ ## Production / Monitoring Example
110
+
111
+ ```python
112
+ import pandas as pd
113
+ from cleanalytix import Run_DQ_Pipeline
114
+
115
+ train_df = pd.read_csv("train.csv")
116
+ prod_df = pd.read_csv("production.csv")
117
+
118
+ rules = {
119
+ "age": lambda value: pd.isna(value) or 0 <= float(value) <= 120,
120
+ }
121
+
122
+ result = Run_DQ_Pipeline(
123
+ dataset_names=["customers"],
124
+ dataset_list=[train_df],
125
+ new_dataset_list=[prod_df],
126
+ rules=rules,
127
+ cleaning=True,
128
+ interactive=False,
129
+ score_mode="exponential",
130
+ )
131
+
132
+ print(result["base_data"]["dirty_scores"])
133
+ print(result["base_data"]["cleaned_scores"])
134
+ print(result["prod_data"]["dirty_scores"])
135
+ print(result["prod_data"]["change_log"])
136
+ ```
137
+
138
+ ## Public API
139
+
140
+ The primary entrypoint is:
141
+
142
+ ```python
143
+ from cleanalytix import Run_DQ_Pipeline
144
+ ```
145
+
146
+ Additional building blocks are also exported:
147
+
148
+ ```python
149
+ from cleanalytix import (
150
+ Compute_DQ_Score,
151
+ DEFAULT_THRESHOLDS,
152
+ generate_meta,
153
+ cleaning_recommendations,
154
+ get_cleaned_data,
155
+ get_table_for_DQ_computation,
156
+ summarize_dataset_health,
157
+ learn_reference_profile,
158
+ adjust_prod_meta_with_reference,
159
+ infer_and_fix_types,
160
+ )
161
+ ```
162
+
163
+ ## Pipeline Output Structure
164
+
165
+ `Run_DQ_Pipeline` returns:
166
+
167
+ ```python
168
+ {
169
+ "base_data": {...},
170
+ "prod_data": {...},
171
+ }
172
+ ```
173
+
174
+ Each block preserves the same keys:
175
+
176
+ - `dirty_scores`
177
+ - `cleaned_scores`
178
+ - `cleaned_datasets`
179
+ - `meta_before_cleaning`
180
+ - `meta_after_cleaning`
181
+ - `recommendations`
182
+ - `change_log`
183
+ - `summarized_before`
184
+ - `summarized_after`
185
+ - `main_metrics_before`
186
+ - `main_metrics_after`
187
+
188
+ ## Examples
189
+
190
+ Runnable examples live in [examples](./examples):
191
+
192
+ ```bash
193
+ python examples/simple_usage.py
194
+ python examples/production_usage.py
195
+ ```
196
+
197
+ These examples assume the package has already been installed in the active environment.
198
+
199
+ ## Validation
200
+
201
+ The [validation](./validation) folder contains a portable real-world validation workflow.
202
+
203
+ - Large raw datasets are intentionally not committed to the repository.
204
+ - Put the expected files under `validation/datasets/` by following
205
+ [validation/datasets/README.md](./validation/datasets/README.md).
206
+ - Run the validation script:
207
+
208
+ ```bash
209
+ python validation/run_validation.py
210
+ ```
211
+
212
+ - The script saves non-empty outputs to `validation/outputs/<dataset_name>/`.
213
+ - The notebook [validation/main.ipynb](./validation/main.ipynb) uses the same relative-path workflow.
214
+
215
+ ## Repository Layout
216
+
217
+ - `cleanalytix/` - installable library package
218
+ - `examples/` - small runnable examples
219
+ - `tests/` - smoke tests and lightweight sample fixtures
220
+ - `validation/` - public-friendly validation workflow and output folder
221
+ - `archive/legacy/` - historical prototype notebook/code kept for reference, not for active use
222
+
223
+ ## Known Limitations
224
+
225
+ - Validation datasets are not bundled with the repository.
226
+ - The yellow taxi validation workflow samples the first `20,000` rows from each configured monthly file to match the original project workflow and to keep validation practical.
227
+ - Interactive cleaning is intended for notebook/CLI use and will prompt for input when `interactive=True`.
228
+
229
+ ## Contributing
230
+
231
+ See [CONTRIBUTING.md](./CONTRIBUTING.md).
232
+
233
+ ## License
234
+
235
+ [MIT](./LICENSE) (c) 2026 Probot-DATA contributors
@@ -0,0 +1,180 @@
1
+ # Cleanalytix
2
+
3
+ `Cleanalytix` is a Python library for profiling, scoring, cleaning, and monitoring the quality of tabular datasets with a single pipeline.
4
+
5
+ It is designed for pandas-first workflows and supports:
6
+
7
+ - baseline dataset profiling and scoring
8
+ - optional cleaning recommendations and automatic cleaning
9
+ - optional production/new-dataset monitoring
10
+ - optional business rules, thresholds, weights, and type inference for new data
11
+
12
+ ## Installation
13
+
14
+ From a source checkout:
15
+
16
+ ```bash
17
+ git clone https://github.com/Probot-DATA/Cleanalytix_Repo
18
+ cd Cleanalytix_Repo
19
+ pip install -e ".[dev]"
20
+ ```
21
+
22
+ To install the released package from PyPI:
23
+
24
+ ```bash
25
+ pip install cleanalytix
26
+ ```
27
+
28
+ Runtime requirements:
29
+
30
+ - Python 3.9+
31
+ - pandas
32
+ - numpy
33
+ - scikit-learn
34
+ - nltk
35
+
36
+ ## Quick Start
37
+
38
+ ```python
39
+ import pandas as pd
40
+ from cleanalytix import Run_DQ_Pipeline
41
+
42
+ df = pd.read_csv("my_data.csv")
43
+
44
+ result = Run_DQ_Pipeline(
45
+ dataset_names=["my_dataset"],
46
+ dataset_list=[df],
47
+ )
48
+
49
+ print(result["base_data"]["dirty_scores"])
50
+ print(result["base_data"]["meta_before_cleaning"])
51
+ print(result["base_data"]["recommendations"])
52
+ ```
53
+
54
+ ## Production / Monitoring Example
55
+
56
+ ```python
57
+ import pandas as pd
58
+ from cleanalytix import Run_DQ_Pipeline
59
+
60
+ train_df = pd.read_csv("train.csv")
61
+ prod_df = pd.read_csv("production.csv")
62
+
63
+ rules = {
64
+ "age": lambda value: pd.isna(value) or 0 <= float(value) <= 120,
65
+ }
66
+
67
+ result = Run_DQ_Pipeline(
68
+ dataset_names=["customers"],
69
+ dataset_list=[train_df],
70
+ new_dataset_list=[prod_df],
71
+ rules=rules,
72
+ cleaning=True,
73
+ interactive=False,
74
+ score_mode="exponential",
75
+ )
76
+
77
+ print(result["base_data"]["dirty_scores"])
78
+ print(result["base_data"]["cleaned_scores"])
79
+ print(result["prod_data"]["dirty_scores"])
80
+ print(result["prod_data"]["change_log"])
81
+ ```
82
+
83
+ ## Public API
84
+
85
+ The primary entrypoint is:
86
+
87
+ ```python
88
+ from cleanalytix import Run_DQ_Pipeline
89
+ ```
90
+
91
+ Additional building blocks are also exported:
92
+
93
+ ```python
94
+ from cleanalytix import (
95
+ Compute_DQ_Score,
96
+ DEFAULT_THRESHOLDS,
97
+ generate_meta,
98
+ cleaning_recommendations,
99
+ get_cleaned_data,
100
+ get_table_for_DQ_computation,
101
+ summarize_dataset_health,
102
+ learn_reference_profile,
103
+ adjust_prod_meta_with_reference,
104
+ infer_and_fix_types,
105
+ )
106
+ ```
107
+
108
+ ## Pipeline Output Structure
109
+
110
+ `Run_DQ_Pipeline` returns:
111
+
112
+ ```python
113
+ {
114
+ "base_data": {...},
115
+ "prod_data": {...},
116
+ }
117
+ ```
118
+
119
+ Each block preserves the same keys:
120
+
121
+ - `dirty_scores`
122
+ - `cleaned_scores`
123
+ - `cleaned_datasets`
124
+ - `meta_before_cleaning`
125
+ - `meta_after_cleaning`
126
+ - `recommendations`
127
+ - `change_log`
128
+ - `summarized_before`
129
+ - `summarized_after`
130
+ - `main_metrics_before`
131
+ - `main_metrics_after`
132
+
133
+ ## Examples
134
+
135
+ Runnable examples live in [examples](./examples):
136
+
137
+ ```bash
138
+ python examples/simple_usage.py
139
+ python examples/production_usage.py
140
+ ```
141
+
142
+ These examples assume the package has already been installed in the active environment.
143
+
144
+ ## Validation
145
+
146
+ The [validation](./validation) folder contains a portable real-world validation workflow.
147
+
148
+ - Large raw datasets are intentionally not committed to the repository.
149
+ - Put the expected files under `validation/datasets/` by following
150
+ [validation/datasets/README.md](./validation/datasets/README.md).
151
+ - Run the validation script:
152
+
153
+ ```bash
154
+ python validation/run_validation.py
155
+ ```
156
+
157
+ - The script saves non-empty outputs to `validation/outputs/<dataset_name>/`.
158
+ - The notebook [validation/main.ipynb](./validation/main.ipynb) uses the same relative-path workflow.
159
+
160
+ ## Repository Layout
161
+
162
+ - `cleanalytix/` - installable library package
163
+ - `examples/` - small runnable examples
164
+ - `tests/` - smoke tests and lightweight sample fixtures
165
+ - `validation/` - public-friendly validation workflow and output folder
166
+ - `archive/legacy/` - historical prototype notebook/code kept for reference, not for active use
167
+
168
+ ## Known Limitations
169
+
170
+ - Validation datasets are not bundled with the repository.
171
+ - The yellow taxi validation workflow samples the first `20,000` rows from each configured monthly file to match the original project workflow and to keep validation practical.
172
+ - Interactive cleaning is intended for notebook/CLI use and will prompt for input when `interactive=True`.
173
+
174
+ ## Contributing
175
+
176
+ See [CONTRIBUTING.md](./CONTRIBUTING.md).
177
+
178
+ ## License
179
+
180
+ [MIT](./LICENSE) (c) 2026 Probot-DATA contributors
@@ -0,0 +1,39 @@
1
+ """
2
+ Cleanalytix - Data Quality Framework.
3
+
4
+ Quick start
5
+ -----------
6
+ >>> from cleanalytix import Run_DQ_Pipeline
7
+ >>> result = Run_DQ_Pipeline(["my_dataset"], [df])
8
+ >>> print(result["base_data"]["dirty_scores"])
9
+ """
10
+
11
+ from .version import __author__, __version__
12
+ from .pipeline import Run_DQ_Pipeline
13
+ from .compute_dq_score import Compute_DQ_Score, DEFAULT_THRESHOLDS
14
+ from .generate_meta import generate_meta
15
+ from .cleaning_recommendations import cleaning_recommendations
16
+ from .get_cleaned_data import get_cleaned_data
17
+ from .get_table_for_DQ_computation import get_table_for_DQ_computation
18
+ from .summarize_dataset_health import summarize_dataset_health
19
+ from .adjust_prod_meta import learn_reference_profile, adjust_prod_meta_with_reference
20
+ from .preprocess_types import infer_and_fix_types
21
+
22
# Snake-case alias bound to the same callable as ``Run_DQ_Pipeline``.
run_dq_pipeline = Run_DQ_Pipeline

# Public names of the package; this list is what ``from cleanalytix import *``
# exposes.
__all__ = [
    "Run_DQ_Pipeline",
    "run_dq_pipeline",
    "Compute_DQ_Score",
    "DEFAULT_THRESHOLDS",
    "__author__",
    "__version__",
    "generate_meta",
    "cleaning_recommendations",
    "get_cleaned_data",
    "get_table_for_DQ_computation",
    "summarize_dataset_health",
    "learn_reference_profile",
    "adjust_prod_meta_with_reference",
    "infer_and_fix_types",
]
@@ -0,0 +1,135 @@
1
+ # cleanalytix/adjust_prod_meta.py
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ EPS = 1e-9
6
+
7
def _numeric_bounds(values, skew):
    """Return ``(lower, upper, method)`` outlier bounds for a non-empty float series.

    The rule is picked from the absolute skewness: mean +/- 3 sigma below 0.5,
    the 1.5 * IQR fences below 1, and median +/- 3.5 * MAD / 0.6745 otherwise.
    Bounds are ``(-inf, inf)`` when the spread statistic is zero or NaN.
    """
    abs_skew = abs(skew)

    if abs_skew < 0.5:
        center = values.mean()
        spread = values.std(ddof=0) if values.size > 1 else 0.0
        if spread == 0 or np.isnan(spread):
            return -np.inf, np.inf, "z"
        return center - 3.0 * spread, center + 3.0 * spread, "z"

    if abs_skew < 1:
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        if iqr == 0 or np.isnan(iqr):
            return -np.inf, np.inf, "iqr"
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr, "iqr"

    # Reached for |skew| >= 1 and also for a NaN skewness, because both
    # comparisons above evaluate to False for NaN.
    median = values.median()
    mad = np.median(np.abs(values - median))
    if mad == 0 or np.isnan(mad):
        return -np.inf, np.inf, "mad"
    delta = (3.5 / 0.6745) * mad
    return median - delta, median + delta, "mad"


def learn_reference_profile(dataset_names, dataset_list, rare_pct_threshold=5.0):
    """
    Learn per-column numeric bounds and popular categories from reference datasets.

    Parameters
    ----------
    dataset_names : list of str
        Names used as the top-level keys of the returned profile.
    dataset_list : list of pd.DataFrame
        Reference datasets, paired positionally with ``dataset_names``
        (pairing stops at the shorter of the two lists).
    rare_pct_threshold : float, default 5.0
        A category is "popular" when its share of the non-null values,
        in percent, is strictly greater than this threshold.

    Returns
    -------
    dict
        ``profile[dataset_name][column]`` is one of:

        - ``{"type": "empty"}`` when the column has no non-null values;
        - ``{"type": "numeric", "lower", "upper", "method", "skewness"}``
          where ``method`` is ``"z"``, ``"iqr"`` or ``"mad"`` and non-finite
          bounds are stored as NaN;
        - ``{"type": "categorical", "popular": [...]}`` with the popular
          values rendered as strings.
    """
    profile = {}
    for ds_name, df in zip(dataset_names, dataset_list):
        columns = {}
        profile[ds_name] = columns
        for col in df.columns:
            ser = df[col].dropna()
            if ser.empty:
                columns[col] = {"type": "empty"}
                continue

            if pd.api.types.is_numeric_dtype(ser):
                # Boolean columns count as numeric; cast to float so that
                # quantile/skew/median never operate on bool dtype (quantile
                # interpolation is not defined for booleans).
                ser = ser.astype(float)
                skew = ser.skew() if len(ser) > 2 else 0.0
                lower, upper, method = _numeric_bounds(ser, skew)
                columns[col] = {
                    "type": "numeric",
                    "lower": float(lower) if np.isfinite(lower) else np.nan,
                    "upper": float(upper) if np.isfinite(upper) else np.nan,
                    "method": method,
                    "skewness": float(skew),
                }
            else:
                share = ser.astype(str).value_counts(normalize=True) * 100
                columns[col] = {
                    "type": "categorical",
                    "popular": share[share > rare_pct_threshold].index.tolist(),
                }
    return profile
67
+
68
+
69
def adjust_prod_meta_with_reference(profile, reference_meta, prod_meta, prod_dataset_names, prod_dataset_list):
    """
    Adjust production meta statistics using the learned reference profile.

    Returns a deep copy of ``prod_meta`` with two fields overwritten:

    - ``outlier_count``: recomputed using the numeric bounds (mean±3σ / IQR / MAD)
      derived from the reference (training) dataset, so production outliers are
      judged against the training distribution rather than their own. When the
      reference bounds are missing (NaN), the existing value is kept.
    - ``rare_category_percent``: recomputed as the percentage of non-null
      production values that do not appear in the training dataset's popular
      category list. When that list is empty, the existing value is kept.

    Columns with no profile entry, with an ``"empty"`` profile, or with no
    non-null production values are left untouched. The input ``prod_meta``
    is never modified.

    Parameters
    ----------
    profile : dict
        Output of ``learn_reference_profile``.
    reference_meta : pd.DataFrame
        Meta table for the reference (training) split — accepted but not
        currently consumed; reserved for future drift-metric alignment.
    prod_meta : pd.DataFrame
        Meta table for the production dataset (output of ``generate_meta``).
        Rows are located through its ``dataset_name`` and ``column_name``
        columns.
    prod_dataset_names : list of str
    prod_dataset_list : list of pd.DataFrame

    Returns
    -------
    pd.DataFrame
        The adjusted copy of ``prod_meta``.
    """
    prod_meta = prod_meta.copy(deep=True)

    for ds_name, df in zip(prod_dataset_names, prod_dataset_list):
        # Both are invariant across the columns of one dataset.
        ds_profile = profile.get(ds_name, {})
        in_dataset = prod_meta["dataset_name"] == ds_name

        for col in df.columns:
            mask = in_dataset & (prod_meta["column_name"] == col)
            if not mask.any():
                continue

            info = ds_profile.get(col)
            series = df[col].dropna()
            if not info or len(series) == 0:
                continue

            kind = info.get("type")

            if kind == "numeric":
                lb, ub = info.get("lower", np.nan), info.get("upper", np.nan)
                if pd.notna(lb) and pd.notna(ub):
                    try:
                        outliers = int(((series < lb) | (series > ub)).sum())
                    except (TypeError, ValueError):
                        # The production column is not comparable with the
                        # numeric reference bounds (e.g. it arrived as text);
                        # best effort: report no outliers rather than fail.
                        outliers = 0
                else:
                    # No usable reference bounds: keep the value already in
                    # prod_meta (treating a missing value as 0).
                    outliers = int(prod_meta.loc[mask, "outlier_count"].fillna(0).values[0])

                prod_meta.loc[mask, "outlier_count"] = int(outliers)

            elif kind == "categorical":
                popular = info.get("popular", [])
                if len(popular) == 0:
                    # Nothing was popular in training: keep the existing
                    # value (treating a missing value as 0).
                    rare_percent = float(
                        prod_meta.loc[mask, "rare_category_percent"].fillna(0).values[0]
                    )
                else:
                    # Compare as strings, matching how the popular list was built.
                    vals = series.astype(str)
                    rare_percent = float((~vals.isin(popular)).mean() * 100)
                prod_meta.loc[mask, "rare_category_percent"] = rare_percent

    # Normalise any NaT placeholders to NaN.
    prod_meta.replace({pd.NaT: np.nan}, inplace=True)
    return prod_meta