framelint 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- framelint-0.1.0/.gitignore +32 -0
- framelint-0.1.0/CHANGELOG.md +29 -0
- framelint-0.1.0/LICENSE +21 -0
- framelint-0.1.0/PKG-INFO +240 -0
- framelint-0.1.0/README.md +195 -0
- framelint-0.1.0/pyproject.toml +149 -0
- framelint-0.1.0/src/framelint/__init__.py +33 -0
- framelint-0.1.0/src/framelint/api.py +73 -0
- framelint-0.1.0/src/framelint/baseline.py +206 -0
- framelint-0.1.0/src/framelint/checks.py +421 -0
- framelint-0.1.0/src/framelint/cli.py +157 -0
- framelint-0.1.0/src/framelint/config.py +198 -0
- framelint-0.1.0/src/framelint/loaders.py +69 -0
- framelint-0.1.0/src/framelint/models.py +79 -0
- framelint-0.1.0/src/framelint/py.typed +0 -0
- framelint-0.1.0/src/framelint/report.py +226 -0
- framelint-0.1.0/tests/__init__.py +0 -0
- framelint-0.1.0/tests/conftest.py +63 -0
- framelint-0.1.0/tests/test_api.py +54 -0
- framelint-0.1.0/tests/test_baseline.py +87 -0
- framelint-0.1.0/tests/test_checks.py +198 -0
- framelint-0.1.0/tests/test_cli.py +104 -0
- framelint-0.1.0/tests/test_config.py +97 -0
- framelint-0.1.0/tests/test_loaders.py +65 -0
- framelint-0.1.0/tests/test_models.py +51 -0
- framelint-0.1.0/tests/test_report.py +110 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Byte-compiled / optimized / cache
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
.Python
|
|
8
|
+
build/
|
|
9
|
+
dist/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
*.egg
|
|
12
|
+
wheels/
|
|
13
|
+
|
|
14
|
+
# Virtual environments
|
|
15
|
+
.venv/
|
|
16
|
+
venv/
|
|
17
|
+
env/
|
|
18
|
+
ENV/
|
|
19
|
+
|
|
20
|
+
# Test / coverage
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
.coverage.*
|
|
24
|
+
coverage.xml
|
|
25
|
+
htmlcov/
|
|
26
|
+
.mypy_cache/
|
|
27
|
+
.ruff_cache/
|
|
28
|
+
|
|
29
|
+
# IDE / OS
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
.DS_Store
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project
|
|
5
|
+
adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [Unreleased]
|
|
8
|
+
|
|
9
|
+
## [0.1.0] - 2026-06-27
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- Initial release.
|
|
13
|
+
- Data-quality checks: missingness, duplicate rows, constant/zero-variance
|
|
14
|
+
columns, cardinality (likely-ID and high-cardinality), type consistency
|
|
15
|
+
(numbers-as-strings and mixed types), and numeric outliers (IQR / z-score).
|
|
16
|
+
- Opt-in per-column rules: email, date/datetime, numeric range, regex, and
|
|
17
|
+
allowed-value sets.
|
|
18
|
+
- Schema-drift detection against a saved baseline (added/removed columns, dtype
|
|
19
|
+
changes, null-rate jumps, and distribution shifts).
|
|
20
|
+
- `Report` with `summary()` (rich console), `to_dict`, `to_json`, `to_html`,
|
|
21
|
+
and `to_markdown`, plus a `passed` pass/fail decision.
|
|
22
|
+
- Typer-based CLI (`framelint scan`, `framelint baseline save`) with CI-friendly
|
|
23
|
+
exit codes (0 pass, 1 quality failure, 2 usage error).
|
|
24
|
+
- Configuration via defaults, `pyproject.toml`, standalone TOML, a dict/Config,
|
|
25
|
+
and CLI flags, with a documented precedence order.
|
|
26
|
+
- Full type hints and a `py.typed` marker.
|
|
27
|
+
|
|
28
|
+
[Unreleased]: https://github.com/AnoopIbrampur/framelint/compare/v0.1.0...HEAD
|
|
29
|
+
[0.1.0]: https://github.com/AnoopIbrampur/framelint/releases/tag/v0.1.0
|
framelint-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Anoop Ibrampur
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
framelint-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: framelint
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight data-quality profiler and CI gate for tabular data.
|
|
5
|
+
Project-URL: Homepage, https://github.com/AnoopIbrampur/framelint
|
|
6
|
+
Project-URL: Repository, https://github.com/AnoopIbrampur/framelint
|
|
7
|
+
Project-URL: Issues, https://github.com/AnoopIbrampur/framelint/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/AnoopIbrampur/framelint/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Anoop Ibrampur <anoopibrampur@gmail.com>
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ci,data-engineering,data-profiling,data-quality,data-validation,dataframe,pandas,schema-drift
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
24
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
25
|
+
Classifier: Topic :: Utilities
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Requires-Dist: pandas>=1.3
|
|
29
|
+
Requires-Dist: rich>=13.0
|
|
30
|
+
Requires-Dist: tomli>=2.0; python_version < '3.11'
|
|
31
|
+
Requires-Dist: typer>=0.9
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
35
|
+
Requires-Dist: pandas-stubs; extra == 'dev'
|
|
36
|
+
Requires-Dist: pre-commit>=3.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: pyarrow>=10.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
41
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
42
|
+
Provides-Extra: parquet
|
|
43
|
+
Requires-Dist: pyarrow>=10.0; extra == 'parquet'
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# framelint
|
|
47
|
+
|
|
48
|
+
[PyPI version](https://pypi.org/project/framelint/)
|
|
49
|
+
[Python versions](https://pypi.org/project/framelint/)
|
|
50
|
+
[CI](https://github.com/AnoopIbrampur/framelint/actions/workflows/ci.yml)
|
|
51
|
+
[Coverage](https://codecov.io/gh/AnoopIbrampur/framelint)
|
|
52
|
+
[License: MIT](LICENSE)
|
|
53
|
+
[Typed: PEP 561](https://peps.python.org/pep-0561/)
|
|
54
|
+
|
|
55
|
+
**A lightweight data-quality profiler and CI gate for tabular data.**
|
|
56
|
+
|
|
57
|
+
`framelint` scans a pandas DataFrame or a CSV/Parquet file and produces a clear
|
|
58
|
+
data-quality report — nulls, duplicates, constant columns, likely-ID columns,
|
|
59
|
+
type inconsistencies, numeric outliers, format violations, and schema drift.
|
|
60
|
+
|
|
61
|
+
Its standout feature: it doubles as a **CI gate**. Point it at your data, set
|
|
62
|
+
thresholds, and it exits non-zero when quality drops — so a bad dataset fails
|
|
63
|
+
the build instead of silently flowing downstream.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Why this exists
|
|
68
|
+
|
|
69
|
+
Data pipelines break quietly. A column starts arriving 40% null, an upstream job
|
|
70
|
+
starts writing numbers as strings, a join silently doubles your rows — and
|
|
71
|
+
nobody notices until a dashboard looks wrong weeks later. `framelint` turns
|
|
72
|
+
those failures into loud, early, automated signals you can drop into CI in one
|
|
73
|
+
line.
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
pip install framelint
|
|
79
|
+
# Parquet support:
|
|
80
|
+
pip install "framelint[parquet]"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Requires Python 3.9+.
|
|
84
|
+
|
|
85
|
+
## 30-second quickstart
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import framelint
|
|
89
|
+
|
|
90
|
+
report = framelint.scan("sales.csv") # or pass a DataFrame
|
|
91
|
+
report.summary() # pretty console table
|
|
92
|
+
print(report.passed) # -> True / False
|
|
93
|
+
|
|
94
|
+
report.to_json("report.json") # machine-readable
|
|
95
|
+
report.to_html("report.html") # shareable report
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Example console output:
|
|
99
|
+
|
|
100
|
+
```
|
|
101
|
+
framelint FAILED rows=1000 cols=6 errors=1 warnings=3 info=1
|
|
102
|
+
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
103
|
+
┃ Severity ┃ Check ┃ Column ┃ Message ┃
|
|
104
|
+
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
|
105
|
+
│ error │ missingness │ region │ Column 'region' is 62.0% null. │
|
|
106
|
+
│ warning │ duplicates │ — │ Found 12 duplicate rows (full-row). │
|
|
107
|
+
│ warning │ type_consistency │ price │ Column 'price' holds numbers as ... │
|
|
108
|
+
│ warning │ outliers │ amount │ Column 'amount' has 18 outliers ... │
|
|
109
|
+
│ info │ cardinality │ id │ Column 'id' looks like an identifier. │
|
|
110
|
+
└──────────┴──────────────────┴─────────┴───────────────────────────────────────┘
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Features
|
|
114
|
+
|
|
115
|
+
- **Missingness** — per-column null counts and rates, with severity thresholds.
|
|
116
|
+
- **Duplicate rows** — full-row or by a subset of key columns.
|
|
117
|
+
- **Constant / zero-variance** and all-null columns.
|
|
118
|
+
- **Cardinality** — likely-identifier and high-cardinality column detection.
|
|
119
|
+
- **Type consistency** — numbers stored as strings, mixed-type columns.
|
|
120
|
+
- **Outliers** — numeric outliers via IQR or z-score (configurable).
|
|
121
|
+
- **Format validation** (opt-in) — email, date/datetime, numeric ranges,
|
|
122
|
+
regex, and allowed-value sets, per column.
|
|
123
|
+
- **Schema drift** — save a baseline, then detect added/removed columns, dtype
|
|
124
|
+
changes, null-rate jumps, and distribution shifts.
|
|
125
|
+
- **Severity levels** — every finding is `info`, `warning`, or `error`.
|
|
126
|
+
- **Pass/fail decision** — based on configurable thresholds, for use in CI.
|
|
127
|
+
- **Outputs** — rich console, `dict`, JSON, HTML, and Markdown.
|
|
128
|
+
|
|
129
|
+
## CLI
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Scan and write reports
|
|
133
|
+
framelint scan sales.csv --html report.html --json report.json
|
|
134
|
+
|
|
135
|
+
# Fail the build if any error-level finding is present
|
|
136
|
+
framelint scan sales.csv --fail-on error
|
|
137
|
+
|
|
138
|
+
# Save a baseline, then scan a new file for drift
|
|
139
|
+
framelint baseline save sales.csv baseline.json
|
|
140
|
+
framelint scan new.csv --baseline baseline.json
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Exit codes: **0** = passed, **1** = quality failure, **2** = usage error.
|
|
144
|
+
|
|
145
|
+
### Use it in CI to gate data quality
|
|
146
|
+
|
|
147
|
+
```yaml
|
|
148
|
+
# .github/workflows/data-quality.yml
|
|
149
|
+
name: data-quality
|
|
150
|
+
on: [push, pull_request]
|
|
151
|
+
jobs:
|
|
152
|
+
check:
|
|
153
|
+
runs-on: ubuntu-latest
|
|
154
|
+
steps:
|
|
155
|
+
- uses: actions/checkout@v4
|
|
156
|
+
- uses: actions/setup-python@v5
|
|
157
|
+
with: { python-version: "3.12" }
|
|
158
|
+
- run: pip install framelint
|
|
159
|
+
- run: framelint scan data/sales.csv --fail-on error --baseline data/baseline.json
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
If quality drops below your thresholds, the step exits non-zero and the build
|
|
163
|
+
fails — no extra glue code required.
|
|
164
|
+
|
|
165
|
+
## Configuration
|
|
166
|
+
|
|
167
|
+
Thresholds and per-column rules can be set, in increasing order of precedence:
|
|
168
|
+
|
|
169
|
+
1. Built-in defaults
|
|
170
|
+
2. `[tool.framelint]` in `pyproject.toml`
|
|
171
|
+
3. A standalone TOML file (`--config rules.toml`)
|
|
172
|
+
4. A `dict` / `Config` passed to `scan(...)`
|
|
173
|
+
5. Individual CLI flags (e.g. `--fail-on`, `--outlier-method`)
|
|
174
|
+
|
|
175
|
+
```toml
|
|
176
|
+
# pyproject.toml (or a standalone --config file, same schema)
|
|
177
|
+
[tool.framelint]
|
|
178
|
+
null_rate_warning = 0.10
|
|
179
|
+
null_rate_error = 0.50
|
|
180
|
+
duplicate_rate_error = 0.05
|
|
181
|
+
outlier_method = "iqr" # or "zscore"
|
|
182
|
+
fail_on = "error"
|
|
183
|
+
|
|
184
|
+
[tool.framelint.columns.email]
|
|
185
|
+
type = "email"
|
|
186
|
+
|
|
187
|
+
[tool.framelint.columns.age]
|
|
188
|
+
min = 0
|
|
189
|
+
max = 120
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
| Key | Default | Meaning |
|
|
193
|
+
| --- | --- | --- |
|
|
194
|
+
| `null_rate_warning` / `null_rate_error` | 0.10 / 0.50 | Null-rate thresholds |
|
|
195
|
+
| `duplicate_rate_warning` / `duplicate_rate_error` | 0.0 / 0.10 | Duplicate-row thresholds |
|
|
196
|
+
| `duplicate_subset` | `null` | Key columns for duplicate detection |
|
|
197
|
+
| `id_cardinality_ratio` | 0.95 | Unique-ratio to flag a likely ID |
|
|
198
|
+
| `high_cardinality_ratio` | 0.50 | Unique-ratio to flag high cardinality |
|
|
199
|
+
| `outlier_method` | `"iqr"` | `iqr` or `zscore` |
|
|
200
|
+
| `iqr_factor` / `zscore_threshold` | 1.5 / 3.0 | Outlier sensitivity |
|
|
201
|
+
| `outlier_rate_warning` / `outlier_rate_error` | 0.01 / 0.10 | Outlier-rate thresholds |
|
|
202
|
+
| `drift_mean_shift` | 3.0 | Mean shift (in baseline std) to flag drift |
|
|
203
|
+
| `drift_null_rate_increase` | 0.10 | Null-rate jump to flag drift |
|
|
204
|
+
| `fail_on` | `"error"` | Severity at/above which `passed` is `False` |
|
|
205
|
+
|
|
206
|
+
Per-column rules (`[tool.framelint.columns.<name>]`): `type` (`email`/`date`/
|
|
207
|
+
`datetime`), `min`, `max`, `regex`, `allowed`.
|
|
208
|
+
|
|
209
|
+
## Programmatic API
|
|
210
|
+
|
|
211
|
+
```python
|
|
212
|
+
import framelint
|
|
213
|
+
|
|
214
|
+
# Baseline + drift
|
|
215
|
+
framelint.save_baseline("sales.csv", "baseline.json")
|
|
216
|
+
report = framelint.scan("new.csv", baseline="baseline.json")
|
|
217
|
+
|
|
218
|
+
# Inline configuration
|
|
219
|
+
report = framelint.scan(df, config={"fail_on": "warning", "outlier_method": "zscore"})
|
|
220
|
+
|
|
221
|
+
report.to_dict() # full machine-readable result
|
|
222
|
+
report.to_markdown() # Markdown string
|
|
223
|
+
report.counts_by_severity()
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Contributing
|
|
227
|
+
|
|
228
|
+
Contributions are welcome — see [CONTRIBUTING.md](https://github.com/AnoopIbrampur/framelint/blob/main/CONTRIBUTING.md) and the
|
|
229
|
+
[Code of Conduct](https://github.com/AnoopIbrampur/framelint/blob/main/CODE_OF_CONDUCT.md). In short:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
pip install -e ".[dev]"
|
|
233
|
+
ruff check . && ruff format --check .
|
|
234
|
+
mypy
|
|
235
|
+
pytest
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## License
|
|
239
|
+
|
|
240
|
+
[MIT](LICENSE) © Anoop Ibrampur
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# framelint
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/framelint/)
|
|
4
|
+
[Python versions](https://pypi.org/project/framelint/)
|
|
5
|
+
[CI](https://github.com/AnoopIbrampur/framelint/actions/workflows/ci.yml)
|
|
6
|
+
[Coverage](https://codecov.io/gh/AnoopIbrampur/framelint)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
[Typed: PEP 561](https://peps.python.org/pep-0561/)
|
|
9
|
+
|
|
10
|
+
**A lightweight data-quality profiler and CI gate for tabular data.**
|
|
11
|
+
|
|
12
|
+
`framelint` scans a pandas DataFrame or a CSV/Parquet file and produces a clear
|
|
13
|
+
data-quality report — nulls, duplicates, constant columns, likely-ID columns,
|
|
14
|
+
type inconsistencies, numeric outliers, format violations, and schema drift.
|
|
15
|
+
|
|
16
|
+
Its standout feature: it doubles as a **CI gate**. Point it at your data, set
|
|
17
|
+
thresholds, and it exits non-zero when quality drops — so a bad dataset fails
|
|
18
|
+
the build instead of silently flowing downstream.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Why this exists
|
|
23
|
+
|
|
24
|
+
Data pipelines break quietly. A column starts arriving 40% null, an upstream job
|
|
25
|
+
starts writing numbers as strings, a join silently doubles your rows — and
|
|
26
|
+
nobody notices until a dashboard looks wrong weeks later. `framelint` turns
|
|
27
|
+
those failures into loud, early, automated signals you can drop into CI in one
|
|
28
|
+
line.
|
|
29
|
+
|
|
30
|
+
## Install
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install framelint
|
|
34
|
+
# Parquet support:
|
|
35
|
+
pip install "framelint[parquet]"
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Requires Python 3.9+.
|
|
39
|
+
|
|
40
|
+
## 30-second quickstart
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import framelint
|
|
44
|
+
|
|
45
|
+
report = framelint.scan("sales.csv") # or pass a DataFrame
|
|
46
|
+
report.summary() # pretty console table
|
|
47
|
+
print(report.passed) # -> True / False
|
|
48
|
+
|
|
49
|
+
report.to_json("report.json") # machine-readable
|
|
50
|
+
report.to_html("report.html") # shareable report
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Example console output:
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
framelint FAILED rows=1000 cols=6 errors=1 warnings=3 info=1
|
|
57
|
+
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
|
|
58
|
+
┃ Severity ┃ Check ┃ Column ┃ Message ┃
|
|
59
|
+
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
|
|
60
|
+
│ error │ missingness │ region │ Column 'region' is 62.0% null. │
|
|
61
|
+
│ warning │ duplicates │ — │ Found 12 duplicate rows (full-row). │
|
|
62
|
+
│ warning │ type_consistency │ price │ Column 'price' holds numbers as ... │
|
|
63
|
+
│ warning │ outliers │ amount │ Column 'amount' has 18 outliers ... │
|
|
64
|
+
│ info │ cardinality │ id │ Column 'id' looks like an identifier. │
|
|
65
|
+
└──────────┴──────────────────┴─────────┴───────────────────────────────────────┘
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Features
|
|
69
|
+
|
|
70
|
+
- **Missingness** — per-column null counts and rates, with severity thresholds.
|
|
71
|
+
- **Duplicate rows** — full-row or by a subset of key columns.
|
|
72
|
+
- **Constant / zero-variance** and all-null columns.
|
|
73
|
+
- **Cardinality** — likely-identifier and high-cardinality column detection.
|
|
74
|
+
- **Type consistency** — numbers stored as strings, mixed-type columns.
|
|
75
|
+
- **Outliers** — numeric outliers via IQR or z-score (configurable).
|
|
76
|
+
- **Format validation** (opt-in) — email, date/datetime, numeric ranges,
|
|
77
|
+
regex, and allowed-value sets, per column.
|
|
78
|
+
- **Schema drift** — save a baseline, then detect added/removed columns, dtype
|
|
79
|
+
changes, null-rate jumps, and distribution shifts.
|
|
80
|
+
- **Severity levels** — every finding is `info`, `warning`, or `error`.
|
|
81
|
+
- **Pass/fail decision** — based on configurable thresholds, for use in CI.
|
|
82
|
+
- **Outputs** — rich console, `dict`, JSON, HTML, and Markdown.
|
|
83
|
+
|
|
84
|
+
## CLI
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
# Scan and write reports
|
|
88
|
+
framelint scan sales.csv --html report.html --json report.json
|
|
89
|
+
|
|
90
|
+
# Fail the build if any error-level finding is present
|
|
91
|
+
framelint scan sales.csv --fail-on error
|
|
92
|
+
|
|
93
|
+
# Save a baseline, then scan a new file for drift
|
|
94
|
+
framelint baseline save sales.csv baseline.json
|
|
95
|
+
framelint scan new.csv --baseline baseline.json
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Exit codes: **0** = passed, **1** = quality failure, **2** = usage error.
|
|
99
|
+
|
|
100
|
+
### Use it in CI to gate data quality
|
|
101
|
+
|
|
102
|
+
```yaml
|
|
103
|
+
# .github/workflows/data-quality.yml
|
|
104
|
+
name: data-quality
|
|
105
|
+
on: [push, pull_request]
|
|
106
|
+
jobs:
|
|
107
|
+
check:
|
|
108
|
+
runs-on: ubuntu-latest
|
|
109
|
+
steps:
|
|
110
|
+
- uses: actions/checkout@v4
|
|
111
|
+
- uses: actions/setup-python@v5
|
|
112
|
+
with: { python-version: "3.12" }
|
|
113
|
+
- run: pip install framelint
|
|
114
|
+
- run: framelint scan data/sales.csv --fail-on error --baseline data/baseline.json
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
If quality drops below your thresholds, the step exits non-zero and the build
|
|
118
|
+
fails — no extra glue code required.
|
|
119
|
+
|
|
120
|
+
## Configuration
|
|
121
|
+
|
|
122
|
+
Thresholds and per-column rules can be set, in increasing order of precedence:
|
|
123
|
+
|
|
124
|
+
1. Built-in defaults
|
|
125
|
+
2. `[tool.framelint]` in `pyproject.toml`
|
|
126
|
+
3. A standalone TOML file (`--config rules.toml`)
|
|
127
|
+
4. A `dict` / `Config` passed to `scan(...)`
|
|
128
|
+
5. Individual CLI flags (e.g. `--fail-on`, `--outlier-method`)
|
|
129
|
+
|
|
130
|
+
```toml
|
|
131
|
+
# pyproject.toml (or a standalone --config file, same schema)
|
|
132
|
+
[tool.framelint]
|
|
133
|
+
null_rate_warning = 0.10
|
|
134
|
+
null_rate_error = 0.50
|
|
135
|
+
duplicate_rate_error = 0.05
|
|
136
|
+
outlier_method = "iqr" # or "zscore"
|
|
137
|
+
fail_on = "error"
|
|
138
|
+
|
|
139
|
+
[tool.framelint.columns.email]
|
|
140
|
+
type = "email"
|
|
141
|
+
|
|
142
|
+
[tool.framelint.columns.age]
|
|
143
|
+
min = 0
|
|
144
|
+
max = 120
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
| Key | Default | Meaning |
|
|
148
|
+
| --- | --- | --- |
|
|
149
|
+
| `null_rate_warning` / `null_rate_error` | 0.10 / 0.50 | Null-rate thresholds |
|
|
150
|
+
| `duplicate_rate_warning` / `duplicate_rate_error` | 0.0 / 0.10 | Duplicate-row thresholds |
|
|
151
|
+
| `duplicate_subset` | `null` | Key columns for duplicate detection |
|
|
152
|
+
| `id_cardinality_ratio` | 0.95 | Unique-ratio to flag a likely ID |
|
|
153
|
+
| `high_cardinality_ratio` | 0.50 | Unique-ratio to flag high cardinality |
|
|
154
|
+
| `outlier_method` | `"iqr"` | `iqr` or `zscore` |
|
|
155
|
+
| `iqr_factor` / `zscore_threshold` | 1.5 / 3.0 | Outlier sensitivity |
|
|
156
|
+
| `outlier_rate_warning` / `outlier_rate_error` | 0.01 / 0.10 | Outlier-rate thresholds |
|
|
157
|
+
| `drift_mean_shift` | 3.0 | Mean shift (in baseline std) to flag drift |
|
|
158
|
+
| `drift_null_rate_increase` | 0.10 | Null-rate jump to flag drift |
|
|
159
|
+
| `fail_on` | `"error"` | Severity at/above which `passed` is `False` |
|
|
160
|
+
|
|
161
|
+
Per-column rules (`[tool.framelint.columns.<name>]`): `type` (`email`/`date`/
|
|
162
|
+
`datetime`), `min`, `max`, `regex`, `allowed`.
|
|
163
|
+
|
|
164
|
+
## Programmatic API
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
import framelint
|
|
168
|
+
|
|
169
|
+
# Baseline + drift
|
|
170
|
+
framelint.save_baseline("sales.csv", "baseline.json")
|
|
171
|
+
report = framelint.scan("new.csv", baseline="baseline.json")
|
|
172
|
+
|
|
173
|
+
# Inline configuration
|
|
174
|
+
report = framelint.scan(df, config={"fail_on": "warning", "outlier_method": "zscore"})
|
|
175
|
+
|
|
176
|
+
report.to_dict() # full machine-readable result
|
|
177
|
+
report.to_markdown() # Markdown string
|
|
178
|
+
report.counts_by_severity()
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Contributing
|
|
182
|
+
|
|
183
|
+
Contributions are welcome — see [CONTRIBUTING.md](https://github.com/AnoopIbrampur/framelint/blob/main/CONTRIBUTING.md) and the
|
|
184
|
+
[Code of Conduct](https://github.com/AnoopIbrampur/framelint/blob/main/CODE_OF_CONDUCT.md). In short:
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
pip install -e ".[dev]"
|
|
188
|
+
ruff check . && ruff format --check .
|
|
189
|
+
mypy
|
|
190
|
+
pytest
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
[MIT](LICENSE) © Anoop Ibrampur
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "framelint"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "A lightweight data-quality profiler and CI gate for tabular data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Anoop Ibrampur", email = "anoopibrampur@gmail.com" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"data-quality",
|
|
15
|
+
"data-validation",
|
|
16
|
+
"data-profiling",
|
|
17
|
+
"pandas",
|
|
18
|
+
"dataframe",
|
|
19
|
+
"ci",
|
|
20
|
+
"data-engineering",
|
|
21
|
+
"schema-drift",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 4 - Beta",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"License :: OSI Approved :: MIT License",
|
|
28
|
+
"Operating System :: OS Independent",
|
|
29
|
+
"Programming Language :: Python :: 3",
|
|
30
|
+
"Programming Language :: Python :: 3.9",
|
|
31
|
+
"Programming Language :: Python :: 3.10",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Programming Language :: Python :: 3.12",
|
|
34
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
35
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
36
|
+
"Topic :: Utilities",
|
|
37
|
+
"Typing :: Typed",
|
|
38
|
+
]
|
|
39
|
+
dependencies = [
|
|
40
|
+
"pandas>=1.3",
|
|
41
|
+
"rich>=13.0",
|
|
42
|
+
"typer>=0.9",
|
|
43
|
+
"tomli>=2.0; python_version < '3.11'",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.optional-dependencies]
|
|
47
|
+
parquet = ["pyarrow>=10.0"]
|
|
48
|
+
dev = [
|
|
49
|
+
"framelint[parquet]",
|
|
50
|
+
"pytest>=7.0",
|
|
51
|
+
"pytest-cov>=4.0",
|
|
52
|
+
"ruff>=0.6",
|
|
53
|
+
"mypy>=1.8",
|
|
54
|
+
"pandas-stubs",
|
|
55
|
+
"build>=1.0",
|
|
56
|
+
"twine>=5.0",
|
|
57
|
+
"pre-commit>=3.0",
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
[project.urls]
|
|
61
|
+
Homepage = "https://github.com/AnoopIbrampur/framelint"
|
|
62
|
+
Repository = "https://github.com/AnoopIbrampur/framelint"
|
|
63
|
+
Issues = "https://github.com/AnoopIbrampur/framelint/issues"
|
|
64
|
+
Changelog = "https://github.com/AnoopIbrampur/framelint/blob/main/CHANGELOG.md"
|
|
65
|
+
|
|
66
|
+
[project.scripts]
|
|
67
|
+
framelint = "framelint.cli:main"
|
|
68
|
+
|
|
69
|
+
[tool.hatch.version]
|
|
70
|
+
path = "src/framelint/__init__.py"
|
|
71
|
+
|
|
72
|
+
[tool.hatch.build.targets.wheel]
|
|
73
|
+
packages = ["src/framelint"]
|
|
74
|
+
|
|
75
|
+
[tool.hatch.build.targets.sdist]
|
|
76
|
+
include = ["src/framelint", "tests", "README.md", "LICENSE", "CHANGELOG.md"]
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
# Ruff: linter + formatter
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
[tool.ruff]
|
|
82
|
+
line-length = 100
|
|
83
|
+
target-version = "py39"
|
|
84
|
+
src = ["src", "tests"]
|
|
85
|
+
|
|
86
|
+
[tool.ruff.lint]
|
|
87
|
+
select = [
|
|
88
|
+
"E", # pycodestyle errors
|
|
89
|
+
"W", # pycodestyle warnings
|
|
90
|
+
"F", # pyflakes
|
|
91
|
+
"I", # isort
|
|
92
|
+
"N", # pep8-naming
|
|
93
|
+
"UP", # pyupgrade
|
|
94
|
+
"B", # flake8-bugbear
|
|
95
|
+
"C4", # flake8-comprehensions
|
|
96
|
+
"SIM", # flake8-simplify
|
|
97
|
+
"PTH", # flake8-use-pathlib
|
|
98
|
+
"ARG", # flake8-unused-arguments
|
|
99
|
+
"RUF", # ruff-specific
|
|
100
|
+
"D", # pydocstyle
|
|
101
|
+
]
|
|
102
|
+
ignore = [
|
|
103
|
+
"D203", # one-blank-line-before-class (conflicts with D211)
|
|
104
|
+
"D213", # multi-line-summary-second-line (conflicts with D212)
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
[tool.ruff.lint.pydocstyle]
|
|
108
|
+
convention = "google"
|
|
109
|
+
|
|
110
|
+
[tool.ruff.lint.per-file-ignores]
|
|
111
|
+
"tests/**" = ["D", "ARG"]
|
|
112
|
+
"examples/**" = ["D", "ARG"]
|
|
113
|
+
# Typer relies on function-call defaults (typer.Argument/Option) by design, and
|
|
114
|
+
# evaluates annotations at runtime, so `Optional[X]` is required for Python 3.9.
|
|
115
|
+
"src/framelint/cli.py" = ["B008", "UP007", "UP045"]
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# mypy: strict type checking
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
[tool.mypy]
|
|
121
|
+
strict = true
|
|
122
|
+
warn_unreachable = true
|
|
123
|
+
files = ["src", "tests"]
|
|
124
|
+
|
|
125
|
+
[[tool.mypy.overrides]]
|
|
126
|
+
module = ["tests.*"]
|
|
127
|
+
disallow_untyped_defs = false
|
|
128
|
+
disallow_incomplete_defs = false
|
|
129
|
+
disallow_untyped_calls = false
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# pytest + coverage
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
[tool.pytest.ini_options]
|
|
135
|
+
minversion = "7.0"
|
|
136
|
+
addopts = "-ra --cov=framelint --cov-report=term-missing --cov-report=xml"
|
|
137
|
+
testpaths = ["tests"]
|
|
138
|
+
|
|
139
|
+
[tool.coverage.run]
|
|
140
|
+
branch = true
|
|
141
|
+
source = ["framelint"]
|
|
142
|
+
|
|
143
|
+
[tool.coverage.report]
|
|
144
|
+
show_missing = true
|
|
145
|
+
exclude_lines = [
|
|
146
|
+
"pragma: no cover",
|
|
147
|
+
"if TYPE_CHECKING:",
|
|
148
|
+
"raise NotImplementedError",
|
|
149
|
+
]
|