framelint 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- framelint/__init__.py +33 -0
- framelint/api.py +73 -0
- framelint/baseline.py +206 -0
- framelint/checks.py +421 -0
- framelint/cli.py +157 -0
- framelint/config.py +198 -0
- framelint/loaders.py +69 -0
- framelint/models.py +79 -0
- framelint/py.typed +0 -0
- framelint/report.py +226 -0
- framelint-0.1.0.dist-info/METADATA +240 -0
- framelint-0.1.0.dist-info/RECORD +15 -0
- framelint-0.1.0.dist-info/WHEEL +4 -0
- framelint-0.1.0.dist-info/entry_points.txt +2 -0
- framelint-0.1.0.dist-info/licenses/LICENSE +21 -0
framelint/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""framelint: a lightweight data-quality profiler and CI gate for tabular data.
|
|
2
|
+
|
|
3
|
+
Scan a :class:`pandas.DataFrame` or a CSV/Parquet file and get a clear,
|
|
4
|
+
machine-readable data-quality report. The same report doubles as a CI gate:
|
|
5
|
+
it can fail a build (non-zero exit code) when quality drops below configurable
|
|
6
|
+
thresholds.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
>>> import framelint
|
|
10
|
+
>>> report = framelint.scan("sales.csv")
|
|
11
|
+
>>> report.summary() # pretty console table
|
|
12
|
+
>>> report.passed # bool, based on thresholds
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from framelint.api import save_baseline, scan
|
|
18
|
+
from framelint.config import ColumnRule, Config
|
|
19
|
+
from framelint.models import Finding, Severity
|
|
20
|
+
from framelint.report import Report
|
|
21
|
+
|
|
22
|
+
# Package version string; ``framelint.api.scan`` records it on every report.
__version__ = "0.1.0"

# Public API of the package: the names re-exported from the submodules
# imported above, plus the version string.
__all__ = [
    "ColumnRule",
    "Config",
    "Finding",
    "Report",
    "Severity",
    "__version__",
    "save_baseline",
    "scan",
]
|
framelint/api.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""The high-level public API: :func:`scan` and :func:`save_baseline`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from framelint.baseline import BaselineLike, compute_drift
|
|
9
|
+
from framelint.baseline import save_baseline as _save_baseline
|
|
10
|
+
from framelint.checks import run_all_checks
|
|
11
|
+
from framelint.config import Config
|
|
12
|
+
from framelint.loaders import DataSource, load_data
|
|
13
|
+
from framelint.report import Report
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def scan(
    source: DataSource,
    *,
    config: Config | dict[str, Any] | None = None,
    baseline: BaselineLike | None = None,
) -> Report:
    """Run every data-quality check on ``source`` and build a :class:`Report`.

    Args:
        source: The data to inspect, either a :class:`pandas.DataFrame` or a
            path to a CSV/Parquet file.
        config: Settings for the run: a ready :class:`Config`, a dict of
            overrides, or ``None`` to use the defaults.
        baseline: Optional reference baseline (path or dict). When given,
            schema-drift findings are added to the regular check findings.

    Returns:
        A :class:`Report` holding the dataset dimensions, the findings ordered
        from most to least severe, and the configured failure threshold.
    """
    # Imported lazily: the package ``__init__`` imports this module before it
    # defines ``__version__``, so a module-level import would be circular.
    from framelint import __version__

    settings = _coerce_config(config)
    frame = load_data(source)

    issues = run_all_checks(frame, settings)
    drift = [] if baseline is None else compute_drift(frame, baseline, settings)
    issues.extend(drift)
    # Highest severity rank first.
    issues.sort(key=lambda issue: issue.severity.rank, reverse=True)

    column_names = [str(name) for name in frame.columns]
    return Report(
        n_rows=len(frame),
        n_columns=frame.shape[1],
        columns=column_names,
        findings=issues,
        fail_on=settings.fail_on,
        framelint_version=__version__,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def save_baseline(source: DataSource, path: str | Path) -> Path:
    """Load ``source`` and persist its baseline schema at ``path``.

    Args:
        source: The reference data, either a :class:`pandas.DataFrame` or a
            path to a CSV/Parquet file.
        path: Where the baseline JSON should be written.

    Returns:
        The path the baseline was written to.
    """
    frame = load_data(source)
    written_to = _save_baseline(frame, path)
    return written_to
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _coerce_config(config: Config | dict[str, Any] | None) -> Config:
    """Normalize the public ``config`` argument into a :class:`Config`.

    A :class:`Config` is returned unchanged, ``None`` yields a default
    :class:`Config`, and anything else is passed to :meth:`Config.from_dict`.
    """
    if isinstance(config, Config):
        return config
    return Config() if config is None else Config.from_dict(config)
|
framelint/baseline.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Baseline schema capture and schema-drift detection.
|
|
2
|
+
|
|
3
|
+
A baseline records a dataset's structure and a few key per-column statistics to
|
|
4
|
+
a JSON file. A later dataset can then be compared against it to surface added or
|
|
5
|
+
removed columns, dtype changes, and large distribution shifts.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any, Union
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
from framelint.config import Config
|
|
17
|
+
from framelint.models import Finding, Severity
|
|
18
|
+
|
|
19
|
+
BASELINE_VERSION = 1
|
|
20
|
+
|
|
21
|
+
BaselineLike = Union[str, Path, dict[str, Any]]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_baseline(df: pd.DataFrame) -> dict[str, Any]:
    """Summarize a DataFrame as a JSON-serializable baseline schema.

    Args:
        df: The reference dataset.

    Returns:
        A dict with the baseline format version, the row count, and one entry
        per column (keyed by the column name as a string) giving its dtype,
        null rate and distinct-value count. Non-boolean numeric columns with at
        least one non-null value also get mean/std/min/max under ``"numeric"``.
    """
    total_rows = len(df)
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype

    schema: dict[str, Any] = {}
    for label in df.columns:
        column = df[label]
        missing = int(column.isna().sum())
        profile: dict[str, Any] = {
            "dtype": str(column.dtype),
            # An empty frame has no meaningful rate; report 0.0 instead of dividing by zero.
            "null_rate": round(missing / total_rows, 6) if total_rows else 0.0,
            "n_unique": int(column.nunique(dropna=True)),
        }
        schema[str(label)] = profile

        # Booleans count as numeric in pandas but are not profiled as numbers here.
        if is_bool(column) or not is_numeric(column):
            continue
        observed = column.dropna().astype(float)
        if observed.empty:
            continue
        stats = (
            ("mean", observed.mean()),
            ("std", observed.std(ddof=0)),  # population standard deviation
            ("min", observed.min()),
            ("max", observed.max()),
        )
        profile["numeric"] = {name: round(float(value), 6) for name, value in stats}

    return {
        "framelint_baseline_version": BASELINE_VERSION,
        "n_rows": total_rows,
        "columns": schema,
    }
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def save_baseline(df: pd.DataFrame, path: str | Path) -> Path:
    """Serialize the baseline schema of ``df`` to a JSON file.

    Missing parent directories of ``path`` are created first.

    Args:
        df: The reference dataset.
        path: Destination file path.

    Returns:
        The path written to.
    """
    snapshot = build_baseline(df)
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(snapshot, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    return destination
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def load_baseline(baseline: BaselineLike) -> dict[str, Any]:
    """Load a baseline from a path or accept an already-loaded dict.

    Args:
        baseline: A path to a baseline JSON file, or a baseline dict.

    Returns:
        The baseline dict. A dict argument is returned as-is (not copied).

    Raises:
        FileNotFoundError: If a path is given but does not point to a file.
        ValueError: If the content is not a valid framelint baseline: the file
            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``),
            the top-level value is not an object, a required key is missing,
            or ``"columns"`` is not a mapping.
    """
    if isinstance(baseline, dict):
        data: Any = baseline
    else:
        path = Path(baseline)
        if not path.is_file():
            raise FileNotFoundError(f"baseline file not found: {path}")
        data = json.loads(path.read_text(encoding="utf-8"))

    # A JSON document may be any value (null, number, list, ...). Reject
    # non-objects explicitly so callers get the documented ValueError rather
    # than a TypeError from the membership test below, and so a list that
    # merely contains the key names is not mistaken for a baseline.
    if not isinstance(data, dict):
        raise ValueError(
            f"invalid baseline: expected a JSON object, got {type(data).__name__}"
        )
    if "columns" not in data or "framelint_baseline_version" not in data:
        raise ValueError("invalid baseline: missing required keys")
    if not isinstance(data["columns"], dict):
        raise ValueError("invalid baseline: 'columns' must map column names to entries")
    return data
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def compute_drift(df: pd.DataFrame, baseline: BaselineLike, config: Config) -> list[Finding]:
    """Compare a dataset against a baseline and report schema drift.

    Column names are compared in their ``str()`` form, which is how baselines
    key their columns, so frames with non-string labels (for example the
    default integer labels) are matched correctly.

    Args:
        df: The new dataset to compare.
        baseline: The reference baseline (path or dict).
        config: Thresholds controlling distribution-shift sensitivity.

    Returns:
        A list of drift findings (added/removed columns, dtype changes, null-rate
        increases, and numeric mean shifts). Removed columns come first, then
        added columns, then per-column drift, each group in name order.
    """
    data = load_baseline(baseline)
    base_columns: dict[str, Any] = data["columns"]
    n_rows = len(df)

    # Names are matched as strings, but the frame must be indexed with the real
    # label: df["0"] raises KeyError when the label is the integer 0. If two
    # labels share a string form, the first one wins.
    labels: dict[str, Any] = {}
    for label in df.columns:
        labels.setdefault(str(label), label)
    current = set(labels)
    baseline_cols = set(base_columns)

    findings: list[Finding] = []
    for removed in sorted(baseline_cols - current):
        findings.append(
            Finding(
                check="drift",
                severity=Severity.ERROR,
                message=f"Column '{removed}' present in baseline is missing from the data.",
                column=removed,
                details={"kind": "removed_column"},
            )
        )
    for added in sorted(current - baseline_cols):
        findings.append(
            Finding(
                check="drift",
                severity=Severity.WARNING,
                message=f"Column '{added}' is new relative to the baseline.",
                column=added,
                details={"kind": "added_column"},
            )
        )

    for col in sorted(baseline_cols & current):
        series = df[labels[col]]
        findings.extend(_column_drift(col, series, base_columns[col], n_rows, config))
    return findings
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _column_drift(
|
|
151
|
+
col: str, series: pd.Series, base: dict[str, Any], n_rows: int, config: Config
|
|
152
|
+
) -> list[Finding]:
|
|
153
|
+
"""Report drift for a single column present in both datasets."""
|
|
154
|
+
findings: list[Finding] = []
|
|
155
|
+
current_dtype = str(series.dtype)
|
|
156
|
+
if current_dtype != base["dtype"]:
|
|
157
|
+
findings.append(
|
|
158
|
+
Finding(
|
|
159
|
+
check="drift",
|
|
160
|
+
severity=Severity.WARNING,
|
|
161
|
+
message=f"Column '{col}' dtype changed: {base['dtype']} -> {current_dtype}.",
|
|
162
|
+
column=col,
|
|
163
|
+
details={"kind": "dtype_change", "from": base["dtype"], "to": current_dtype},
|
|
164
|
+
)
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
null_rate = float(series.isna().sum()) / n_rows if n_rows else 0.0
|
|
168
|
+
increase = null_rate - float(base["null_rate"])
|
|
169
|
+
if increase > config.drift_null_rate_increase:
|
|
170
|
+
findings.append(
|
|
171
|
+
Finding(
|
|
172
|
+
check="drift",
|
|
173
|
+
severity=Severity.WARNING,
|
|
174
|
+
message=f"Column '{col}' null rate rose from {base['null_rate']:.1%} "
|
|
175
|
+
f"to {null_rate:.1%}.",
|
|
176
|
+
column=col,
|
|
177
|
+
details={
|
|
178
|
+
"kind": "null_rate_increase",
|
|
179
|
+
"from": base["null_rate"],
|
|
180
|
+
"to": round(null_rate, 6),
|
|
181
|
+
},
|
|
182
|
+
)
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
base_numeric = base.get("numeric")
|
|
186
|
+
if base_numeric and pd.api.types.is_numeric_dtype(series):
|
|
187
|
+
values = series.dropna().astype(float)
|
|
188
|
+
std = float(base_numeric["std"])
|
|
189
|
+
if not values.empty and std > 0:
|
|
190
|
+
shift = abs(float(values.mean()) - float(base_numeric["mean"])) / std
|
|
191
|
+
if shift > config.drift_mean_shift:
|
|
192
|
+
findings.append(
|
|
193
|
+
Finding(
|
|
194
|
+
check="drift",
|
|
195
|
+
severity=Severity.WARNING,
|
|
196
|
+
message=f"Column '{col}' mean shifted {shift:.1f} std from baseline.",
|
|
197
|
+
column=col,
|
|
198
|
+
details={
|
|
199
|
+
"kind": "mean_shift",
|
|
200
|
+
"shift_in_std": round(shift, 6),
|
|
201
|
+
"baseline_mean": base_numeric["mean"],
|
|
202
|
+
"current_mean": round(float(values.mean()), 6),
|
|
203
|
+
},
|
|
204
|
+
)
|
|
205
|
+
)
|
|
206
|
+
return findings
|