framelint 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
framelint/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ """framelint: a lightweight data-quality profiler and CI gate for tabular data.
2
+
3
+ Scan a :class:`pandas.DataFrame` or a CSV/Parquet file and get a clear,
4
+ machine-readable data-quality report. The same report doubles as a CI gate:
5
+ it can fail a build (non-zero exit code) when quality drops below configurable
6
+ thresholds.
7
+
8
+ Example:
9
+ >>> import framelint
10
+ >>> report = framelint.scan("sales.csv")
11
+ >>> report.summary() # pretty console table
12
+ >>> report.passed # bool, based on thresholds
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from framelint.api import save_baseline, scan
18
+ from framelint.config import ColumnRule, Config
19
+ from framelint.models import Finding, Severity
20
+ from framelint.report import Report
21
+
22
# Package version string. ``framelint.api.scan`` imports this value and passes
# it to the Report it builds as ``framelint_version``.
__version__ = "0.1.0"

# Public API of the package: every name imported above plus ``__version__``.
__all__ = [
    "ColumnRule",
    "Config",
    "Finding",
    "Report",
    "Severity",
    "__version__",
    "save_baseline",
    "scan",
]
framelint/api.py ADDED
@@ -0,0 +1,73 @@
1
+ """The high-level public API: :func:`scan` and :func:`save_baseline`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ from framelint.baseline import BaselineLike, compute_drift
9
+ from framelint.baseline import save_baseline as _save_baseline
10
+ from framelint.checks import run_all_checks
11
+ from framelint.config import Config
12
+ from framelint.loaders import DataSource, load_data
13
+ from framelint.report import Report
14
+
15
+
16
def scan(
    source: DataSource,
    *,
    config: Config | dict[str, Any] | None = None,
    baseline: BaselineLike | None = None,
) -> Report:
    """Run every data-quality check on ``source`` and package the result.

    Args:
        source: A :class:`pandas.DataFrame`, or a path to a CSV/Parquet file.
        config: A ready :class:`Config`, a dict of overrides, or ``None`` to
            use the defaults.
        baseline: Optional baseline (path or dict). When given, schema-drift
            findings relative to it are appended to the regular findings.

    Returns:
        A :class:`Report` holding the dataset shape, the findings ordered from
        highest to lowest severity rank, and the configured ``fail_on`` level.
    """
    # Imported lazily: the package ``__init__`` imports this module, so a
    # top-level import of ``framelint`` here would be circular.
    from framelint import __version__ as package_version

    settings = _coerce_config(config)
    frame = load_data(source)

    issues = run_all_checks(frame, settings)
    if baseline is not None:
        issues += compute_drift(frame, baseline, settings)

    # Stable in-place sort: equal-severity findings keep their original order.
    issues.sort(key=lambda item: item.severity.rank, reverse=True)

    column_names = [str(label) for label in frame.columns]
    return Report(
        n_rows=len(frame),
        n_columns=frame.shape[1],
        columns=column_names,
        findings=issues,
        fail_on=settings.fail_on,
        framelint_version=package_version,
    )
51
+
52
+
53
def save_baseline(source: DataSource, path: str | Path) -> Path:
    """Load ``source`` and persist its baseline schema as JSON at ``path``.

    Args:
        source: A :class:`pandas.DataFrame`, or a path to a CSV/Parquet file.
        path: Where the baseline JSON should be written.

    Returns:
        The path the baseline was written to.
    """
    return _save_baseline(load_data(source), path)
65
+
66
+
67
def _coerce_config(config: Config | dict[str, Any] | None) -> Config:
    """Normalize the public ``config`` argument to a :class:`Config` instance.

    A :class:`Config` is returned unchanged, ``None`` yields a default
    :class:`Config`, and anything else is handed to :meth:`Config.from_dict`.
    """
    if isinstance(config, Config):
        return config
    return Config() if config is None else Config.from_dict(config)
framelint/baseline.py ADDED
@@ -0,0 +1,206 @@
1
+ """Baseline schema capture and schema-drift detection.
2
+
3
+ A baseline records a dataset's structure and a few key per-column statistics to
4
+ a JSON file. A later dataset can then be compared against it to surface added or
5
+ removed columns, dtype changes, and large distribution shifts.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from pathlib import Path
12
+ from typing import Any, Union
13
+
14
+ import pandas as pd
15
+
16
+ from framelint.config import Config
17
+ from framelint.models import Finding, Severity
18
+
19
# Format version written into every baseline under the
# "framelint_baseline_version" key; load_baseline() requires that key to exist.
BASELINE_VERSION = 1

# What callers may pass as a baseline: a filesystem path (str or Path) to a
# baseline JSON file, or an already-loaded baseline dict.
BaselineLike = Union[str, Path, dict[str, Any]]
22
+
23
+
24
def build_baseline(df: pd.DataFrame) -> dict[str, Any]:
    """Capture a JSON-serializable baseline schema from a DataFrame.

    Columns are keyed by ``str(label)``. If several columns share the same
    string name (duplicate labels, or labels such as ``1`` and ``"1"``), the
    last one encountered wins instead of raising.

    Args:
        df: The reference dataset.

    Returns:
        A dict with the baseline format version, the row count and, per
        column, its dtype, null rate and distinct count. Numeric (non-bool)
        columns with at least one non-null value also get a ``"numeric"``
        entry holding mean, population std (``ddof=0``), min and max. All
        floats are rounded to 6 decimals.
    """
    n_rows = len(df)
    columns: dict[str, Any] = {}
    # Iterate with ``items()`` rather than ``df[col]``: label-based indexing
    # returns a DataFrame (not a Series) when a label is duplicated, which
    # would make the per-column statistics below fail.
    for label, series in df.items():
        null_count = int(series.isna().sum())
        entry: dict[str, Any] = {
            "dtype": str(series.dtype),
            # Guard the division so an empty frame yields a 0.0 null rate.
            "null_rate": round(null_count / n_rows, 6) if n_rows else 0.0,
            "n_unique": int(series.nunique(dropna=True)),
        }
        # Booleans count as numeric in pandas but summary stats on them are
        # not meaningful for drift detection, so they are excluded.
        if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
            values = series.dropna().astype(float)
            if not values.empty:
                entry["numeric"] = {
                    "mean": round(float(values.mean()), 6),
                    "std": round(float(values.std(ddof=0)), 6),
                    "min": round(float(values.min()), 6),
                    "max": round(float(values.max()), 6),
                }
        columns[str(label)] = entry
    return {
        "framelint_baseline_version": BASELINE_VERSION,
        "n_rows": n_rows,
        "columns": columns,
    }
59
+
60
+
61
def save_baseline(df: pd.DataFrame, path: str | Path) -> Path:
    """Serialize the baseline schema of ``df`` to a JSON file.

    Missing parent directories of ``path`` are created. The file is written as
    UTF-8 with a two-space indent.

    Args:
        df: The reference dataset.
        path: Destination file path.

    Returns:
        The destination as a :class:`pathlib.Path`.
    """
    snapshot = build_baseline(df)
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(snapshot, indent=2)
    destination.write_text(serialized, encoding="utf-8")
    return destination
76
+
77
+
78
def load_baseline(baseline: BaselineLike) -> dict[str, Any]:
    """Load a baseline from a path or accept an already-loaded dict.

    Args:
        baseline: A path to a baseline JSON file, or a baseline dict. A dict
            is validated and returned as-is (not copied).

    Returns:
        The baseline dict.

    Raises:
        FileNotFoundError: If a path is given but is not an existing file.
        ValueError: If the content is not a valid framelint baseline: the
            file is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError``), the top-level value is not an object, a required
            key is missing, or ``"columns"`` is not a mapping.
    """
    if isinstance(baseline, dict):
        data = baseline
    else:
        path = Path(baseline)
        if not path.is_file():
            raise FileNotFoundError(f"baseline file not found: {path}")
        data = json.loads(path.read_text(encoding="utf-8"))
        # Valid JSON need not be an object. Without this check a list that
        # happens to contain the key names would pass the membership test
        # below, and a scalar would raise TypeError instead of ValueError.
        if not isinstance(data, dict):
            raise ValueError("invalid baseline: top-level JSON value must be an object")
    if "columns" not in data or "framelint_baseline_version" not in data:
        raise ValueError("invalid baseline: missing required keys")
    # Callers index ``data["columns"]`` by column name, so it must be a mapping.
    if not isinstance(data["columns"], dict):
        raise ValueError("invalid baseline: 'columns' must be a mapping")
    return data
101
+
102
+
103
def compute_drift(df: pd.DataFrame, baseline: BaselineLike, config: Config) -> list[Finding]:
    """Compare a dataset against a baseline and report schema drift.

    Columns are matched by the string form of their label, which is how
    baselines key them. Frames whose labels are not strings (for example
    integer labels) are therefore supported. If several columns share the
    same string name, the last one is used.

    Args:
        df: The new dataset to compare.
        baseline: The reference baseline (path or dict).
        config: Thresholds controlling distribution-shift sensitivity.

    Returns:
        A list of drift findings: removed columns (errors) first, then added
        columns (warnings), then per-column findings for shared columns, each
        group ordered by column name.

    Raises:
        FileNotFoundError: If ``baseline`` is a path that does not exist.
        ValueError: If the baseline is not a valid framelint baseline.
    """
    data = load_baseline(baseline)
    base_columns: dict[str, Any] = data["columns"]
    n_rows = len(df)

    # Map each column's string name to its Series. Looking columns up with
    # ``df[name]`` would raise KeyError whenever the real label is not a
    # string, because ``name`` comes from the stringified baseline keys.
    current: dict[str, pd.Series] = {str(label): series for label, series in df.items()}
    current_names = set(current)
    baseline_names = set(base_columns)

    findings: list[Finding] = [
        Finding(
            check="drift",
            severity=Severity.ERROR,
            message=f"Column '{removed}' present in baseline is missing from the data.",
            column=removed,
            details={"kind": "removed_column"},
        )
        for removed in sorted(baseline_names - current_names)
    ]
    findings.extend(
        Finding(
            check="drift",
            severity=Severity.WARNING,
            message=f"Column '{added}' is new relative to the baseline.",
            column=added,
            details={"kind": "added_column"},
        )
        for added in sorted(current_names - baseline_names)
    )
    for col in sorted(baseline_names & current_names):
        findings.extend(_column_drift(col, current[col], base_columns[col], n_rows, config))
    return findings
148
+
149
+
150
def _column_drift(
    col: str, series: pd.Series, base: dict[str, Any], n_rows: int, config: Config
) -> list[Finding]:
    """Collect drift warnings for one column that exists in both datasets.

    Three independent comparisons are made against the baseline entry
    ``base``: dtype equality, null-rate increase beyond
    ``config.drift_null_rate_increase``, and mean shift (in baseline standard
    deviations) beyond ``config.drift_mean_shift``.
    """

    def warn(message: str, details: dict[str, Any]) -> Finding:
        # Every per-column drift finding is a WARNING tagged with this column.
        return Finding(
            check="drift",
            severity=Severity.WARNING,
            message=message,
            column=col,
            details=details,
        )

    results: list[Finding] = []

    # 1) dtype comparison, by string representation.
    observed_dtype = str(series.dtype)
    expected_dtype = base["dtype"]
    if observed_dtype != expected_dtype:
        results.append(
            warn(
                f"Column '{col}' dtype changed: {expected_dtype} -> {observed_dtype}.",
                {"kind": "dtype_change", "from": expected_dtype, "to": observed_dtype},
            )
        )

    # 2) null-rate increase; an empty frame counts as a 0.0 null rate.
    if n_rows:
        null_rate = float(series.isna().sum()) / n_rows
    else:
        null_rate = 0.0
    expected_null_rate = base["null_rate"]
    if null_rate - float(expected_null_rate) > config.drift_null_rate_increase:
        results.append(
            warn(
                f"Column '{col}' null rate rose from {expected_null_rate:.1%} "
                f"to {null_rate:.1%}.",
                {
                    "kind": "null_rate_increase",
                    "from": expected_null_rate,
                    "to": round(null_rate, 6),
                },
            )
        )

    # 3) mean shift, only when the baseline has numeric stats and the current
    #    column is still numeric.
    reference = base.get("numeric")
    if not reference or not pd.api.types.is_numeric_dtype(series):
        return results

    observed = series.dropna().astype(float)
    spread = float(reference["std"])
    # ``not spread > 0`` (rather than ``spread <= 0``) also skips a NaN std.
    if observed.empty or not spread > 0:
        return results

    observed_mean = float(observed.mean())
    shift = abs(observed_mean - float(reference["mean"])) / spread
    if shift > config.drift_mean_shift:
        results.append(
            warn(
                f"Column '{col}' mean shifted {shift:.1f} std from baseline.",
                {
                    "kind": "mean_shift",
                    "shift_in_std": round(shift, 6),
                    "baseline_mean": reference["mean"],
                    "current_mean": round(observed_mean, 6),
                },
            )
        )
    return results