dqscore 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dqscore/__init__.py +40 -0
- dqscore/autoscan.py +60 -0
- dqscore/checks.py +127 -0
- dqscore/cli.py +80 -0
- dqscore/profiling.py +134 -0
- dqscore/py.typed +0 -0
- dqscore/report.py +199 -0
- dqscore/validator.py +189 -0
- dqscore-0.1.0.dist-info/METADATA +184 -0
- dqscore-0.1.0.dist-info/RECORD +14 -0
- dqscore-0.1.0.dist-info/WHEEL +5 -0
- dqscore-0.1.0.dist-info/entry_points.txt +2 -0
- dqscore-0.1.0.dist-info/licenses/LICENSE +21 -0
- dqscore-0.1.0.dist-info/top_level.txt +1 -0
dqscore/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""dqscore — a lightweight data quality toolkit for pandas.
|
|
2
|
+
|
|
3
|
+
Quick start
|
|
4
|
+
-----------
|
|
5
|
+
>>> import pandas as pd
|
|
6
|
+
>>> import dqscore as dq
|
|
7
|
+
>>> df = pd.DataFrame({"id": [1, 2, 2], "age": [30, -1, 41]})
|
|
8
|
+
>>> result = dq.auto_scan(df)
|
|
9
|
+
>>> result.passed
|
|
10
|
+
False
|
|
11
|
+
|
|
12
|
+
Declare expectations explicitly with a :class:`~dqscore.Schema`::
|
|
13
|
+
|
|
14
|
+
schema = dq.Schema("people")
|
|
15
|
+
schema.column("id").not_null().unique()
|
|
16
|
+
schema.column("age").in_range(0, 120)
|
|
17
|
+
report = schema.validate(df)
|
|
18
|
+
print(report.summary())
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from . import checks
|
|
23
|
+
from .autoscan import auto_scan
|
|
24
|
+
from .profiling import Profile, profile
|
|
25
|
+
from .report import CheckResult, ValidationResult
|
|
26
|
+
from .validator import ColumnSchema, Schema
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"Schema",
|
|
32
|
+
"ColumnSchema",
|
|
33
|
+
"profile",
|
|
34
|
+
"Profile",
|
|
35
|
+
"auto_scan",
|
|
36
|
+
"ValidationResult",
|
|
37
|
+
"CheckResult",
|
|
38
|
+
"checks",
|
|
39
|
+
"__version__",
|
|
40
|
+
]
|
dqscore/autoscan.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Zero-config quality scan: infer sensible default checks for any DataFrame."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
|
|
8
|
+
from .report import ValidationResult
|
|
9
|
+
from .validator import Schema
|
|
10
|
+
|
|
11
|
+
__all__ = ["auto_scan"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _looks_like_id(name: str) -> bool:
|
|
15
|
+
lowered = str(name).lower()
|
|
16
|
+
return lowered == "id" or lowered.endswith("_id") or lowered.endswith("id")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def auto_scan(
    df: pd.DataFrame,
    max_null_pct: float = 0.0,
    name: str = "auto_scan",
) -> ValidationResult:
    """Scan ``df`` with inferred default checks; no schema is needed.

    A schema called ``name`` is assembled on the fly and then validated:

    * a column whose share of nulls exceeds ``max_null_pct`` percent gets a
      ``not_null`` check (columns within the allowance get no null check);
    * a column whose label looks like an identifier gets a ``unique`` check;
    * the frame as a whole gets a ``no_duplicate_rows`` check.

    Parameters
    ----------
    df:
        The DataFrame to scan.
    max_null_pct:
        Percentage (0-100) of nulls tolerated per column. ``0.0`` means
        "no nulls allowed".
    name:
        Name given to the generated schema.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("auto_scan() expects a pandas DataFrame")

    row_count = len(df)
    allowed_fraction = max_null_pct / 100.0
    inferred = Schema(name)

    for label in df.columns:
        values = df[label]
        # An empty frame has no nulls to report; skip the mean of nothing.
        observed = values.isna().mean() if row_count else 0.0

        if observed > allowed_fraction:
            # Too many nulls: let not_null point at the offending rows.
            inferred.column(label).not_null()

        if _looks_like_id(label):
            # Keys should not repeat; duplicates here are a common defect.
            inferred.column(label).unique()

    inferred.no_duplicate_rows()
    return inferred.validate(df)
|
dqscore/checks.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Low-level data quality checks.
|
|
2
|
+
|
|
3
|
+
Every check takes a :class:`pandas.Series` (or DataFrame, for frame-level
|
|
4
|
+
checks) and returns a boolean mask aligned to the input where ``True`` marks a
|
|
5
|
+
*failing* row. Null handling is deliberate: most checks let nulls pass so that
|
|
6
|
+
``not_null`` is the single source of truth for missing values. Combine checks to
|
|
7
|
+
express richer expectations.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, Iterable, Optional
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"not_null",
|
|
18
|
+
"unique",
|
|
19
|
+
"in_range",
|
|
20
|
+
"in_set",
|
|
21
|
+
"matches",
|
|
22
|
+
"is_numeric",
|
|
23
|
+
"is_integer",
|
|
24
|
+
"is_datetime",
|
|
25
|
+
"string_length",
|
|
26
|
+
"no_duplicate_rows",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _as_bool_mask(mask: pd.Series, index: pd.Index) -> pd.Series:
|
|
31
|
+
"""Coerce a mask to a clean boolean Series aligned to ``index``."""
|
|
32
|
+
return pd.Series(mask, index=index).fillna(False).astype(bool)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def not_null(series: pd.Series) -> pd.Series:
    """Flag every missing entry (null / NaN / NaT) of ``series`` as failing."""
    missing = series.isnull()
    return missing
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def unique(series: pd.Series) -> pd.Series:
    """Flag every non-null entry whose value occurs more than once.

    All occurrences of a repeated value are flagged (``keep=False``); nulls
    are never flagged here.
    """
    present = series.notna()
    repeated = series.duplicated(keep=False)
    return _as_bool_mask(repeated & present, series.index)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def in_range(
    series: pd.Series,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    inclusive: bool = True,
) -> pd.Series:
    """Flag values that fall outside ``min_value`` .. ``max_value``.

    Values are first coerced with ``pd.to_numeric``. A bound left as ``None``
    is not enforced. With ``inclusive=True`` the bounds themselves are
    accepted; otherwise a value equal to a bound is flagged too. Non-null
    values that cannot be coerced to a number are flagged; nulls are not.
    """
    values = pd.to_numeric(series, errors="coerce")
    flagged = pd.Series(False, index=series.index)

    if min_value is not None:
        below = values < min_value if inclusive else values <= min_value
        flagged |= below

    if max_value is not None:
        above = values > max_value if inclusive else values >= max_value
        flagged |= above

    # Present in the input but lost by the coercion: not a number at all.
    flagged |= values.isna() & series.notna()
    return _as_bool_mask(flagged, series.index)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def in_set(series: pd.Series, allowed: Iterable[Any]) -> pd.Series:
    """Flag non-null entries whose value is not one of ``allowed``."""
    permitted = set(allowed)
    present = series.notna()
    outside = ~series.isin(permitted)
    return _as_bool_mask(outside & present, series.index)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def matches(series: pd.Series, pattern: str, full_match: bool = False) -> pd.Series:
    """Flag non-null entries whose text does not match ``pattern``.

    Each value is converted with ``str`` before matching. With
    ``full_match=True`` the whole text must match (``fullmatch``); otherwise
    a match anywhere in the text is enough (``search``).
    """
    regex = re.compile(pattern)
    probe = regex.fullmatch if full_match else regex.search

    def _fails(value: Any) -> bool:
        # Nulls are left to not_null; everything else is matched as text.
        return False if pd.isna(value) else probe(str(value)) is None

    return _as_bool_mask(series.map(_fails), series.index)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def is_numeric(series: pd.Series) -> pd.Series:
    """Flag non-null entries that ``pd.to_numeric`` cannot turn into a number."""
    parsed = pd.to_numeric(series, errors="coerce")
    unparseable = series.notna() & parsed.isna()
    return _as_bool_mask(unparseable, series.index)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def is_integer(series: pd.Series) -> pd.Series:
    """Flag non-null entries that are not whole numbers.

    An entry is flagged when it cannot be coerced to a number at all, or
    when the coerced number has a non-zero remainder modulo 1.
    """
    parsed = pd.to_numeric(series, errors="coerce")
    unparseable = parsed.isna() & series.notna()
    fractional = parsed.notna() & (parsed % 1 != 0)
    return _as_bool_mask(unparseable | fractional, series.index)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def is_datetime(series: pd.Series, fmt: Optional[str] = None) -> pd.Series:
    """Flag non-null entries that ``pd.to_datetime`` cannot parse.

    ``fmt`` is forwarded as the ``format`` argument of ``pd.to_datetime``.
    """
    parsed = pd.to_datetime(series, errors="coerce", format=fmt)
    unparseable = series.notna() & parsed.isna()
    return _as_bool_mask(unparseable, series.index)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def string_length(
    series: pd.Series,
    min_len: Optional[int] = None,
    max_len: Optional[int] = None,
) -> pd.Series:
    """Fail non-null values whose string length is outside the bounds.

    Each non-null value is converted with ``str`` and its length compared
    against ``min_len`` / ``max_len`` (both inclusive). A bound left as
    ``None`` is not enforced. Nulls pass (use ``not_null``).

    Results are written back by position rather than by label, so the check
    also works when ``series`` has repeated index labels.

    Returns
    -------
    pd.Series
        Boolean mask on ``series.index``; ``True`` marks a failing row.
    """
    present = series.notna().to_numpy()
    lengths = series[present].astype(str).str.len()

    # Verdict for the non-null rows only, in their original order.
    bad = pd.Series(False, index=lengths.index)
    if min_len is not None:
        bad |= lengths < min_len
    if max_len is not None:
        bad |= lengths > max_len

    # Scatter the verdicts back positionally; null rows stay False.
    fail = pd.Series(False, index=series.index)
    fail.iloc[present] = bad.to_numpy(dtype=bool)
    return fail
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def no_duplicate_rows(
    df: pd.DataFrame, subset: Optional[Iterable[str]] = None
) -> pd.Series:
    """Flag rows of ``df`` that are duplicated.

    When ``subset`` is given only those columns are compared; otherwise all
    columns are. Every member of a duplicated group is flagged
    (``keep=False``).
    """
    compared = None if subset is None else list(subset)
    repeated = df.duplicated(subset=compared, keep=False)
    return _as_bool_mask(repeated, df.index)
|
dqscore/cli.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""Command-line interface: ``dqscore profile|scan path.csv``."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from . import __version__, auto_scan, profile
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _read(path: str, sep: Optional[str]) -> pd.DataFrame:
|
|
14
|
+
return pd.read_csv(path, sep=sep) if sep else pd.read_csv(path)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``dqscore`` argument parser.

    The parser has a ``--version`` flag and two mandatory sub-commands,
    ``profile`` and ``scan``. Both share the positional ``path`` and the
    ``--sep`` / ``--html`` / ``--json`` options; ``profile`` adds
    ``--markdown`` and ``scan`` adds ``--max-null-pct``.
    """
    root = argparse.ArgumentParser(
        prog="dqscore",
        description="Lightweight data quality toolkit for tabular data.",
    )
    root.add_argument(
        "--version",
        action="version",
        version=f"dqscore {__version__}",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # Options common to both sub-commands live on a help-less parent parser.
    shared = argparse.ArgumentParser(add_help=False)
    shared.add_argument("path", help="Path to a CSV/TSV file.")
    shared.add_argument("--sep", default=None, help="Field separator (e.g. '\\t').")
    shared.add_argument("--html", metavar="FILE", help="Write an HTML report.")
    shared.add_argument("--json", metavar="FILE", help="Write a JSON report.")

    profile_cmd = commands.add_parser(
        "profile",
        parents=[shared],
        help="Profile every column of the file.",
    )
    profile_cmd.add_argument(
        "--markdown",
        metavar="FILE",
        help="Write a Markdown profile.",
    )

    scan_cmd = commands.add_parser(
        "scan",
        parents=[shared],
        help="Run a zero-config quality scan.",
    )
    scan_cmd.add_argument(
        "--max-null-pct",
        type=float,
        default=0.0,
        help="Allowed %% of nulls per column (default 0).",
    )
    return root
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface and return the process exit code.

    Exit codes: ``2`` when the input file cannot be read; for ``profile``
    always ``0``; for ``scan`` ``0`` when every check passed and ``1``
    otherwise. The profile (as Markdown) or the scan summary is printed to
    standard output, and optional report files are written on request.
    """
    args = _build_parser().parse_args(argv)

    try:
        df = _read(args.path, args.sep)
    except Exception as exc:  # pragma: no cover - user input errors
        print(f"error: could not read {args.path}: {exc}", file=sys.stderr)
        return 2

    if args.command != "profile":
        # scan
        result = auto_scan(df, max_null_pct=args.max_null_pct)
        print(result.summary())
        if args.html:
            result.to_html(args.html)
        if args.json:
            with open(args.json, "w", encoding="utf-8") as fh:
                fh.write(result.to_json())
        return 0 if result.passed else 1

    prof = profile(df)
    rendered = prof.to_markdown()
    print(rendered)

    # Only the profile sub-parser defines --markdown, hence the getattr.
    markdown_target = getattr(args, "markdown", None)
    if markdown_target:
        with open(markdown_target, "w", encoding="utf-8") as fh:
            fh.write(rendered)
    if args.html:
        prof.to_html(args.html)
    if args.json:
        import json

        with open(args.json, "w", encoding="utf-8") as fh:
            json.dump(prof.to_dict(), fh, indent=2, default=str)
    return 0
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
if __name__ == "__main__": # pragma: no cover
|
|
80
|
+
raise SystemExit(main())
|
dqscore/profiling.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Lightweight DataFrame profiling: per-column stats, missingness, outliers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from html import escape
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
__all__ = ["profile", "Profile"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class Profile:
    """Result of :func:`profile`, with export helpers."""

    # One dict of statistics per profiled column.
    columns: List[Dict[str, Any]]
    # Number of rows in the profiled frame.
    n_rows: int
    # Number of columns in the profiled frame.
    n_cols: int

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict (``n_rows``, ``n_cols``, ``columns``)."""
        return {
            "n_rows": self.n_rows,
            "n_cols": self.n_cols,
            "columns": self.columns,
        }

    def to_frame(self) -> pd.DataFrame:
        """Return the profile as a tidy DataFrame (one row per column)."""
        return pd.DataFrame(self.columns)

    @staticmethod
    def _markdown_cell(value: Any) -> str:
        """Render one statistic as the text of a Markdown table cell."""
        # A statistic that does not apply (absent key, None, NaN) is left blank.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        # A raw pipe or line break would split the cell / row.
        return str(value).replace("|", "\\|").replace("\n", " ")

    def to_markdown(self) -> str:
        """Render a heading plus a Markdown table of the headline statistics.

        Only the statistics present for at least one column are shown, in a
        fixed order. Cells are taken straight from the per-column dicts, so
        integer counts keep their integer form and statistics that do not
        apply to a column are left blank.
        """
        wanted = ["column", "dtype", "missing", "missing_pct",
                  "unique", "distinct_pct", "mean", "min", "max",
                  "outliers_iqr", "top_value"]
        available = {key for info in self.columns for key in info}
        cols = [c for c in wanted if c in available]
        lines = [
            f"# Data Profile — {self.n_rows} rows × {self.n_cols} columns",
            "",
            "| " + " | ".join(cols) + " |",
            "| " + " | ".join(["---"] * len(cols)) + " |",
        ]
        for info in self.columns:
            cells = (self._markdown_cell(info.get(c)) for c in cols)
            lines.append("| " + " | ".join(cells) + " |")
        return "\n".join(lines)

    def to_html(self, path: Optional[str] = None) -> str:
        """Render the profile as a standalone HTML page and return it.

        When ``path`` is given the page is also written there as UTF-8.
        """
        table = self.to_frame().to_html(index=False, na_rep="", border=0)
        html = f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"><title>Data Profile</title>
<style>
body {{ font-family:-apple-system,Segoe UI,Roboto,sans-serif; margin:2rem;
color:#1f2328; }}
h1 {{ font-size:1.4rem; }}
table {{ border-collapse:collapse; width:100%; font-size:.88rem; }}
th,td {{ border-bottom:1px solid #d0d7de; padding:.45rem .6rem;
text-align:left; }}
th {{ background:#f6f8fa; position:sticky; top:0; }}
</style></head><body>
<h1>Data Profile — {self.n_rows} rows × {self.n_cols} columns</h1>
{table}
</body></html>"""
        if path:
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(html)
        return html

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return f"<Profile rows={self.n_rows} cols={self.n_cols}>"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _round(value: Any, n: int = 4) -> Any:
|
|
73
|
+
try:
|
|
74
|
+
return round(float(value), n)
|
|
75
|
+
except (TypeError, ValueError):
|
|
76
|
+
return value
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def profile(df: pd.DataFrame) -> Profile:
    """Profile every column of ``df``.

    Each column gets its dtype, non-null count, missing count and percentage,
    and distinct count and percentage. Numeric (non-boolean) columns with at
    least one non-null value also get descriptive statistics, zero and
    negative counts, and the number of values outside the 1.5 x IQR fences.
    All other columns get their most frequent value and its frequency, when
    they hold any non-null value.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("profile() expects a pandas DataFrame")

    row_count = len(df)
    per_column: List[Dict[str, Any]] = []

    for label in df.columns:
        values = df[label]
        n_missing = int(values.isna().sum())
        n_distinct = int(values.nunique(dropna=True))

        # Percentages are reported as 0.0 for an empty frame.
        if row_count:
            missing_pct = _round(n_missing / row_count * 100, 2)
            distinct_pct = _round(n_distinct / row_count * 100, 2)
        else:
            missing_pct = 0.0
            distinct_pct = 0.0

        stats: Dict[str, Any] = {
            "column": str(label),
            "dtype": str(values.dtype),
            "count": int(values.count()),
            "missing": n_missing,
            "missing_pct": missing_pct,
            "unique": n_distinct,
            "distinct_pct": distinct_pct,
        }

        is_number_column = pd.api.types.is_numeric_dtype(
            values
        ) and not pd.api.types.is_bool_dtype(values)

        if is_number_column:
            observed = values.dropna()
            if len(observed):
                lower_q = observed.quantile(0.25)
                upper_q = observed.quantile(0.75)
                spread = upper_q - lower_q
                fence_low = lower_q - 1.5 * spread
                fence_high = upper_q + 1.5 * spread
                beyond_fences = (observed < fence_low) | (observed > fence_high)

                stats["mean"] = _round(observed.mean())
                stats["std"] = _round(observed.std())
                stats["min"] = _round(observed.min())
                stats["q25"] = _round(lower_q)
                stats["median"] = _round(observed.median())
                stats["q75"] = _round(upper_q)
                stats["max"] = _round(observed.max())
                stats["zeros"] = int((observed == 0).sum())
                stats["negatives"] = int((observed < 0).sum())
                stats["outliers_iqr"] = int(beyond_fences.sum())
        else:
            frequencies = values.value_counts(dropna=True)
            if len(frequencies):
                # value_counts sorts by frequency, so the first entry is the mode.
                stats["top_value"] = str(frequencies.index[0])
                stats["top_freq"] = int(frequencies.iloc[0])

        per_column.append(stats)

    return Profile(columns=per_column, n_rows=row_count, n_cols=df.shape[1])
|
dqscore/py.typed
ADDED
|
File without changes
|
dqscore/report.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Result objects for validation runs, with scoring and export helpers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from html import escape
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
8
|
+
|
|
9
|
+
__all__ = ["CheckResult", "ValidationResult"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CheckResult:
    """What one check found on one column (or on the whole frame)."""

    # Name of the check and of the column it ran against.
    check: str
    column: str
    # Verdict and row counts.
    passed: bool
    n_failing: int
    n_total: int
    # Check parameters plus a sample of the failing rows.
    params: Dict[str, Any] = field(default_factory=dict)
    failing_index: List[Any] = field(default_factory=list)
    sample_values: List[Any] = field(default_factory=list)
    message: str = ""

    @property
    def pass_rate(self) -> float:
        """Share of rows that met the check, from 0.0 to 1.0 (1.0 when empty)."""
        total = self.n_total
        return 1.0 if total == 0 else (total - self.n_failing) / total

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict.

        ``pass_rate`` is rounded to 4 decimals, failing index labels are
        stringified and sample values are passed through ``_safe``.
        """
        index_labels = [str(label) for label in self.failing_index]
        samples = [_safe(sample) for sample in self.sample_values]
        return dict(
            check=self.check,
            column=self.column,
            passed=self.passed,
            n_failing=self.n_failing,
            n_total=self.n_total,
            pass_rate=round(self.pass_rate, 4),
            params=self.params,
            failing_index=index_labels,
            sample_values=samples,
            message=self.message,
        )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class ValidationResult:
    """Everything a validation run produced, with summaries and exporters."""

    # Individual check outcomes, in execution order.
    results: List[CheckResult]
    # Row count of the validated frame.
    n_rows: int
    # Name of the schema that produced these results.
    schema_name: str = "schema"

    # -- summary properties -------------------------------------------------
    @property
    def passed(self) -> bool:
        """``True`` when no check failed (also when there are no checks)."""
        for outcome in self.results:
            if not outcome.passed:
                return False
        return True

    @property
    def n_checks(self) -> int:
        """Number of checks that were run."""
        return len(self.results)

    @property
    def n_passed(self) -> int:
        """Number of checks that passed."""
        return len([outcome for outcome in self.results if outcome.passed])

    @property
    def n_failed(self) -> int:
        """Number of checks that failed."""
        return self.n_checks - self.n_passed

    @property
    def score(self) -> float:
        """Percentage of passed checks, 0 - 100 (100 when there are none)."""
        if self.results:
            return round(self.n_passed / self.n_checks * 100, 2)
        return 100.0

    @property
    def failures(self) -> List[CheckResult]:
        """The failed checks, in execution order."""
        return [outcome for outcome in self.results if not outcome.passed]

    # -- exports ------------------------------------------------------------
    def to_dict(self) -> Dict[str, Any]:
        """Return the run as a nested dict of summary figures and results."""
        return dict(
            schema=self.schema_name,
            passed=self.passed,
            score=self.score,
            n_rows=self.n_rows,
            n_checks=self.n_checks,
            n_passed=self.n_passed,
            n_failed=self.n_failed,
            results=[outcome.to_dict() for outcome in self.results],
        )

    def to_json(self, indent: int = 2) -> str:
        """Serialise :meth:`to_dict` as JSON, using ``_safe`` as the fallback encoder."""
        payload = self.to_dict()
        return json.dumps(payload, indent=indent, default=_safe)

    def summary(self) -> str:
        """Return a plain-text report: a header, then one line per check."""
        status = "FAILED"
        if self.passed:
            status = "PASSED"
        header = [
            f"Data Quality Report - schema: {self.schema_name}",
            "=" * 52,
            f"Status : {status}",
            f"Score : {self.score}% ({self.n_passed}/{self.n_checks} checks)",
            f"Rows : {self.n_rows}",
            "-" * 52,
        ]
        body = []
        for r in self.results:
            if r.passed:
                body.append(f"[PASS] {r.column}.{r.check}")
            else:
                body.append(f"[FAIL] {r.column}.{r.check} ({r.n_failing} failing)")
        return "\n".join(header + body)

    def to_markdown(self) -> str:
        """Return a Markdown report: heading, status line and a results table."""
        status = "FAILED"
        if self.passed:
            status = "PASSED"
        lines = [
            f"# Data Quality Report — `{self.schema_name}`",
            "",
            f"**Status:** {status} · **Score:** {self.score}% "
            f"· **Checks:** {self.n_passed}/{self.n_checks} passed "
            f"· **Rows:** {self.n_rows}",
            "",
            "| Status | Column | Check | Failing | Pass rate |",
            "| :----: | ------ | ----- | ------: | --------: |",
        ]
        lines.extend(
            f"| {'✅' if r.passed else '❌'} | `{r.column}` | {r.check} | "
            f"{r.n_failing} | {r.pass_rate:.1%} |"
            for r in self.results
        )
        return "\n".join(lines)

    def to_html(self, path: Optional[str] = None) -> str:
        """Render a standalone HTML report and return it.

        Column names, check names and the schema name are HTML-escaped.
        When ``path`` is given the page is also written there as UTF-8.
        """
        green, red = "#1a7f37", "#cf222e"

        def _table_row(r: Any) -> str:
            color = green if r.passed else red
            mark = "PASS" if r.passed else "FAIL"
            return (
                f"<tr><td style='color:{color};font-weight:600'>{mark}</td>"
                f"<td><code>{escape(str(r.column))}</code></td>"
                f"<td>{escape(r.check)}</td>"
                f"<td style='text-align:right'>{r.n_failing}</td>"
                f"<td style='text-align:right'>{r.pass_rate:.1%}</td></tr>"
            )

        rows = [_table_row(r) for r in self.results]
        if self.passed:
            status, status_color = "PASSED", green
        else:
            status, status_color = "FAILED", red
        html = f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8">
<title>DQ Report - {escape(self.schema_name)}</title>
<style>
body {{ font-family: -apple-system, Segoe UI, Roboto, sans-serif;
margin: 2rem; color: #1f2328; }}
h1 {{ font-size: 1.4rem; }}
.badge {{ display:inline-block; padding:.2rem .6rem; border-radius:6px;
color:#fff; font-weight:600; background:{status_color}; }}
table {{ border-collapse: collapse; width: 100%; margin-top: 1rem; }}
th, td {{ border-bottom: 1px solid #d0d7de; padding: .5rem .75rem;
text-align: left; font-size: .92rem; }}
th {{ background:#f6f8fa; }}
.meta {{ color:#57606a; margin:.5rem 0 1rem; }}
</style></head><body>
<h1>Data Quality Report — {escape(self.schema_name)}</h1>
<p><span class="badge">{status}</span></p>
<p class="meta">Score {self.score}% · {self.n_passed}/{self.n_checks} checks
passed · {self.n_rows} rows</p>
<table><thead><tr><th>Status</th><th>Column</th><th>Check</th>
<th style="text-align:right">Failing</th>
<th style="text-align:right">Pass rate</th></tr></thead>
<tbody>{''.join(rows)}</tbody></table>
</body></html>"""
        if path:
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(html)
        return html

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return (
            f"<ValidationResult passed={self.passed} score={self.score}% "
            f"checks={self.n_passed}/{self.n_checks}>"
        )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _safe(value: Any) -> Any:
|
|
187
|
+
"""Make numpy / pandas scalars JSON-serialisable."""
|
|
188
|
+
try:
|
|
189
|
+
import numpy as np
|
|
190
|
+
|
|
191
|
+
if isinstance(value, np.generic):
|
|
192
|
+
return value.item()
|
|
193
|
+
except Exception: # pragma: no cover
|
|
194
|
+
pass
|
|
195
|
+
if value is None:
|
|
196
|
+
return None
|
|
197
|
+
if isinstance(value, (str, int, float, bool)):
|
|
198
|
+
return value
|
|
199
|
+
return str(value)
|
dqscore/validator.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""A small, fluent schema API for declaring expectations and validating data.
|
|
2
|
+
|
|
3
|
+
Example
|
|
4
|
+
-------
|
|
5
|
+
>>> import dqscore as dq
|
|
6
|
+
>>> schema = dq.Schema("customers")
|
|
7
|
+
>>> schema.column("id").not_null().unique()
|
|
8
|
+
>>> schema.column("age").in_range(0, 120)
|
|
9
|
+
>>> schema.column("email").matches(r"^[^@]+@[^@]+\\.[^@]+$")
|
|
10
|
+
>>> schema.no_duplicate_rows()
|
|
11
|
+
>>> result = schema.validate(df) # doctest: +SKIP
|
|
12
|
+
>>> print(result.summary()) # doctest: +SKIP
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from . import checks
|
|
21
|
+
from .report import CheckResult, ValidationResult
|
|
22
|
+
|
|
23
|
+
__all__ = ["Schema", "ColumnSchema"]
|
|
24
|
+
|
|
25
|
+
# A registered check: display name, function(series)->mask, params for the report
|
|
26
|
+
_ColumnCheck = Tuple[str, Callable[[pd.Series], pd.Series], Dict[str, Any]]
|
|
27
|
+
_FrameCheck = Tuple[str, Callable[[pd.DataFrame], pd.Series], Dict[str, Any]]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ColumnSchema:
    """The checks declared for one column.

    Every expectation method registers a check and returns ``self`` so calls
    can be chained. A registered check is a ``(name, function, params)``
    triple, where the function maps the column to a mask of failing rows.
    """

    def __init__(self, name: str, parent: "Schema") -> None:
        self.name = name
        self._parent = parent
        self._checks: List[_ColumnCheck] = []
        # When True, validation reports a failure if the column is absent.
        self.required = True

    def _add(self, name: str, fn: Callable[[pd.Series], pd.Series], **params: Any):
        """Record one check with its report parameters and return ``self``."""
        entry = (name, fn, params)
        self._checks.append(entry)
        return self

    # -- expectations -------------------------------------------------------
    def not_null(self) -> "ColumnSchema":
        """Expect no missing values."""
        return self._add("not_null", checks.not_null)

    def unique(self) -> "ColumnSchema":
        """Expect no repeated non-null values."""
        return self._add("unique", checks.unique)

    def in_range(
        self,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        inclusive: bool = True,
    ) -> "ColumnSchema":
        """Expect values between ``min_value`` and ``max_value``."""

        def _run(series: pd.Series) -> pd.Series:
            return checks.in_range(series, min_value, max_value, inclusive)

        return self._add(
            "in_range",
            _run,
            min_value=min_value,
            max_value=max_value,
            inclusive=inclusive,
        )

    def in_set(self, allowed: Iterable[Any]) -> "ColumnSchema":
        """Expect every non-null value to be one of ``allowed``."""
        # Materialise once so one-shot iterables survive repeated validation.
        allowed = list(allowed)

        def _run(series: pd.Series) -> pd.Series:
            return checks.in_set(series, allowed)

        return self._add("in_set", _run, allowed=allowed)

    def matches(self, pattern: str, full_match: bool = False) -> "ColumnSchema":
        """Expect non-null values to match the regular expression ``pattern``."""

        def _run(series: pd.Series) -> pd.Series:
            return checks.matches(series, pattern, full_match)

        return self._add(
            "matches",
            _run,
            pattern=pattern,
            full_match=full_match,
        )

    def is_numeric(self) -> "ColumnSchema":
        """Expect non-null values to be parseable as numbers."""
        return self._add("is_numeric", checks.is_numeric)

    def is_integer(self) -> "ColumnSchema":
        """Expect non-null values to be whole numbers."""
        return self._add("is_integer", checks.is_integer)

    def is_datetime(self, fmt: Optional[str] = None) -> "ColumnSchema":
        """Expect non-null values to be parseable as dates/times."""

        def _run(series: pd.Series) -> pd.Series:
            return checks.is_datetime(series, fmt)

        return self._add("is_datetime", _run, fmt=fmt)

    def string_length(
        self, min_len: Optional[int] = None, max_len: Optional[int] = None
    ) -> "ColumnSchema":
        """Expect the string length of non-null values to be within bounds."""

        def _run(series: pd.Series) -> pd.Series:
            return checks.string_length(series, min_len, max_len)

        return self._add(
            "string_length",
            _run,
            min_len=min_len,
            max_len=max_len,
        )

    def custom(
        self, fn: Callable[[pd.Series], pd.Series], name: str = "custom"
    ) -> "ColumnSchema":
        """Register a user-supplied check; ``fn`` returns a mask, ``True`` == failing."""
        return self._add(name, fn)

    # -- convenience to keep chaining onto the schema ----------------------
    def column(self, name: str) -> "ColumnSchema":
        """Switch to another column of the owning schema."""
        owner = self._parent
        return owner.column(name)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class Schema:
    """A collection of column- and frame-level expectations.

    Column expectations are declared through :meth:`column`; frame-level
    ones through :meth:`no_duplicate_rows`.  :meth:`validate` runs all of
    them against a DataFrame and returns a ``ValidationResult``.
    """

    def __init__(self, name: str = "schema") -> None:
        self.name = name
        # Per-column schemas keyed by column name, kept in declaration order.
        self._columns: Dict[str, ColumnSchema] = {}
        # Frame-level checks, stored as (check name, callable, params) triples.
        self._frame_checks: List[_FrameCheck] = []

    def column(self, name: str) -> "ColumnSchema":
        """Return (creating if needed) the schema for ``name``."""
        if name not in self._columns:
            self._columns[name] = ColumnSchema(name, self)
        return self._columns[name]

    def no_duplicate_rows(
        self, subset: Optional[Iterable[str]] = None
    ) -> "Schema":
        """Register a frame-level duplicate-row check.

        Parameters
        ----------
        subset:
            Column names handed to ``checks.no_duplicate_rows``.  ``None``
            (the default) passes no subset.  A single column name given as a
            plain string is accepted and treated as a one-element subset.

        Returns
        -------
        Schema
            ``self``, so calls can be chained.
        """
        if isinstance(subset, str):
            # A str is itself an iterable of characters: list("id") would
            # silently turn into ["i", "d"] and check the wrong columns.
            columns: Optional[List[str]] = [subset]
        elif subset is not None:
            # Materialise once so a one-shot iterator is not exhausted by the
            # first validation run.
            columns = list(subset)
        else:
            columns = None

        self._frame_checks.append(
            (
                "no_duplicate_rows",
                lambda df: checks.no_duplicate_rows(df, columns),
                {"subset": columns},
            )
        )
        return self

    def validate(self, df: pd.DataFrame) -> "ValidationResult":
        """Run every declared check against ``df`` and collect the results.

        Column checks run first, in declaration order, followed by the
        frame-level checks.  A declared column that is absent from ``df``
        yields a single failing ``column_exists`` result when the column is
        marked ``required``; otherwise it is skipped silently.

        Raises
        ------
        TypeError
            If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("validate() expects a pandas DataFrame")

        results: List[CheckResult] = []
        n_total = len(df)

        for col_name, col in self._columns.items():
            if col_name not in df.columns:
                if col.required:
                    # Every row counts as failing when the column is missing.
                    results.append(
                        CheckResult(
                            check="column_exists",
                            column=col_name,
                            passed=False,
                            n_failing=n_total,
                            n_total=n_total,
                            message=f"Column '{col_name}' is missing.",
                        )
                    )
                continue

            series = df[col_name]
            for check_name, fn, params in col._checks:
                # Each check returns a failure mask (True == failing row).
                mask = checks._as_bool_mask(fn(series), series.index)
                n_failing = int(mask.sum())
                # Keep only the first ten offenders to bound report size.
                failing_idx = list(df.index[mask][:10])
                results.append(
                    CheckResult(
                        check=check_name,
                        column=col_name,
                        passed=n_failing == 0,
                        n_failing=n_failing,
                        n_total=n_total,
                        params=params,
                        failing_index=failing_idx,
                        sample_values=list(series[mask].head(10)),
                    )
                )

        for check_name, fn, params in self._frame_checks:
            mask = checks._as_bool_mask(fn(df), df.index)
            n_failing = int(mask.sum())
            results.append(
                CheckResult(
                    check=check_name,
                    # Placeholder column label for checks on the whole frame.
                    column="<frame>",
                    passed=n_failing == 0,
                    n_failing=n_failing,
                    n_total=n_total,
                    params=params,
                    failing_index=list(df.index[mask][:10]),
                )
            )

        return ValidationResult(results=results, n_rows=n_total, schema_name=self.name)
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dqscore
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan.
|
|
5
|
+
Author-email: YOUR NAME <you@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/dgvj-work/dqscore
|
|
8
|
+
Project-URL: Repository, https://github.com/dgvj-work/dqscore
|
|
9
|
+
Project-URL: Issues, https://github.com/dgvj-work/dqscore/issues
|
|
10
|
+
Keywords: data-quality,pandas,validation,data-profiling,etl,dataframe
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering
|
|
21
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
22
|
+
Requires-Python: >=3.8
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pandas>=1.3
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
|
|
31
|
+
# dqscore
|
|
32
|
+
|
|
33
|
+
> A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
|
|
34
|
+
> expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
|
|
35
|
+
> no config files required.
|
|
36
|
+
|
|
37
|
+
[CI](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
|
|
38
|
+
[Python](https://www.python.org/)
|
|
39
|
+
[License: MIT](LICENSE)
|
|
40
|
+
|
|
41
|
+
`dqscore` helps you catch the boring-but-costly data problems — nulls where there
|
|
42
|
+
shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
|
|
43
|
+
they reach a model, a dashboard, or a stakeholder.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
## Why this exists
|
|
47
|
+
Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
|
|
48
|
+
The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
|
|
49
|
+
But they leave a gap. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — "is this data OK?" — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
|
|
50
|
+
dqscore is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so `dqscore scan data.csv` drops straight into a CI pipeline or a pre-commit hook with no glue code.
|
|
51
|
+
It's not a replacement for Great Expectations or Pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
|
|
52
|
+
The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Why dqscore?
|
|
57
|
+
|
|
58
|
+
- **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
|
|
59
|
+
- **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
|
|
60
|
+
- **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
|
|
61
|
+
- **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
|
|
62
|
+
so it drops straight into a pipeline or pre-commit hook.
|
|
63
|
+
- **One dependency:** pandas.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install dqscore
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Or install the latest from source:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
git clone https://github.com/dgvj-work/dqscore.git
|
|
77
|
+
cd dqscore
|
|
78
|
+
pip install -e ".[dev]"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Quick start
|
|
84
|
+
|
|
85
|
+
### 1. Profile a DataFrame
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import pandas as pd
|
|
89
|
+
import dqscore as dq
|
|
90
|
+
|
|
91
|
+
df = pd.read_csv("customers.csv")
|
|
92
|
+
profile = dq.profile(df)
|
|
93
|
+
|
|
94
|
+
print(profile.to_markdown()) # per-column stats
|
|
95
|
+
profile.to_html("profile.html")
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### 2. Validate against a schema
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
schema = dq.Schema("customers")
|
|
102
|
+
schema.column("id").not_null().unique()
|
|
103
|
+
schema.column("age").in_range(0, 120)
|
|
104
|
+
schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
|
|
105
|
+
schema.column("country").in_set(["US", "CA", "MX"])
|
|
106
|
+
schema.no_duplicate_rows()
|
|
107
|
+
|
|
108
|
+
result = schema.validate(df)
|
|
109
|
+
|
|
110
|
+
print(result.summary()) # human-readable report
|
|
111
|
+
print("Quality score:", result.score)
|
|
112
|
+
result.to_html("dq_report.html")
|
|
113
|
+
|
|
114
|
+
if not result.passed:
|
|
115
|
+
raise SystemExit("Data quality checks failed")
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 3. Zero-config scan
|
|
119
|
+
|
|
120
|
+
When you just want a quick read on a new file:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
|
|
124
|
+
print(result.summary())
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Command line
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Profile every column
|
|
133
|
+
dqscore profile data.csv --html profile.html
|
|
134
|
+
|
|
135
|
+
# Quick quality scan (exit code 1 if it fails — great for CI)
|
|
136
|
+
dqscore scan data.csv --json report.json
|
|
137
|
+
dqscore scan data.csv --max-null-pct 5
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Available checks
|
|
143
|
+
|
|
144
|
+
| Method | Fails when… |
|
|
145
|
+
| ----------------------------------- | -------------------------------------------- |
|
|
146
|
+
| `not_null()` | value is null / NaN / NaT |
|
|
147
|
+
| `unique()` | a non-null value occurs more than once |
|
|
148
|
+
| `in_range(min, max, inclusive)` | numeric value is outside the bounds |
|
|
149
|
+
| `in_set([...])` | value is not one of the allowed values |
|
|
150
|
+
| `matches(pattern, full_match)` | string does not match the regex |
|
|
151
|
+
| `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
|
|
152
|
+
| `is_datetime(fmt)` | value can't be parsed as a date/time |
|
|
153
|
+
| `string_length(min_len, max_len)` | string length is out of bounds |
|
|
154
|
+
| `custom(fn, name)` | your function returns `True` for a row |
|
|
155
|
+
| `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
|
|
156
|
+
|
|
157
|
+
Checks chain on a column and most let nulls pass, so `not_null()` stays the single
|
|
158
|
+
source of truth for missing values:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
schema.column("score").not_null().is_numeric().in_range(0, 100)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
---
|
|
165
|
+
|
|
166
|
+
## Reports & scoring
|
|
167
|
+
|
|
168
|
+
A `ValidationResult` gives you:
|
|
169
|
+
|
|
170
|
+
- `result.passed` — `True`/`False`
|
|
171
|
+
- `result.score` — percentage of checks passed (0–100)
|
|
172
|
+
- `result.failures` — only the failing checks (with sample failing values & indices)
|
|
173
|
+
- `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Contributing
|
|
178
|
+
|
|
179
|
+
Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
180
|
+
Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
dqscore/__init__.py,sha256=B2m7qEYsRqUXwiECq_tmb3O-QMF-zNymlp9-d45xURk,922
|
|
2
|
+
dqscore/autoscan.py,sha256=69kngyUDK5dxqELPRjSp4WfdxJvVnUmT7KP9zwSy2MA,1810
|
|
3
|
+
dqscore/checks.py,sha256=QVuuWHIxaoaOnSmDh-j4ds-8fxQywMi2-qLhenWqN1g,4398
|
|
4
|
+
dqscore/cli.py,sha256=SpUTZ2YfJ6h9glMYCwalOKqYv7II5why_eKlSH33u0Q,2821
|
|
5
|
+
dqscore/profiling.py,sha256=tABsjgQUYRRAnkCo6mhdH_YpWE31EEYUozSqcfohTvg,4733
|
|
6
|
+
dqscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
dqscore/report.py,sha256=nMuJS8ccYFUjP7VIdp01Ci8vdQjboG1z7pVgIGRAchg,6978
|
|
8
|
+
dqscore/validator.py,sha256=Hcxky6g-b9nuXp1oQZG2La9QfhsCXBbhnUVWTDh5pgw,6751
|
|
9
|
+
dqscore-0.1.0.dist-info/licenses/LICENSE,sha256=8vHK8oOyyleh7zi03oOw8La7y31Z3ik7qMnPatOPri4,1073
|
|
10
|
+
dqscore-0.1.0.dist-info/METADATA,sha256=812mfKCi0wl1ENNDICiqeEwhpK9PuHP-AF8ZIHRsJXk,7875
|
|
11
|
+
dqscore-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
dqscore-0.1.0.dist-info/entry_points.txt,sha256=ECYp7oL4KZz5nDtBCS78cYzY9fnY5gWL_MQRwuF8iFM,45
|
|
13
|
+
dqscore-0.1.0.dist-info/top_level.txt,sha256=8_eTWrrK-QYmqumHL4p-2p5hW48SHxyx8nLyNv-byaE,8
|
|
14
|
+
dqscore-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Digvijay Waghela
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
dqscore
|