dqscore 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dqscore/__init__.py ADDED
@@ -0,0 +1,40 @@
1
+ """dqscore — a lightweight data quality toolkit for pandas.
2
+
3
+ Quick start
4
+ -----------
5
+ >>> import pandas as pd
6
+ >>> import dqscore as dq
7
+ >>> df = pd.DataFrame({"id": [1, 2, 2], "age": [30, -1, 41]})
8
+ >>> result = dq.auto_scan(df)
9
+ >>> result.passed
10
+ False
11
+
12
+ Declare expectations explicitly with a :class:`~dqscore.Schema`::
13
+
14
+ schema = dq.Schema("people")
15
+ schema.column("id").not_null().unique()
16
+ schema.column("age").in_range(0, 120)
17
+ report = schema.validate(df)
18
+ print(report.summary())
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from . import checks
23
+ from .autoscan import auto_scan
24
+ from .profiling import Profile, profile
25
+ from .report import CheckResult, ValidationResult
26
+ from .validator import ColumnSchema, Schema
27
+
28
+ __version__ = "0.1.0"
29
+
30
+ __all__ = [
31
+ "Schema",
32
+ "ColumnSchema",
33
+ "profile",
34
+ "Profile",
35
+ "auto_scan",
36
+ "ValidationResult",
37
+ "CheckResult",
38
+ "checks",
39
+ "__version__",
40
+ ]
dqscore/autoscan.py ADDED
@@ -0,0 +1,60 @@
1
+ """Zero-config quality scan: infer sensible default checks for any DataFrame."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional
5
+
6
+ import pandas as pd
7
+
8
+ from .report import ValidationResult
9
+ from .validator import Schema
10
+
11
+ __all__ = ["auto_scan"]
12
+
13
+
14
def _looks_like_id(name: str) -> bool:
    """Return ``True`` when a column label looks like an identifier column.

    Recognised forms:

    * the bare label ``id`` (any letter case);
    * snake-case labels ending in ``_id`` (any letter case);
    * camel-case labels ending in ``Id`` / ``ID`` directly after a lowercase
      letter or a digit (``userId``, ``orderID``).

    A plain ``endswith("id")`` test is deliberately avoided: it also matches
    ordinary words such as ``paid``, ``valid`` or ``grid`` and would make the
    scan demand uniqueness from columns that are not keys.

    Parameters
    ----------
    name:
        The column label; non-string labels are converted with ``str``.
    """
    label = str(name)
    lowered = label.lower()
    if lowered == "id" or lowered.endswith("_id"):
        return True
    # camelCase suffix: the character before "Id"/"ID" must be lowercase or a
    # digit, otherwise all-caps words like "GRID" would qualify.
    if len(label) > 2 and label.endswith(("Id", "ID")):
        previous = label[-3]
        return previous.islower() or previous.isdigit()
    return False
17
+
18
+
19
def auto_scan(
    df: pd.DataFrame,
    max_null_pct: float = 0.0,
    name: str = "auto_scan",
) -> ValidationResult:
    """Scan ``df`` with inferred default checks; no schema has to be declared.

    Checks that get registered:

    * ``not_null`` for every column whose share of nulls is above
      ``max_null_pct`` percent (columns within the limit get no null check);
    * ``unique`` for every column whose label looks like an identifier
      (``id`` / ``*_id``);
    * ``no_duplicate_rows`` for the frame as a whole.

    Parameters
    ----------
    df:
        The DataFrame to scan.
    max_null_pct:
        Percentage of nulls tolerated per column. ``0.0`` means "no nulls
        allowed".
    name:
        Name given to the generated schema (shown in reports).

    Returns
    -------
    ValidationResult
        The outcome of validating ``df`` against the generated schema.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("auto_scan() expects a pandas DataFrame")

    has_rows = len(df) > 0
    allowed_fraction = max_null_pct / 100.0
    scan = Schema(name)

    for label in df.columns:
        column_data = df[label]
        # An empty frame has no meaningful null ratio; treat it as zero.
        if has_rows:
            missing_fraction = column_data.isna().mean()
        else:
            missing_fraction = 0.0

        if missing_fraction > allowed_fraction:
            # Surface the missing values through the regular not_null check.
            scan.column(label).not_null()

        if _looks_like_id(label):
            # Duplicate keys are a common defect, so identifier-like columns
            # must be unique.
            scan.column(label).unique()

    scan.no_duplicate_rows()
    return scan.validate(df)
dqscore/checks.py ADDED
@@ -0,0 +1,127 @@
1
+ """Low-level data quality checks.
2
+
3
+ Every check takes a :class:`pandas.Series` (or DataFrame, for frame-level
4
+ checks) and returns a boolean mask aligned to the input where ``True`` marks a
5
+ *failing* row. Null handling is deliberate: most checks let nulls pass so that
6
+ ``not_null`` is the single source of truth for missing values. Combine checks to
7
+ express richer expectations.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import Any, Iterable, Optional
13
+
14
+ import pandas as pd
15
+
16
+ __all__ = [
17
+ "not_null",
18
+ "unique",
19
+ "in_range",
20
+ "in_set",
21
+ "matches",
22
+ "is_numeric",
23
+ "is_integer",
24
+ "is_datetime",
25
+ "string_length",
26
+ "no_duplicate_rows",
27
+ ]
28
+
29
+
30
def _as_bool_mask(mask: pd.Series, index: pd.Index) -> pd.Series:
    """Return ``mask`` as a plain ``bool`` Series laid out on ``index``.

    The mask is (re)built on ``index``; any position left without a value is
    treated as "not failing" (``False``) before the cast to ``bool``.
    """
    aligned = pd.Series(mask, index=index)
    without_gaps = aligned.fillna(False)
    return without_gaps.astype(bool)
33
+
34
+
35
def not_null(series: pd.Series) -> pd.Series:
    """Mark as failing every row that holds a missing value (null / NaN / NaT)."""
    missing = series.isna()
    return missing
38
+
39
+
40
def unique(series: pd.Series) -> pd.Series:
    """Mark as failing every non-null row whose value occurs more than once.

    All occurrences of a repeated value are flagged, not just the later ones.
    Nulls never fail here.
    """
    repeated = series.duplicated(keep=False)
    present = series.notna()
    return _as_bool_mask(repeated & present, series.index)
44
+
45
+
46
def in_range(
    series: pd.Series,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    inclusive: bool = True,
) -> pd.Series:
    """Mark as failing every row that lies outside the given bounds.

    Values are parsed as numbers first. With ``inclusive=True`` the bounds
    themselves are accepted; with ``inclusive=False`` a value equal to a bound
    fails too. A bound left as ``None`` is not checked.

    Non-null values that cannot be parsed as numbers fail. Nulls pass (use
    ``not_null`` for those).
    """
    values = pd.to_numeric(series, errors="coerce")
    out_of_bounds = pd.Series(False, index=series.index)

    if min_value is not None:
        if inclusive:
            too_low = values < min_value
        else:
            too_low = values <= min_value
        out_of_bounds |= too_low

    if max_value is not None:
        if inclusive:
            too_high = values > max_value
        else:
            too_high = values >= max_value
        out_of_bounds |= too_high

    # Present in the input but lost during numeric parsing => not a number.
    unparseable = values.isna() & series.notna()
    out_of_bounds |= unparseable
    return _as_bool_mask(out_of_bounds, series.index)
65
+
66
+
67
def in_set(series: pd.Series, allowed: Iterable[Any]) -> pd.Series:
    """Mark as failing every non-null row whose value is not one of ``allowed``."""
    permitted = set(allowed)
    unexpected = ~series.isin(permitted)
    present = series.notna()
    return _as_bool_mask(unexpected & present, series.index)
72
+
73
+
74
def matches(series: pd.Series, pattern: str, full_match: bool = False) -> pd.Series:
    """Mark as failing every non-null row whose text does not match ``pattern``.

    Each value is converted with ``str`` before matching. With
    ``full_match=True`` the whole string must match; otherwise a match anywhere
    in the string is enough. Nulls pass.
    """
    regex = re.compile(pattern)
    if full_match:
        probe = regex.fullmatch
    else:
        probe = regex.search

    def _fails(value: Any) -> bool:
        # Missing values are not this check's concern.
        return False if pd.isna(value) else probe(str(value)) is None

    per_row = series.map(_fails)
    return _as_bool_mask(per_row, series.index)
85
+
86
+
87
def is_numeric(series: pd.Series) -> pd.Series:
    """Mark as failing every non-null row that cannot be parsed as a number."""
    parsed = pd.to_numeric(series, errors="coerce")
    # A value that was present but became NaN during parsing is not numeric.
    lost_in_parsing = parsed.isna() & series.notna()
    return _as_bool_mask(lost_in_parsing, series.index)
91
+
92
+
93
def is_integer(series: pd.Series) -> pd.Series:
    """Mark as failing every non-null row that is not a whole number.

    A row fails either because it cannot be parsed as a number at all or
    because the parsed number has a fractional part.
    """
    parsed = pd.to_numeric(series, errors="coerce")
    not_a_number = parsed.isna() & series.notna()
    has_fraction = parsed.notna() & (parsed % 1 != 0)
    return _as_bool_mask(not_a_number | has_fraction, series.index)
99
+
100
+
101
def is_datetime(series: pd.Series, fmt: Optional[str] = None) -> pd.Series:
    """Mark as failing every non-null row that cannot be parsed as a date/time.

    ``fmt`` is handed to :func:`pandas.to_datetime` as its ``format`` argument.
    """
    parsed = pd.to_datetime(series, errors="coerce", format=fmt)
    # Present in the input but NaT after parsing => not a valid date/time.
    lost_in_parsing = parsed.isna() & series.notna()
    return _as_bool_mask(lost_in_parsing, series.index)
105
+
106
+
107
def string_length(
    series: pd.Series,
    min_len: Optional[int] = None,
    max_len: Optional[int] = None,
) -> pd.Series:
    """Mark as failing every non-null row whose text length is out of bounds.

    Each non-null value is converted with ``str`` and its length compared with
    ``min_len`` / ``max_len`` (both inclusive; a bound left as ``None`` is not
    checked). Nulls pass (use ``not_null`` for those).

    The work is done by position rather than by index label, so the result is
    correct for a Series whose index contains repeated labels.

    Returns
    -------
    pandas.Series
        A ``bool`` mask on ``series.index`` where ``True`` marks a failing row.
    """
    present = series.notna().to_numpy(dtype=bool)
    # Lengths of the non-null values only, renumbered 0..k-1 so the
    # comparisons below never depend on the caller's index labels.
    lengths = series[present].astype(str).str.len().reset_index(drop=True)

    bad = pd.Series(False, index=lengths.index)
    if min_len is not None:
        bad |= lengths < min_len
    if max_len is not None:
        bad |= lengths > max_len

    # Scatter the verdicts back onto the positions of the non-null rows;
    # null rows keep the default "not failing".
    fail = pd.Series(False, index=series.index).to_numpy(dtype=bool, copy=True)
    fail[present] = bad.to_numpy(dtype=bool)
    return pd.Series(fail, index=series.index, dtype=bool)
120
+
121
+
122
def no_duplicate_rows(
    df: pd.DataFrame, subset: Optional[Iterable[str]] = None
) -> pd.Series:
    """Mark as failing every row that duplicates another row.

    When ``subset`` is given, only those columns are compared. All members of
    a duplicate group are flagged, including the first occurrence.
    """
    if subset is None:
        columns = None
    else:
        columns = list(subset)
    repeated = df.duplicated(subset=columns, keep=False)
    return _as_bool_mask(repeated, df.index)
dqscore/cli.py ADDED
@@ -0,0 +1,80 @@
1
+ """Command-line interface: ``dqscore profile|scan path.csv``."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+ from typing import List, Optional
7
+
8
+ import pandas as pd
9
+
10
+ from . import __version__, auto_scan, profile
11
+
12
+
13
def _read(path: str, sep: Optional[str]) -> pd.DataFrame:
    """Load a delimited text file into a DataFrame.

    A falsy ``sep`` (``None`` or ``""``) leaves the separator to
    :func:`pandas.read_csv`'s default.
    """
    if sep:
        return pd.read_csv(path, sep=sep)
    return pd.read_csv(path)
15
+
16
+
17
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``dqscore`` argument parser with its two sub-commands.

    ``profile`` and ``scan`` share the positional ``path`` and the ``--sep``,
    ``--html`` and ``--json`` options. ``profile`` adds ``--markdown``;
    ``scan`` adds ``--max-null-pct``.
    """
    root = argparse.ArgumentParser(
        prog="dqscore",
        description="Lightweight data quality toolkit for tabular data.",
    )
    root.add_argument(
        "--version", action="version", version=f"dqscore {__version__}"
    )
    commands = root.add_subparsers(dest="command", required=True)

    # Options shared by both sub-commands live on a help-less parent parser.
    shared = argparse.ArgumentParser(add_help=False)
    shared.add_argument("path", help="Path to a CSV/TSV file.")
    shared.add_argument("--sep", default=None, help="Field separator (e.g. '\\t').")
    shared.add_argument("--html", metavar="FILE", help="Write an HTML report.")
    shared.add_argument("--json", metavar="FILE", help="Write a JSON report.")

    profile_cmd = commands.add_parser(
        "profile",
        parents=[shared],
        help="Profile every column of the file.",
    )
    profile_cmd.add_argument(
        "--markdown", metavar="FILE", help="Write a Markdown profile."
    )

    scan_cmd = commands.add_parser(
        "scan",
        parents=[shared],
        help="Run a zero-config quality scan.",
    )
    scan_cmd.add_argument(
        "--max-null-pct",
        type=float,
        default=0.0,
        help="Allowed %% of nulls per column (default 0).",
    )
    return root
42
+
43
+
44
def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface and return the process exit status.

    Parameters
    ----------
    argv:
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        ``0`` on success (for ``scan``: every check passed), ``1`` when a scan
        found failing checks, ``2`` when the input could not be read or a
        report file could not be written.
    """
    args = _build_parser().parse_args(argv)

    try:
        df = _read(args.path, args.sep)
    except Exception as exc:  # pragma: no cover - user input errors
        print(f"error: could not read {args.path}: {exc}", file=sys.stderr)
        return 2

    try:
        if args.command == "profile":
            prof = profile(df)
            # Render once and reuse for both stdout and the optional file.
            markdown = prof.to_markdown()
            print(markdown)
            markdown_path = getattr(args, "markdown", None)
            if markdown_path:
                with open(markdown_path, "w", encoding="utf-8") as fh:
                    fh.write(markdown)
            if args.html:
                prof.to_html(args.html)
            if args.json:
                import json

                with open(args.json, "w", encoding="utf-8") as fh:
                    json.dump(prof.to_dict(), fh, indent=2, default=str)
            return 0

        # scan
        result = auto_scan(df, max_null_pct=args.max_null_pct)
        print(result.summary())
        if args.html:
            result.to_html(args.html)
        if args.json:
            with open(args.json, "w", encoding="utf-8") as fh:
                fh.write(result.to_json())
        return 0 if result.passed else 1
    except OSError as exc:  # pragma: no cover - output path errors
        # An uncaught exception would exit with status 1, which is reserved
        # for "checks failed"; report I/O problems with status 2 instead.
        print(f"error: could not write report: {exc}", file=sys.stderr)
        return 2
77
+
78
+
79
+ if __name__ == "__main__": # pragma: no cover
80
+ raise SystemExit(main())
dqscore/profiling.py ADDED
@@ -0,0 +1,134 @@
1
+ """Lightweight DataFrame profiling: per-column stats, missingness, outliers."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass
5
+ from html import escape
6
+ from typing import Any, Dict, List, Optional
7
+
8
+ import pandas as pd
9
+
10
+ __all__ = ["profile", "Profile"]
11
+
12
+
13
@dataclass
class Profile:
    """Result of :func:`profile`, with export helpers."""

    # One dict of statistics per profiled column.
    columns: List[Dict[str, Any]]
    # Number of rows in the profiled frame.
    n_rows: int
    # Number of columns in the profiled frame.
    n_cols: int

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dictionary."""
        return {
            "n_rows": self.n_rows,
            "n_cols": self.n_cols,
            "columns": self.columns,
        }

    def to_frame(self) -> pd.DataFrame:
        """Return the profile as a tidy DataFrame (one row per column)."""
        return pd.DataFrame(self.columns)

    @staticmethod
    def _markdown_cell(value: Any) -> str:
        """Render one value so that it cannot break a Markdown table row."""
        # Statistics that do not apply to a column show up as NaN in the
        # frame; print them as blanks, like ``to_html`` does via ``na_rep``.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        text = str(value)
        # A raw pipe would start a new cell and a line break would end the row.
        text = text.replace("|", "\\|")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def to_markdown(self) -> str:
        """Render the headline statistics as a Markdown table.

        Only the statistics present in the profile are shown, in a fixed
        order. Missing statistics are left blank and cell text is escaped so
        that every row keeps the same number of cells.
        """
        df = self.to_frame()
        cols = [c for c in ["column", "dtype", "missing", "missing_pct",
                            "unique", "distinct_pct", "mean", "min", "max",
                            "outliers_iqr", "top_value"] if c in df.columns]
        lines = [
            f"# Data Profile — {self.n_rows} rows × {self.n_cols} columns",
            "",
            "| " + " | ".join(cols) + " |",
            "| " + " | ".join(["---"] * len(cols)) + " |",
        ]
        for _, row in df[cols].iterrows():
            cells = [self._markdown_cell(row[c]) for c in cols]
            lines.append("| " + " | ".join(cells) + " |")
        return "\n".join(lines)

    def to_html(self, path: Optional[str] = None) -> str:
        """Render the full profile as a standalone HTML page.

        The page is always returned; when ``path`` is given it is also written
        there as UTF-8.
        """
        table = self.to_frame().to_html(index=False, na_rep="", border=0)
        html = f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"><title>Data Profile</title>
<style>
body {{ font-family:-apple-system,Segoe UI,Roboto,sans-serif; margin:2rem;
color:#1f2328; }}
h1 {{ font-size:1.4rem; }}
table {{ border-collapse:collapse; width:100%; font-size:.88rem; }}
th,td {{ border-bottom:1px solid #d0d7de; padding:.45rem .6rem;
text-align:left; }}
th {{ background:#f6f8fa; position:sticky; top:0; }}
</style></head><body>
<h1>Data Profile — {self.n_rows} rows × {self.n_cols} columns</h1>
{table}
</body></html>"""
        if path:
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(html)
        return html

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return f"<Profile rows={self.n_rows} cols={self.n_cols}>"
70
+
71
+
72
def _round(value: Any, n: int = 4) -> Any:
    """Round ``value`` to ``n`` decimals, or hand it back untouched.

    The value is converted with ``float`` first; anything that makes the
    conversion or rounding raise ``TypeError`` / ``ValueError`` is returned
    as-is.
    """
    try:
        outcome = round(float(value), n)
    except (ValueError, TypeError):
        outcome = value
    return outcome
77
+
78
+
79
def profile(df: pd.DataFrame) -> Profile:
    """Build a per-column profile of ``df``.

    Every column gets its label, dtype, non-null count, missing count and
    percentage, and distinct count and percentage (percentages are relative to
    the number of rows). Numeric, non-boolean columns with at least one value
    additionally get descriptive statistics and an IQR-based outlier count
    (values beyond 1.5 x IQR from the quartiles). Other columns record their
    most frequent value and its frequency.

    Columns are visited by position, so a frame with repeated column labels
    yields one entry per physical column.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("profile() expects a pandas DataFrame")

    n = len(df)
    columns: List[Dict[str, Any]] = []

    # ``df.items()`` yields one Series per column even when labels repeat,
    # whereas ``df[label]`` would return a DataFrame for a duplicated label.
    for col, s in df.items():
        missing = int(s.isna().sum())
        nunique = int(s.nunique(dropna=True))
        info: Dict[str, Any] = {
            "column": str(col),
            "dtype": str(s.dtype),
            "count": int(s.count()),
            "missing": missing,
            "missing_pct": _round(missing / n * 100, 2) if n else 0.0,
            "unique": nunique,
            "distinct_pct": _round(nunique / n * 100, 2) if n else 0.0,
        }

        if pd.api.types.is_numeric_dtype(s) and not pd.api.types.is_bool_dtype(s):
            numeric = s.dropna()
            if len(numeric):
                q1, q3 = numeric.quantile(0.25), numeric.quantile(0.75)
                iqr = q3 - q1
                # Fences at 1.5 x IQR beyond the quartiles.
                low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
                outliers = int(((numeric < low) | (numeric > high)).sum())
                info.update(
                    {
                        "mean": _round(numeric.mean()),
                        "std": _round(numeric.std()),
                        "min": _round(numeric.min()),
                        "q25": _round(q1),
                        "median": _round(numeric.median()),
                        "q75": _round(q3),
                        "max": _round(numeric.max()),
                        "zeros": int((numeric == 0).sum()),
                        "negatives": int((numeric < 0).sum()),
                        "outliers_iqr": outliers,
                    }
                )
        else:
            vc = s.value_counts(dropna=True)
            if len(vc):
                # value_counts sorts by frequency, so the first entry is the mode.
                info["top_value"] = str(vc.index[0])
                info["top_freq"] = int(vc.iloc[0])

        columns.append(info)

    return Profile(columns=columns, n_rows=n, n_cols=df.shape[1])
dqscore/py.typed ADDED
File without changes
dqscore/report.py ADDED
@@ -0,0 +1,199 @@
1
+ """Result objects for validation runs, with scoring and export helpers."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from dataclasses import dataclass, field
6
+ from html import escape
7
+ from typing import Any, Dict, List, Optional
8
+
9
+ __all__ = ["CheckResult", "ValidationResult"]
10
+
11
+
12
@dataclass
class CheckResult:
    """What one check found on one column (or on the frame as a whole)."""

    # Name of the check that ran.
    check: str
    # Label of the column the check ran against.
    column: str
    # Whether the check passed.
    passed: bool
    # Number of rows that failed the check.
    n_failing: int
    # Number of rows the check was evaluated over.
    n_total: int
    # Parameters the check was configured with.
    params: Dict[str, Any] = field(default_factory=dict)
    # Index labels of failing rows.
    failing_index: List[Any] = field(default_factory=list)
    # Example values taken from failing rows.
    sample_values: List[Any] = field(default_factory=list)
    # Optional human-readable note.
    message: str = ""

    @property
    def pass_rate(self) -> float:
        """Share of rows that met the check, between 0.0 and 1.0.

        With no rows at all the rate is defined as 1.0.
        """
        if self.n_total != 0:
            return (self.n_total - self.n_failing) / self.n_total
        return 1.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dictionary prepared for serialisation.

        Failing index labels are stringified and sample values are passed
        through ``_safe``; the pass rate is rounded to four decimals.
        """
        index_labels = [str(label) for label in self.failing_index]
        samples = [_safe(sample) for sample in self.sample_values]
        payload: Dict[str, Any] = {}
        payload["check"] = self.check
        payload["column"] = self.column
        payload["passed"] = self.passed
        payload["n_failing"] = self.n_failing
        payload["n_total"] = self.n_total
        payload["pass_rate"] = round(self.pass_rate, 4)
        payload["params"] = self.params
        payload["failing_index"] = index_labels
        payload["sample_values"] = samples
        payload["message"] = self.message
        return payload
46
+
47
+
48
@dataclass
class ValidationResult:
    """Aggregated outcome of validating a DataFrame against a schema."""

    # Individual check outcomes, in execution order.
    results: List[CheckResult]
    # Number of rows in the validated frame.
    n_rows: int
    # Name of the schema that produced the results.
    schema_name: str = "schema"

    # -- summary properties -------------------------------------------------
    @property
    def passed(self) -> bool:
        """``True`` when every check passed (also when there are no checks)."""
        return all(r.passed for r in self.results)

    @property
    def n_checks(self) -> int:
        """Total number of checks that ran."""
        return len(self.results)

    @property
    def n_passed(self) -> int:
        """Number of checks that passed."""
        return sum(1 for r in self.results if r.passed)

    @property
    def n_failed(self) -> int:
        """Number of checks that failed."""
        return self.n_checks - self.n_passed

    @property
    def score(self) -> float:
        """Percentage of checks that passed (0 - 100); 100 with no checks."""
        if not self.results:
            return 100.0
        return round(self.n_passed / self.n_checks * 100, 2)

    @property
    def failures(self) -> List[CheckResult]:
        """The results of the checks that did not pass."""
        return [r for r in self.results if not r.passed]

    # -- exports ------------------------------------------------------------
    def to_dict(self) -> Dict[str, Any]:
        """Return the summary figures plus every check result as a dictionary."""
        return {
            "schema": self.schema_name,
            "passed": self.passed,
            "score": self.score,
            "n_rows": self.n_rows,
            "n_checks": self.n_checks,
            "n_passed": self.n_passed,
            "n_failed": self.n_failed,
            "results": [r.to_dict() for r in self.results],
        }

    def to_json(self, indent: int = 2) -> str:
        """Serialise :meth:`to_dict` to JSON; unknown types go through ``_safe``."""
        return json.dumps(self.to_dict(), indent=indent, default=_safe)

    def summary(self) -> str:
        """Return a plain-text report: a header block and one line per check."""
        status = "PASSED" if self.passed else "FAILED"
        lines = [
            f"Data Quality Report - schema: {self.schema_name}",
            "=" * 52,
            f"Status : {status}",
            f"Score : {self.score}% ({self.n_passed}/{self.n_checks} checks)",
            f"Rows : {self.n_rows}",
            "-" * 52,
        ]
        for r in self.results:
            mark = "PASS" if r.passed else "FAIL"
            detail = "" if r.passed else f" ({r.n_failing} failing)"
            lines.append(f"[{mark}] {r.column}.{r.check}{detail}")
        return "\n".join(lines)

    def to_markdown(self) -> str:
        """Return the report as Markdown: a status line and a results table."""
        status = "PASSED" if self.passed else "FAILED"
        lines = [
            f"# Data Quality Report — `{self.schema_name}`",
            "",
            f"**Status:** {status} · **Score:** {self.score}% "
            f"· **Checks:** {self.n_passed}/{self.n_checks} passed "
            f"· **Rows:** {self.n_rows}",
            "",
            "| Status | Column | Check | Failing | Pass rate |",
            "| :----: | ------ | ----- | ------: | --------: |",
        ]
        for r in self.results:
            mark = "✅" if r.passed else "❌"
            lines.append(
                f"| {mark} | `{r.column}` | {r.check} | "
                f"{r.n_failing} | {r.pass_rate:.1%} |"
            )
        return "\n".join(lines)

    def to_html(self, path: Optional[str] = None) -> str:
        """Render the report as a standalone HTML page.

        The page is always returned; when ``path`` is given it is also written
        there as UTF-8. Schema name, column labels and check names are
        converted with ``str`` before HTML-escaping, so non-string values are
        rendered instead of raising.
        """
        rows = []
        for r in self.results:
            color = "#1a7f37" if r.passed else "#cf222e"
            mark = "PASS" if r.passed else "FAIL"
            rows.append(
                f"<tr><td style='color:{color};font-weight:600'>{mark}</td>"
                f"<td><code>{escape(str(r.column))}</code></td>"
                f"<td>{escape(str(r.check))}</td>"
                f"<td style='text-align:right'>{r.n_failing}</td>"
                f"<td style='text-align:right'>{r.pass_rate:.1%}</td></tr>"
            )
        status = "PASSED" if self.passed else "FAILED"
        status_color = "#1a7f37" if self.passed else "#cf222e"
        # ``escape`` only accepts strings; coerce once and reuse below.
        title = escape(str(self.schema_name))
        html = f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8">
<title>DQ Report - {title}</title>
<style>
body {{ font-family: -apple-system, Segoe UI, Roboto, sans-serif;
margin: 2rem; color: #1f2328; }}
h1 {{ font-size: 1.4rem; }}
.badge {{ display:inline-block; padding:.2rem .6rem; border-radius:6px;
color:#fff; font-weight:600; background:{status_color}; }}
table {{ border-collapse: collapse; width: 100%; margin-top: 1rem; }}
th, td {{ border-bottom: 1px solid #d0d7de; padding: .5rem .75rem;
text-align: left; font-size: .92rem; }}
th {{ background:#f6f8fa; }}
.meta {{ color:#57606a; margin:.5rem 0 1rem; }}
</style></head><body>
<h1>Data Quality Report — {title}</h1>
<p><span class="badge">{status}</span></p>
<p class="meta">Score {self.score}% · {self.n_passed}/{self.n_checks} checks
passed · {self.n_rows} rows</p>
<table><thead><tr><th>Status</th><th>Column</th><th>Check</th>
<th style="text-align:right">Failing</th>
<th style="text-align:right">Pass rate</th></tr></thead>
<tbody>{''.join(rows)}</tbody></table>
</body></html>"""
        if path:
            with open(path, "w", encoding="utf-8") as fh:
                fh.write(html)
        return html

    def __repr__(self) -> str:  # pragma: no cover - cosmetic
        return (
            f"<ValidationResult passed={self.passed} score={self.score}% "
            f"checks={self.n_passed}/{self.n_checks}>"
        )
184
+
185
+
186
def _safe(value: Any) -> Any:
    """Make numpy / pandas scalars JSON-serialisable.

    * numpy scalars are unwrapped to the matching Python object;
    * ``None``, strings, integers, booleans and finite floats are returned
      unchanged;
    * NaN becomes ``None`` and infinities become ``"inf"`` / ``"-inf"``,
      because ``json.dumps`` would otherwise emit the tokens ``NaN`` and
      ``Infinity``, which are not valid JSON;
    * anything else is converted with ``str``.
    """
    import math

    try:
        import numpy as np
    except ImportError:  # pragma: no cover - numpy ships with pandas
        np = None

    if np is not None and isinstance(value, np.generic):
        value = value.item()
        if not isinstance(value, float):
            # Unwrapped ints, bools, strings, datetimes, ... are handed back
            # as they are.
            return value

    if value is None:
        return None
    if isinstance(value, float):
        if math.isnan(value):
            return None
        if math.isinf(value):
            return str(value)
        return value
    if isinstance(value, (str, int, bool)):
        return value
    return str(value)
dqscore/validator.py ADDED
@@ -0,0 +1,189 @@
1
+ """A small, fluent schema API for declaring expectations and validating data.
2
+
3
+ Example
4
+ -------
5
+ >>> import dqscore as dq
6
+ >>> schema = dq.Schema("customers")
7
+ >>> schema.column("id").not_null().unique()
8
+ >>> schema.column("age").in_range(0, 120)
9
+ >>> schema.column("email").matches(r"^[^@]+@[^@]+\\.[^@]+$")
10
+ >>> schema.no_duplicate_rows()
11
+ >>> result = schema.validate(df) # doctest: +SKIP
12
+ >>> print(result.summary()) # doctest: +SKIP
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
17
+
18
+ import pandas as pd
19
+
20
+ from . import checks
21
+ from .report import CheckResult, ValidationResult
22
+
23
+ __all__ = ["Schema", "ColumnSchema"]
24
+
25
+ # A registered check: display name, function(series)->mask, params for the report
26
+ _ColumnCheck = Tuple[str, Callable[[pd.Series], pd.Series], Dict[str, Any]]
27
+ _FrameCheck = Tuple[str, Callable[[pd.DataFrame], pd.Series], Dict[str, Any]]
28
+
29
+
30
class ColumnSchema:
    """The expectations declared for one column; every method chains."""

    def __init__(self, name: str, parent: "Schema") -> None:
        self.name = name
        self._parent = parent
        # Registered checks as (display name, mask function, report params).
        self._checks: List[_ColumnCheck] = []
        # A required column that is absent from the frame is reported as a
        # failure during validation.
        self.required = True

    def _add(
        self, name: str, fn: Callable[[pd.Series], pd.Series], **params: Any
    ) -> "ColumnSchema":
        """Register one check and return ``self`` so calls can be chained."""
        entry = (name, fn, params)
        self._checks.append(entry)
        return self

    # -- expectations -------------------------------------------------------
    def not_null(self) -> "ColumnSchema":
        """Expect no missing values."""
        return self._add("not_null", checks.not_null)

    def unique(self) -> "ColumnSchema":
        """Expect no repeated (non-null) values."""
        return self._add("unique", checks.unique)

    def in_range(
        self,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        inclusive: bool = True,
    ) -> "ColumnSchema":
        """Expect numeric values within the given bounds."""

        def _run(column: pd.Series) -> pd.Series:
            return checks.in_range(column, min_value, max_value, inclusive)

        return self._add(
            "in_range",
            _run,
            min_value=min_value,
            max_value=max_value,
            inclusive=inclusive,
        )

    def in_set(self, allowed: Iterable[Any]) -> "ColumnSchema":
        """Expect every value to be one of ``allowed``."""
        # Materialise once: the iterable may be single-use and the same list
        # is both checked against and reported.
        allowed = list(allowed)

        def _run(column: pd.Series) -> pd.Series:
            return checks.in_set(column, allowed)

        return self._add("in_set", _run, allowed=allowed)

    def matches(self, pattern: str, full_match: bool = False) -> "ColumnSchema":
        """Expect every value to match the regular expression ``pattern``."""

        def _run(column: pd.Series) -> pd.Series:
            return checks.matches(column, pattern, full_match)

        return self._add("matches", _run, pattern=pattern, full_match=full_match)

    def is_numeric(self) -> "ColumnSchema":
        """Expect every value to be parseable as a number."""
        return self._add("is_numeric", checks.is_numeric)

    def is_integer(self) -> "ColumnSchema":
        """Expect every value to be a whole number."""
        return self._add("is_integer", checks.is_integer)

    def is_datetime(self, fmt: Optional[str] = None) -> "ColumnSchema":
        """Expect every value to be parseable as a date/time."""

        def _run(column: pd.Series) -> pd.Series:
            return checks.is_datetime(column, fmt)

        return self._add("is_datetime", _run, fmt=fmt)

    def string_length(
        self, min_len: Optional[int] = None, max_len: Optional[int] = None
    ) -> "ColumnSchema":
        """Expect the text length of every value to be within the bounds."""

        def _run(column: pd.Series) -> pd.Series:
            return checks.string_length(column, min_len, max_len)

        return self._add("string_length", _run, min_len=min_len, max_len=max_len)

    def custom(
        self, fn: Callable[[pd.Series], pd.Series], name: str = "custom"
    ) -> "ColumnSchema":
        """Register a user function returning a mask (``True`` == failing)."""
        return self._add(name, fn)

    # -- convenience to keep chaining onto the schema ----------------------
    def column(self, name: str) -> "ColumnSchema":
        """Switch to another column of the owning schema."""
        owner = self._parent
        return owner.column(name)
104
+
105
+
106
class Schema:
    """A named set of column-level and frame-level expectations."""

    def __init__(self, name: str = "schema") -> None:
        self.name = name
        # Column label -> the checks declared for that column.
        self._columns: Dict[str, ColumnSchema] = {}
        # Checks that look at the whole frame rather than one column.
        self._frame_checks: List[_FrameCheck] = []

    def column(self, name: str) -> ColumnSchema:
        """Return the schema for column ``name``, creating it on first use."""
        try:
            return self._columns[name]
        except KeyError:
            created = ColumnSchema(name, self)
            self._columns[name] = created
            return created

    def no_duplicate_rows(
        self, subset: Optional[Iterable[str]] = None
    ) -> "Schema":
        """Expect no duplicated rows, optionally comparing only ``subset``."""
        # Materialise once so a single-use iterable survives repeated runs.
        subset = list(subset) if subset is not None else None

        def _run(frame: pd.DataFrame) -> pd.Series:
            return checks.no_duplicate_rows(frame, subset)

        self._frame_checks.append(("no_duplicate_rows", _run, {"subset": subset}))
        return self

    def validate(self, df: pd.DataFrame) -> ValidationResult:
        """Run every declared check against ``df`` and collect the outcomes.

        A declared column that is absent from ``df`` yields one failing
        ``column_exists`` result when the column is required; its own checks
        are not run. Each result keeps at most ten failing index labels and,
        for column checks, at most ten sample values.

        Raises
        ------
        TypeError
            If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError("validate() expects a pandas DataFrame")

        row_count = len(df)
        collected: List[CheckResult] = []

        for label, spec in self._columns.items():
            if label in df.columns:
                data = df[label]
                for check_name, check_fn, check_params in spec._checks:
                    failing = checks._as_bool_mask(check_fn(data), data.index)
                    failing_count = int(failing.sum())
                    first_labels = list(df.index[failing][:10])
                    collected.append(
                        CheckResult(
                            check=check_name,
                            column=label,
                            passed=failing_count == 0,
                            n_failing=failing_count,
                            n_total=row_count,
                            params=check_params,
                            failing_index=first_labels,
                            sample_values=list(data[failing].head(10)),
                        )
                    )
            elif spec.required:
                # Every row counts as failing when the column is not there.
                collected.append(
                    CheckResult(
                        check="column_exists",
                        column=label,
                        passed=False,
                        n_failing=row_count,
                        n_total=row_count,
                        message=f"Column '{label}' is missing.",
                    )
                )

        for check_name, check_fn, check_params in self._frame_checks:
            failing = checks._as_bool_mask(check_fn(df), df.index)
            failing_count = int(failing.sum())
            collected.append(
                CheckResult(
                    check=check_name,
                    column="<frame>",
                    passed=failing_count == 0,
                    n_failing=failing_count,
                    n_total=row_count,
                    params=check_params,
                    failing_index=list(df.index[failing][:10]),
                )
            )

        return ValidationResult(
            results=collected, n_rows=row_count, schema_name=self.name
        )
@@ -0,0 +1,184 @@
1
+ Metadata-Version: 2.4
2
+ Name: dqscore
3
+ Version: 0.1.0
4
+ Summary: A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan.
5
+ Author-email: YOUR NAME <you@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/dgvj-work/dqscore
8
+ Project-URL: Repository, https://github.com/dgvj-work/dqscore
9
+ Project-URL: Issues, https://github.com/dgvj-work/dqscore/issues
10
+ Keywords: data-quality,pandas,validation,data-profiling,etl,dataframe
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Software Development :: Quality Assurance
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: pandas>=1.3
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: pytest-cov; extra == "dev"
29
+ Dynamic: license-file
30
+
31
+ # dqscore
32
+
33
+ > A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
34
+ > expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
35
+ > no config files required.
36
+
37
+ [![CI](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml/badge.svg)](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
38
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
39
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
40
+
41
+ `dqscore` helps you catch the boring-but-costly data problems — nulls where there
42
+ shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
43
+ they reach a model, a dashboard, or a stakeholder.
44
+
45
+ ---
46
+ ## Why this exists?
47
+ Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
48
+ The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
49
+ But there's a gap in shape. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — is this data OK? — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
50
+ dqscore is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so `dqscore scan data.csv` drops straight into a CI pipeline or a pre-commit hook with no glue code.
51
+ It's not a replacement for Great Expectations or pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
52
+ The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
53
+
54
+ ---
55
+
56
+ ## Why dqscore?
57
+
58
+ - **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
59
+ - **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
60
+ - **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
61
+ - **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
62
+ so it drops straight into a pipeline or pre-commit hook.
63
+ - **One dependency:** pandas.
64
+
65
+ ---
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install dqscore
71
+ ```
72
+
73
+ Or install the latest from source:
74
+
75
+ ```bash
76
+ git clone https://github.com/dgvj-work/dqscore.git
77
+ cd dqscore
78
+ pip install -e ".[dev]"
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Quick start
84
+
85
+ ### 1. Profile a DataFrame
86
+
87
+ ```python
88
+ import pandas as pd
89
+ import dqscore as dq
90
+
91
+ df = pd.read_csv("customers.csv")
92
+ profile = dq.profile(df)
93
+
94
+ print(profile.to_markdown()) # per-column stats
95
+ profile.to_html("profile.html")
96
+ ```
97
+
98
+ ### 2. Validate against a schema
99
+
100
+ ```python
101
+ schema = dq.Schema("customers")
102
+ schema.column("id").not_null().unique()
103
+ schema.column("age").in_range(0, 120)
104
+ schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
105
+ schema.column("country").in_set(["US", "CA", "MX"])
106
+ schema.no_duplicate_rows()
107
+
108
+ result = schema.validate(df)
109
+
110
+ print(result.summary()) # human-readable report
111
+ print("Quality score:", result.score)
112
+ result.to_html("dq_report.html")
113
+
114
+ if not result.passed:
115
+ raise SystemExit("Data quality checks failed")
116
+ ```
117
+
118
+ ### 3. Zero-config scan
119
+
120
+ When you just want a quick read on a new file:
121
+
122
+ ```python
123
+ result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
124
+ print(result.summary())
125
+ ```
126
+
127
+ ---
128
+
129
+ ## Command line
130
+
131
+ ```bash
132
+ # Profile every column
133
+ dqscore profile data.csv --html profile.html
134
+
135
+ # Quick quality scan (exit code 1 if it fails — great for CI)
136
+ dqscore scan data.csv --json report.json
137
+ dqscore scan data.csv --max-null-pct 5
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Available checks
143
+
144
+ | Method | Fails when… |
145
+ | ----------------------------------- | -------------------------------------------- |
146
+ | `not_null()` | value is null / NaN / NaT |
147
+ | `unique()` | a non-null value occurs more than once |
148
+ | `in_range(min, max, inclusive)` | numeric value is outside the bounds |
149
+ | `in_set([...])` | value is not one of the allowed values |
150
+ | `matches(pattern, full_match)` | string does not match the regex |
151
+ | `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
152
+ | `is_datetime(fmt)` | value can't be parsed as a date/time |
153
+ | `string_length(min_len, max_len)` | string length is out of bounds |
154
+ | `custom(fn, name)` | your function returns `True` for a row |
155
+ | `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
156
+
157
+ Checks chain on a column and most let nulls pass, so `not_null()` stays the single
158
+ source of truth for missing values:
159
+
160
+ ```python
161
+ schema.column("score").not_null().is_numeric().in_range(0, 100)
162
+ ```
163
+
164
+ ---
165
+
166
+ ## Reports & scoring
167
+
168
+ A `ValidationResult` gives you:
169
+
170
+ - `result.passed` — `True`/`False`
171
+ - `result.score` — percentage of checks passed (0–100)
172
+ - `result.failures` — only the failing checks (with sample failing values & indices)
173
+ - `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
174
+
175
+ ---
176
+
177
+ ## Contributing
178
+
179
+ Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
180
+ Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
181
+
182
+ ## License
183
+
184
+ [MIT](LICENSE)
@@ -0,0 +1,14 @@
1
+ dqscore/__init__.py,sha256=B2m7qEYsRqUXwiECq_tmb3O-QMF-zNymlp9-d45xURk,922
2
+ dqscore/autoscan.py,sha256=69kngyUDK5dxqELPRjSp4WfdxJvVnUmT7KP9zwSy2MA,1810
3
+ dqscore/checks.py,sha256=QVuuWHIxaoaOnSmDh-j4ds-8fxQywMi2-qLhenWqN1g,4398
4
+ dqscore/cli.py,sha256=SpUTZ2YfJ6h9glMYCwalOKqYv7II5why_eKlSH33u0Q,2821
5
+ dqscore/profiling.py,sha256=tABsjgQUYRRAnkCo6mhdH_YpWE31EEYUozSqcfohTvg,4733
6
+ dqscore/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ dqscore/report.py,sha256=nMuJS8ccYFUjP7VIdp01Ci8vdQjboG1z7pVgIGRAchg,6978
8
+ dqscore/validator.py,sha256=Hcxky6g-b9nuXp1oQZG2La9QfhsCXBbhnUVWTDh5pgw,6751
9
+ dqscore-0.1.0.dist-info/licenses/LICENSE,sha256=8vHK8oOyyleh7zi03oOw8La7y31Z3ik7qMnPatOPri4,1073
10
+ dqscore-0.1.0.dist-info/METADATA,sha256=812mfKCi0wl1ENNDICiqeEwhpK9PuHP-AF8ZIHRsJXk,7875
11
+ dqscore-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ dqscore-0.1.0.dist-info/entry_points.txt,sha256=ECYp7oL4KZz5nDtBCS78cYzY9fnY5gWL_MQRwuF8iFM,45
13
+ dqscore-0.1.0.dist-info/top_level.txt,sha256=8_eTWrrK-QYmqumHL4p-2p5hW48SHxyx8nLyNv-byaE,8
14
+ dqscore-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dqscore = dqscore.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Digvijay Waghela
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ dqscore