dqscore 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dqscore-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Digvijay Waghela
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
dqscore-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,184 @@
1
+ Metadata-Version: 2.4
2
+ Name: dqscore
3
+ Version: 0.1.0
4
+ Summary: A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan.
5
+ Author-email: YOUR NAME <you@example.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/dgvj-work/dqscore
8
+ Project-URL: Repository, https://github.com/dgvj-work/dqscore
9
+ Project-URL: Issues, https://github.com/dgvj-work/dqscore/issues
10
+ Keywords: data-quality,pandas,validation,data-profiling,etl,dataframe
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Software Development :: Quality Assurance
22
+ Requires-Python: >=3.8
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: pandas>=1.3
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=7.0; extra == "dev"
28
+ Requires-Dist: pytest-cov; extra == "dev"
29
+ Dynamic: license-file
30
+
31
+ # dqscore
32
+
33
+ > A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
34
+ > expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
35
+ > no config files required.
36
+
37
+ [![CI](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml/badge.svg)](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
38
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
39
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
40
+
41
+ `dqscore` helps you catch the boring-but-costly data problems — nulls where there
42
+ shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
43
+ they reach a model, a dashboard, or a stakeholder.
44
+
45
+ ---
46
+ ## Why this exists
47
+ Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
48
+ The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
49
+ But there's a gap in shape. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — is this data OK? — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
50
+ `dqscore` is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so `dqscore scan data.csv` drops straight into a CI pipeline or a pre-commit hook with no glue code.
51
+ It's not a replacement for Great Expectations or pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
52
+ The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
53
+
54
+ ---
55
+
56
+ ## Why dqscore?
57
+
58
+ - **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
59
+ - **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
60
+ - **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
61
+ - **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
62
+ so it drops straight into a pipeline or pre-commit hook.
63
+ - **One dependency:** pandas.
64
+
65
+ ---
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install dqscore
71
+ ```
72
+
73
+ Or install the latest from source:
74
+
75
+ ```bash
76
+ git clone https://github.com/dgvj-work/dqscore.git
77
+ cd dqscore
78
+ pip install -e ".[dev]"
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Quick start
84
+
85
+ ### 1. Profile a DataFrame
86
+
87
+ ```python
88
+ import pandas as pd
89
+ import dqscore as dq
90
+
91
+ df = pd.read_csv("customers.csv")
92
+ profile = dq.profile(df)
93
+
94
+ print(profile.to_markdown()) # per-column stats
95
+ profile.to_html("profile.html")
96
+ ```
97
+
98
+ ### 2. Validate against a schema
99
+
100
+ ```python
101
+ schema = dq.Schema("customers")
102
+ schema.column("id").not_null().unique()
103
+ schema.column("age").in_range(0, 120)
104
+ schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
105
+ schema.column("country").in_set(["US", "CA", "MX"])
106
+ schema.no_duplicate_rows()
107
+
108
+ result = schema.validate(df)
109
+
110
+ print(result.summary()) # human-readable report
111
+ print("Quality score:", result.score)
112
+ result.to_html("dq_report.html")
113
+
114
+ if not result.passed:
115
+ raise SystemExit("Data quality checks failed")
116
+ ```
117
+
118
+ ### 3. Zero-config scan
119
+
120
+ When you just want a quick read on a new file:
121
+
122
+ ```python
123
+ result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
124
+ print(result.summary())
125
+ ```
126
+
127
+ ---
128
+
129
+ ## Command line
130
+
131
+ ```bash
132
+ # Profile every column
133
+ dqscore profile data.csv --html profile.html
134
+
135
+ # Quick quality scan (exit code 1 if it fails — great for CI)
136
+ dqscore scan data.csv --json report.json
137
+ dqscore scan data.csv --max-null-pct 5
138
+ ```
139
+
140
+ ---
141
+
142
+ ## Available checks
143
+
144
+ | Method | Fails when… |
145
+ | ----------------------------------- | -------------------------------------------- |
146
+ | `not_null()` | value is null / NaN / NaT |
147
+ | `unique()` | a non-null value occurs more than once |
148
+ | `in_range(min, max, inclusive)` | numeric value is outside the bounds |
149
+ | `in_set([...])` | value is not one of the allowed values |
150
+ | `matches(pattern, full_match)` | string does not match the regex |
151
+ | `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
152
+ | `is_datetime(fmt)` | value can't be parsed as a date/time |
153
+ | `string_length(min_len, max_len)` | string length is out of bounds |
154
+ | `custom(fn, name)` | your function returns `True` for a row |
155
+ | `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
156
+
157
+ Checks chain on a column and most let nulls pass, so `not_null()` stays the single
158
+ source of truth for missing values:
159
+
160
+ ```python
161
+ schema.column("score").not_null().is_numeric().in_range(0, 100)
162
+ ```
163
+
164
+ ---
165
+
166
+ ## Reports & scoring
167
+
168
+ A `ValidationResult` gives you:
169
+
170
+ - `result.passed` — `True`/`False`
171
+ - `result.score` — percentage of checks passed (0–100)
172
+ - `result.failures` — only the failing checks (with sample failing values & indices)
173
+ - `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
174
+
175
+ ---
176
+
177
+ ## Contributing
178
+
179
+ Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
180
+ Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
181
+
182
+ ## License
183
+
184
+ [MIT](LICENSE)
@@ -0,0 +1,154 @@
1
+ # dqscore
2
+
3
+ > A lightweight **data quality toolkit for pandas** — profile any DataFrame, declare
4
+ > expectations with a fluent schema, or run a zero-config scan. No heavy dependencies,
5
+ > no config files required.
6
+
7
+ [![CI](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml/badge.svg)](https://github.com/dgvj-work/dqscore/actions/workflows/ci.yml)
8
+ [![Python](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/)
9
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
10
+
11
+ `dqscore` helps you catch the boring-but-costly data problems — nulls where there
12
+ shouldn't be any, duplicate keys, out-of-range values, malformed strings — before
13
+ they reach a model, a dashboard, or a stakeholder.
14
+
15
+ ---
16
+ ## Why this exists
17
+ Data quality issues are the silent killers of analytics and ML work. A null in the wrong column, a duplicate primary key, a value outside its expected range — these don't crash your pipeline. They quietly corrupt your output, and you find out three weeks later in a stakeholder meeting.
18
+ The Python ecosystem already has excellent tools for this. Great Expectations is comprehensive and battle-tested. Pandera offers powerful schema-based validation. ydata-profiling produces rich exploratory reports. If you're building a long-lived production data platform, those are the right answers.
19
+ But there's a gap in shape. When an analyst gets a fresh CSV and wants a fast read on whether it's trustworthy, the existing tools ask for a lot upfront — a schema, a config, a project structure, sometimes a framework integration. The lightest possible question — is this data OK? — doesn't have a one-line answer in any of them. And once you do set up checks, getting a single number you can put on a dashboard, or a non-zero exit code you can wire into CI, often needs custom code on top.
20
+ `dqscore` is built for that middle ground. It has one dependency (pandas) and three things to learn: profile a DataFrame, declare a schema with a fluent API, or run a zero-config scan that infers sensible defaults. Every validation produces a 0–100 quality score and a report that exports to HTML, Markdown, or JSON. The CLI returns exit code 1 on failure, so `dqscore scan data.csv` drops straight into a CI pipeline or a pre-commit hook with no glue code.
21
+ It's not a replacement for Great Expectations or pandera. It's the tool you reach for at the start of a project, or when reviewing a new dataset, or when you want a simple quality gate in CI without standing up a whole framework. That's the gap, and I think it's a useful one to fill — especially for individuals, smaller teams, and educators where the ceremony of heavier tools is the actual barrier to checking data at all.
22
+ The package is MIT-licensed and feedback is welcome. If a check is missing, a report format would be useful, or the auto-scan heuristics could be smarter for your data, open an issue.
23
+
24
+ ---
25
+
26
+ ## Why dqscore?
27
+
28
+ - **Tiny surface area.** Three things to learn: `profile`, `Schema`, `auto_scan`.
29
+ - **Readable reports.** Every result exports to dict, JSON, Markdown, or styled HTML.
30
+ - **Scoreable.** Each validation produces a 0–100 quality score for dashboards/CI.
31
+ - **CLI included.** `dqscore scan data.csv` returns a non-zero exit code on failure,
32
+ so it drops straight into a pipeline or pre-commit hook.
33
+ - **One dependency:** pandas.
34
+
35
+ ---
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install dqscore
41
+ ```
42
+
43
+ Or install the latest from source:
44
+
45
+ ```bash
46
+ git clone https://github.com/dgvj-work/dqscore.git
47
+ cd dqscore
48
+ pip install -e ".[dev]"
49
+ ```
50
+
51
+ ---
52
+
53
+ ## Quick start
54
+
55
+ ### 1. Profile a DataFrame
56
+
57
+ ```python
58
+ import pandas as pd
59
+ import dqscore as dq
60
+
61
+ df = pd.read_csv("customers.csv")
62
+ profile = dq.profile(df)
63
+
64
+ print(profile.to_markdown()) # per-column stats
65
+ profile.to_html("profile.html")
66
+ ```
67
+
68
+ ### 2. Validate against a schema
69
+
70
+ ```python
71
+ schema = dq.Schema("customers")
72
+ schema.column("id").not_null().unique()
73
+ schema.column("age").in_range(0, 120)
74
+ schema.column("email").matches(r"^[^@]+@[^@]+\.[^@]+$")
75
+ schema.column("country").in_set(["US", "CA", "MX"])
76
+ schema.no_duplicate_rows()
77
+
78
+ result = schema.validate(df)
79
+
80
+ print(result.summary()) # human-readable report
81
+ print("Quality score:", result.score)
82
+ result.to_html("dq_report.html")
83
+
84
+ if not result.passed:
85
+ raise SystemExit("Data quality checks failed")
86
+ ```
87
+
88
+ ### 3. Zero-config scan
89
+
90
+ When you just want a quick read on a new file:
91
+
92
+ ```python
93
+ result = dq.auto_scan(df) # checks nulls, duplicate keys, duplicate rows
94
+ print(result.summary())
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Command line
100
+
101
+ ```bash
102
+ # Profile every column
103
+ dqscore profile data.csv --html profile.html
104
+
105
+ # Quick quality scan (exit code 1 if it fails — great for CI)
106
+ dqscore scan data.csv --json report.json
107
+ dqscore scan data.csv --max-null-pct 5
108
+ ```
109
+
110
+ ---
111
+
112
+ ## Available checks
113
+
114
+ | Method | Fails when… |
115
+ | ----------------------------------- | -------------------------------------------- |
116
+ | `not_null()` | value is null / NaN / NaT |
117
+ | `unique()` | a non-null value occurs more than once |
118
+ | `in_range(min, max, inclusive)` | numeric value is outside the bounds |
119
+ | `in_set([...])` | value is not one of the allowed values |
120
+ | `matches(pattern, full_match)` | string does not match the regex |
121
+ | `is_numeric()` / `is_integer()` | value can't be parsed as a number / integer |
122
+ | `is_datetime(fmt)` | value can't be parsed as a date/time |
123
+ | `string_length(min_len, max_len)` | string length is out of bounds |
124
+ | `custom(fn, name)` | your function returns `True` for a row |
125
+ | `Schema.no_duplicate_rows(subset)` | rows are exact duplicates |
126
+
127
+ Checks chain on a column and most let nulls pass, so `not_null()` stays the single
128
+ source of truth for missing values:
129
+
130
+ ```python
131
+ schema.column("score").not_null().is_numeric().in_range(0, 100)
132
+ ```
133
+
134
+ ---
135
+
136
+ ## Reports & scoring
137
+
138
+ A `ValidationResult` gives you:
139
+
140
+ - `result.passed` — `True`/`False`
141
+ - `result.score` — percentage of checks passed (0–100)
142
+ - `result.failures` — only the failing checks (with sample failing values & indices)
143
+ - `result.summary()` / `to_markdown()` / `to_json()` / `to_html(path)`
144
+
145
+ ---
146
+
147
+ ## Contributing
148
+
149
+ Contributions and feedback are very welcome — see [CONTRIBUTING.md](CONTRIBUTING.md).
150
+ Found a bug or want a new check? [Open an issue](https://github.com/dgvj-work/dqscore/issues).
151
+
152
+ ## License
153
+
154
+ [MIT](LICENSE)
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dqscore"
7
+ version = "0.1.0"
8
+ description = "A lightweight data quality toolkit for pandas: profiling, validation schemas, and a zero-config scan."
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "YOUR NAME", email = "you@example.com" }]
13
+ keywords = ["data-quality", "pandas", "validation", "data-profiling", "etl", "dataframe"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.8",
20
+ "Programming Language :: Python :: 3.9",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Scientific/Engineering",
25
+ "Topic :: Software Development :: Quality Assurance",
26
+ ]
27
+ dependencies = ["pandas>=1.3"]
28
+
29
+ [project.urls]
30
+ Homepage = "https://github.com/dgvj-work/dqscore"
31
+ Repository = "https://github.com/dgvj-work/dqscore"
32
+ Issues = "https://github.com/dgvj-work/dqscore/issues"
33
+
34
+ [project.optional-dependencies]
35
+ dev = ["pytest>=7.0", "pytest-cov"]
36
+
37
+ [project.scripts]
38
+ dqscore = "dqscore.cli:main"
39
+
40
+ [tool.setuptools.packages.find]
41
+ where = ["src"]
42
+
43
+ [tool.setuptools.package-data]
44
+ dqscore = ["py.typed"]
45
+
46
+ [tool.pytest.ini_options]
47
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,40 @@
1
+ """dqscore — a lightweight data quality toolkit for pandas.
2
+
3
+ Quick start
4
+ -----------
5
+ >>> import pandas as pd
6
+ >>> import dqscore as dq
7
+ >>> df = pd.DataFrame({"id": [1, 2, 2], "age": [30, -1, 41]})
8
+ >>> result = dq.auto_scan(df)
9
+ >>> result.passed
10
+ False
11
+
12
+ Declare expectations explicitly with a :class:`~dqscore.Schema`::
13
+
14
+ schema = dq.Schema("people")
15
+ schema.column("id").not_null().unique()
16
+ schema.column("age").in_range(0, 120)
17
+ report = schema.validate(df)
18
+ print(report.summary())
19
+ """
20
+ from __future__ import annotations
21
+
22
+ from . import checks
23
+ from .autoscan import auto_scan
24
+ from .profiling import Profile, profile
25
+ from .report import CheckResult, ValidationResult
26
+ from .validator import ColumnSchema, Schema
27
+
28
+ __version__ = "0.1.0"
29
+
30
+ __all__ = [
31
+ "Schema",
32
+ "ColumnSchema",
33
+ "profile",
34
+ "Profile",
35
+ "auto_scan",
36
+ "ValidationResult",
37
+ "CheckResult",
38
+ "checks",
39
+ "__version__",
40
+ ]
@@ -0,0 +1,60 @@
1
+ """Zero-config quality scan: infer sensible default checks for any DataFrame."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Optional
5
+
6
+ import pandas as pd
7
+
8
+ from .report import ValidationResult
9
+ from .validator import Schema
10
+
11
+ __all__ = ["auto_scan"]
12
+
13
+
14
def _looks_like_id(name: str) -> bool:
    """Return True when a column label looks like an identifier column.

    Recognised shapes are the bare label ``id`` (any case), labels ending in
    ``_id`` (any case, e.g. ``user_id`` / ``ORDER_ID``) and camelCase labels
    ending in ``Id`` / ``ID`` (e.g. ``userId`` / ``customerID``).

    A plain ``endswith("id")`` test is deliberately avoided: it would also
    match ordinary words such as ``paid``, ``valid`` or ``grid`` and make the
    caller demand uniqueness from columns that are not keys.

    Parameters
    ----------
    name:
        Column label; non-string labels are converted with ``str()`` first.

    Returns
    -------
    bool
        ``True`` if the label matches one of the identifier shapes above.
    """
    label = str(name)
    lowered = label.lower()
    if lowered == "id" or lowered.endswith("_id"):
        return True
    # camelCase suffix: "Id" / "ID" directly preceded by a lowercase letter,
    # which separates "userId" from all-caps words such as "PAID".
    return len(label) > 2 and label[-2:] in ("Id", "ID") and label[-3].islower()
17
+
18
+
19
def auto_scan(
    df: pd.DataFrame,
    max_null_pct: float = 0.0,
    name: str = "auto_scan",
) -> ValidationResult:
    """Scan ``df`` with inferred default checks; no hand-written schema needed.

    A :class:`Schema` called ``name`` is assembled from three rules and then
    validated against ``df``:

    * a column whose share of nulls is above ``max_null_pct`` percent gets a
      ``not_null`` check;
    * a column whose label satisfies ``_looks_like_id`` gets a ``unique``
      check;
    * the frame as a whole gets a ``no_duplicate_rows`` check.

    Parameters
    ----------
    df:
        The DataFrame to scan.
    max_null_pct:
        Per-column null tolerance expressed as a percentage (``5`` means 5%).
        The default ``0.0`` tolerates no nulls at all.
    name:
        Name given to the schema that is built for the scan.

    Returns
    -------
    ValidationResult
        Whatever ``Schema.validate`` returns for the assembled schema.

    Raises
    ------
    TypeError
        If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("auto_scan() expects a pandas DataFrame")

    has_rows = len(df) > 0
    allowed_null_fraction = max_null_pct / 100.0
    schema = Schema(name)

    for column_label in df.columns:
        column = df[column_label]

        # The mean of the boolean null mask is the fraction of nulls; a frame
        # without rows is treated as having none.
        if has_rows:
            null_fraction = column.isna().mean()
        else:
            null_fraction = 0.0

        if null_fraction > allowed_null_fraction:
            # Too many missing values: report them through not_null.
            schema.column(column_label).not_null()

        if _looks_like_id(column_label):
            # Identifier-like columns must not repeat a key.
            schema.column(column_label).unique()

    schema.no_duplicate_rows()
    return schema.validate(df)
@@ -0,0 +1,127 @@
1
+ """Low-level data quality checks.
2
+
3
+ Every check takes a :class:`pandas.Series` (or DataFrame, for frame-level
4
+ checks) and returns a boolean mask aligned to the input where ``True`` marks a
5
+ *failing* row. Null handling is deliberate: most checks let nulls pass so that
6
+ ``not_null`` is the single source of truth for missing values. Combine checks to
7
+ express richer expectations.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from typing import Any, Iterable, Optional
13
+
14
+ import pandas as pd
15
+
16
+ __all__ = [
17
+ "not_null",
18
+ "unique",
19
+ "in_range",
20
+ "in_set",
21
+ "matches",
22
+ "is_numeric",
23
+ "is_integer",
24
+ "is_datetime",
25
+ "string_length",
26
+ "no_duplicate_rows",
27
+ ]
28
+
29
+
30
def _as_bool_mask(mask: pd.Series, index: pd.Index) -> pd.Series:
    """Return ``mask`` as a plain ``bool`` Series laid out on ``index``.

    Entries that are missing once the mask is placed on ``index`` are treated
    as ``False`` (i.e. "not failing").
    """
    aligned = pd.Series(mask, index=index)
    without_gaps = aligned.fillna(False)
    return without_gaps.astype(bool)
33
+
34
+
35
def not_null(series: pd.Series) -> pd.Series:
    """Flag missing values: the mask is ``True`` wherever ``series`` is null.

    "Null" is whatever ``Series.isna`` reports (``None`` / ``NaN`` / ``NaT``).
    """
    missing = series.isna()
    return missing
38
+
39
+
40
def unique(series: pd.Series) -> pd.Series:
    """Flag every occurrence of a value that is repeated in ``series``.

    All copies of a repeated value are marked, not only the later ones.
    Nulls are never flagged here; missing values are left to ``not_null``.
    """
    has_value = series.notna()
    repeated = series.duplicated(keep=False)
    return _as_bool_mask(has_value & repeated, series.index)
44
+
45
+
46
def in_range(
    series: pd.Series,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    inclusive: bool = True,
) -> pd.Series:
    """Flag values that fall outside the requested numeric interval.

    Values are first converted with ``pd.to_numeric(..., errors="coerce")``.
    A non-null value that cannot be converted is flagged as well. Nulls pass
    (use ``not_null`` to reject them).

    Parameters
    ----------
    series:
        Values to check.
    min_value, max_value:
        Lower / upper bound. ``None`` leaves that side unbounded.
    inclusive:
        When ``True`` a value equal to a bound passes; when ``False`` a value
        equal to a bound fails.
    """
    as_numbers = pd.to_numeric(series, errors="coerce")
    out_of_bounds = pd.Series(False, index=series.index)

    if min_value is not None:
        if inclusive:
            out_of_bounds |= as_numbers < min_value
        else:
            out_of_bounds |= as_numbers <= min_value

    if max_value is not None:
        if inclusive:
            out_of_bounds |= as_numbers > max_value
        else:
            out_of_bounds |= as_numbers >= max_value

    # Present in the input but null after coercion: not a number at all.
    out_of_bounds |= as_numbers.isna() & series.notna()
    return _as_bool_mask(out_of_bounds, series.index)
65
+
66
+
67
def in_set(series: pd.Series, allowed: Iterable[Any]) -> pd.Series:
    """Fail rows whose (non-null) value is not one of ``allowed``.

    Parameters
    ----------
    series:
        Values to check.
    allowed:
        The permitted values. A bare ``str`` / ``bytes`` is taken as one
        permitted value; iterating it would silently turn ``"US"`` into the
        set ``{"U", "S"}``. Any other iterable is materialised as a set, so
        its elements must be hashable.

    Returns
    -------
    pd.Series
        Boolean mask on ``series.index``; ``True`` marks a failing row.
        Nulls pass (use ``not_null`` to reject them).
    """
    if isinstance(allowed, (str, bytes)):
        allowed_set = {allowed}
    else:
        allowed_set = set(allowed)
    # ``isin`` and ``notna`` both return plain boolean masks on
    # ``series.index``, so the result needs no further clean-up.
    return ~series.isin(allowed_set) & series.notna()
72
+
73
+
74
def matches(series: pd.Series, pattern: str, full_match: bool = False) -> pd.Series:
    """Flag non-null values whose text does not satisfy the regex ``pattern``.

    Each non-null value is converted with ``str()`` before matching, so
    non-string values are tested on their string form. Nulls always pass.

    Parameters
    ----------
    series:
        Values to check.
    pattern:
        Regular expression, compiled once with ``re.compile``.
    full_match:
        ``False`` accepts a match anywhere in the text (``search``); ``True``
        requires the whole text to match (``fullmatch``).
    """
    regex = re.compile(pattern)
    if full_match:
        probe = regex.fullmatch
    else:
        probe = regex.search

    def _is_failure(cell: Any) -> bool:
        # The short-circuit keeps nulls from ever reaching the regex.
        return (not pd.isna(cell)) and probe(str(cell)) is None

    return _as_bool_mask(series.map(_is_failure), series.index)
85
+
86
+
87
def is_numeric(series: pd.Series) -> pd.Series:
    """Flag non-null values that ``pd.to_numeric`` cannot convert.

    Nulls pass; only values that are present but become null under
    ``errors="coerce"`` are marked as failing.
    """
    as_numbers = pd.to_numeric(series, errors="coerce")
    unparseable = series.notna() & as_numbers.isna()
    return _as_bool_mask(unparseable, series.index)
91
+
92
+
93
def is_integer(series: pd.Series) -> pd.Series:
    """Flag non-null values that are not whole numbers.

    A value fails when it is present but cannot be converted by
    ``pd.to_numeric``, or when it converts to a number with a non-zero
    remainder modulo 1. Nulls pass.
    """
    as_numbers = pd.to_numeric(series, errors="coerce")
    parsed = as_numbers.notna()

    # Present in the input yet null after coercion: not numeric at all.
    unparseable = series.notna() & ~parsed
    # Numeric, but with a fractional part.
    has_fraction = (as_numbers % 1 != 0) & parsed

    return _as_bool_mask(unparseable | has_fraction, series.index)
99
+
100
+
101
def is_datetime(series: pd.Series, fmt: Optional[str] = None) -> pd.Series:
    """Flag non-null values that ``pd.to_datetime`` cannot convert.

    Parameters
    ----------
    series:
        Values to check.
    fmt:
        Forwarded to ``pd.to_datetime`` as ``format``; ``None`` leaves format
        handling to pandas.

    Nulls pass; only values that are present but become null under
    ``errors="coerce"`` are marked as failing.
    """
    as_datetimes = pd.to_datetime(series, errors="coerce", format=fmt)
    unparseable = series.notna() & as_datetimes.isna()
    return _as_bool_mask(unparseable, series.index)
105
+
106
+
107
def string_length(
    series: pd.Series,
    min_len: Optional[int] = None,
    max_len: Optional[int] = None,
) -> pd.Series:
    """Fail non-null values whose string length is outside the bounds.

    Every value is measured on its ``str`` form, so non-string values are
    checked on their textual representation. Nulls pass (use ``not_null`` to
    reject them).

    The mask is built with whole-Series operations that share
    ``series.index``. No label-based assignment is used, so a Series whose
    index contains duplicate labels is handled row by row.

    Parameters
    ----------
    series:
        Values to check.
    min_len:
        Smallest accepted length; ``None`` means no lower bound.
    max_len:
        Largest accepted length; ``None`` means no upper bound.

    Returns
    -------
    pd.Series
        Boolean mask on ``series.index``; ``True`` marks a failing row.
    """
    lengths = series.astype(str).str.len()
    out_of_bounds = pd.Series(False, index=series.index)

    if min_len is not None:
        out_of_bounds = out_of_bounds | (lengths < min_len)
    if max_len is not None:
        out_of_bounds = out_of_bounds | (lengths > max_len)

    # Null rows were measured on a placeholder text (or not at all); masking
    # with notna() guarantees they are never reported as failures.
    return (out_of_bounds & series.notna()).astype(bool)
120
+
121
+
122
def no_duplicate_rows(
    df: pd.DataFrame, subset: Optional[Iterable[str]] = None
) -> pd.Series:
    """Flag every row of ``df`` that has an exact duplicate.

    All copies of a repeated row are marked, not only the later ones.

    Parameters
    ----------
    df:
        Frame to check.
    subset:
        Column labels to compare; ``None`` compares all columns.
    """
    if subset is None:
        compare_on = None
    else:
        compare_on = list(subset)
    repeated = df.duplicated(subset=compare_on, keep=False)
    return _as_bool_mask(repeated, df.index)