aqualisys 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aqualisys/__init__.py +19 -0
- aqualisys/checker.py +98 -0
- aqualisys/checks/__init__.py +3 -0
- aqualisys/checks/base.py +55 -0
- aqualisys/checks/rules.py +130 -0
- aqualisys/cli.py +44 -0
- aqualisys/config.py +112 -0
- aqualisys/logging/__init__.py +3 -0
- aqualisys/logging/base.py +37 -0
- aqualisys/logging/sqlite.py +113 -0
- aqualisys-0.1.0.dist-info/METADATA +59 -0
- aqualisys-0.1.0.dist-info/RECORD +15 -0
- aqualisys-0.1.0.dist-info/WHEEL +4 -0
- aqualisys-0.1.0.dist-info/entry_points.txt +2 -0
- aqualisys-0.1.0.dist-info/licenses/LICENSE +21 -0
aqualisys/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Aqualisys: a Polars-first data quality toolkit.
|
|
3
|
+
|
|
4
|
+
Expose the key classes so downstream users can import from `aqualisys`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .checker import DataQualityChecker, RuleBundle
|
|
8
|
+
from .checks.rules import AcceptedValuesRule, NotNullRule, RelationshipRule, UniqueRule
|
|
9
|
+
from .logging.sqlite import SQLiteRunLogger
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AcceptedValuesRule",
|
|
13
|
+
"DataQualityChecker",
|
|
14
|
+
"NotNullRule",
|
|
15
|
+
"RelationshipRule",
|
|
16
|
+
"RuleBundle",
|
|
17
|
+
"SQLiteRunLogger",
|
|
18
|
+
"UniqueRule",
|
|
19
|
+
]
|
aqualisys/checker.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from collections.abc import Callable, Iterable, Sequence
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from uuid import uuid4
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import polars as pl
|
|
7
|
+
except ModuleNotFoundError:
|
|
8
|
+
# pragma: no cover - optional dependency in some environments
|
|
9
|
+
pl = None # type: ignore
|
|
10
|
+
|
|
11
|
+
from .checks.base import BaseRule, RuleContext, RuleResult, RuleSeverity
|
|
12
|
+
from .logging.base import RunLogger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(slots=True)
class RuleBundle:
    """A named, reusable group of rules produced on demand by a factory.

    The factory is invoked each time ``rules()`` is called, so callers
    always receive freshly constructed rule instances.
    """

    name: str
    description: str
    rule_factory: Callable[[], Sequence[BaseRule]]

    def rules(self) -> list[BaseRule]:
        """Invoke the factory and return its rules as a new list."""
        produced = self.rule_factory()
        return list(produced)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(slots=True)
class ValidationReport:
    """Aggregated outcome of one checker run over a single dataset."""

    run_id: str
    dataset_name: str
    results: list[RuleResult]

    @property
    def passed(self) -> bool:
        """True only when every rule in the run passed."""
        return not any(not result.passed for result in self.results)

    @property
    def failed_rules(self) -> list[RuleResult]:
        """The subset of results whose rules did not pass."""
        failures = [result for result in self.results if not result.passed]
        return failures
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class DataQualityChecker:
    """Coordinates rule execution and logging.

    Rules may be supplied directly and/or via bundles; an optional
    ``RunLogger`` receives start / per-rule / completion events, and
    ``fail_fast`` stops execution at the first failed ERROR-severity rule.
    """

    def __init__(
        self,
        rules: Iterable[BaseRule] | None = None,
        bundles: Iterable[RuleBundle] | None = None,
        logger: RunLogger | None = None,
        fail_fast: bool = False,
    ) -> None:
        collected: list[BaseRule] = list(rules or [])
        if bundles:
            for bundle in bundles:
                collected.extend(bundle.rules())
        self._rules = collected
        self._logger = logger
        self._fail_fast = fail_fast

    @property
    def rules(self) -> list[BaseRule]:
        """A defensive copy of the configured rules."""
        return self._rules.copy()

    def add_rules(self, *rules: BaseRule) -> None:
        """Append additional rules to the execution list."""
        for rule in rules:
            self._rules.append(rule)

    def run(
        self,
        dataframe: "pl.DataFrame",
        dataset_name: str,
        run_id: str | None = None,
    ) -> ValidationReport:
        """Evaluate every configured rule against *dataframe*.

        Returns a ``ValidationReport``; raises ``RuntimeError`` when
        polars is not installed. A ``run_id`` is generated when omitted.
        """
        if pl is None:  # pragma: no cover - guard for environments without polars
            raise RuntimeError("polars is required to run validations")

        if run_id is None:
            run_id = str(uuid4())
        context = RuleContext(dataset_name=dataset_name, run_id=run_id)
        outcomes: list[RuleResult] = []

        if self._logger:
            self._logger.log_run_started(context)

        for rule in self._rules:
            outcome = rule.evaluate(dataframe)
            outcomes.append(outcome)
            # Log the result before any fail-fast break so the failing
            # rule itself is still recorded.
            if self._logger:
                self._logger.log_rule_result(context, outcome)
            stop_now = (
                self._fail_fast
                and not outcome.passed
                and rule.severity is RuleSeverity.ERROR
            )
            if stop_now:
                break

        if self._logger:
            self._logger.log_run_completed(context, outcomes)

        return ValidationReport(
            run_id=run_id,
            dataset_name=dataset_name,
            results=outcomes,
        )
|
aqualisys/checks/base.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from collections.abc import Mapping
from dataclasses import dataclass, field
from datetime import UTC, datetime
from enum import StrEnum
from typing import Any, Protocol, runtime_checkable

try:
    import polars as pl
except ModuleNotFoundError:
    # pragma: no cover - polars is an optional runtime dependency in tests
    pl = None  # type: ignore
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class RuleSeverity(StrEnum):
|
|
15
|
+
ERROR = "error"
|
|
16
|
+
WARN = "warn"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RuleStatus(StrEnum):
|
|
20
|
+
PASSED = "passed"
|
|
21
|
+
FAILED = "failed"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(slots=True)
class RuleResult:
    """Represents the outcome of a single rule.

    ``metrics`` carries optional rule-specific measurements (counts,
    allowed values, etc.) that loggers may serialize.
    """

    rule_name: str
    status: RuleStatus
    message: str
    severity: RuleSeverity
    metrics: Mapping[str, Any] | None = None

    @property
    def passed(self) -> bool:
        """True when the recorded status is PASSED."""
        return self.status is RuleStatus.PASSED
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(slots=True)
|
|
40
|
+
class RuleContext:
|
|
41
|
+
dataset_name: str
|
|
42
|
+
run_id: str
|
|
43
|
+
executed_at: datetime = datetime.now(tz=UTC)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@runtime_checkable
class BaseRule(Protocol):
    """All validation rules must follow this shape.

    Structural (duck-typed) interface: any object exposing these three
    attributes plus ``evaluate`` satisfies it, without inheriting.
    ``runtime_checkable`` additionally allows ``isinstance`` checks.
    """

    # Unique, human-readable rule identifier.
    name: str
    # Short explanation of what the rule verifies.
    description: str
    # Whether a failure is treated as an error or a warning.
    severity: RuleSeverity

    def evaluate(self, df: "pl.DataFrame") -> RuleResult:  # pragma: no cover - protocol
        """Run the rule against *df* and return its outcome."""
        ...
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from collections.abc import Iterable
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
|
|
4
|
+
import polars as pl
|
|
5
|
+
|
|
6
|
+
from .base import BaseRule, RuleResult, RuleSeverity, RuleStatus
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(slots=True)
class ColumnRule(BaseRule):
    """Shared base for rules that inspect a single dataframe column.

    Subclasses implement ``evaluate``; this base supplies the target
    column, a severity, an auto-generated description, and a derived
    ``name`` used as the rule identifier in results and logs.
    """

    # Name of the dataframe column the rule inspects.
    column: str
    # ERROR failures can abort a fail-fast checker run.
    severity: RuleSeverity = RuleSeverity.ERROR
    # Optional human-readable description; auto-filled below when omitted.
    description: str | None = None

    def __post_init__(self) -> None:
        # Derive a default like "NotNullRule on order_id" when the caller
        # did not supply a description.
        self.description = (
            self.description or f"{self.__class__.__name__} on {self.column}"
        )

    @property
    def name(self) -> str:
        # Stable identifier such as "UniqueRule::order_id".
        return f"{self.__class__.__name__}::{self.column}"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NotNullRule(ColumnRule):
    """Fails when the target column contains any null values."""

    def evaluate(self, df: pl.DataFrame) -> RuleResult:
        """Count nulls in the column and report pass/fail accordingly."""
        null_count = df.select(pl.col(self.column).is_null().sum().alias("nulls")).item()
        if null_count == 0:
            status = RuleStatus.PASSED
            message = "column has no nulls"
        else:
            status = RuleStatus.FAILED
            message = f"{null_count} null values found"
        return RuleResult(
            rule_name=self.name,
            status=status,
            message=message,
            severity=self.severity,
            metrics={"null_count": null_count},
        )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class UniqueRule(ColumnRule):
    """Fails when the target column contains duplicate values."""

    def evaluate(self, df: pl.DataFrame) -> RuleResult:
        """Compare total row count against distinct count to find dupes."""
        row_count = df.height
        distinct_count = df.select(pl.col(self.column).n_unique().alias("unique")).item()
        duplicate_count = row_count - distinct_count
        if duplicate_count == 0:
            status = RuleStatus.PASSED
            message = "column values are unique"
        else:
            status = RuleStatus.FAILED
            message = f"{duplicate_count} duplicate rows found"
        return RuleResult(
            rule_name=self.name,
            status=status,
            message=message,
            severity=self.severity,
            metrics={"duplicate_count": duplicate_count},
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class AcceptedValuesRule(ColumnRule):
    """Fails when the column holds values outside an allowed set."""

    def __init__(
        self,
        column: str,
        allowed_values: Iterable[str | int | float],
        severity: RuleSeverity = RuleSeverity.ERROR,
    ):
        super().__init__(column=column, severity=severity)
        # dict.fromkeys keeps first-seen order while removing duplicates.
        self.allowed_values = tuple(dict.fromkeys(allowed_values))

    def evaluate(self, df: pl.DataFrame) -> RuleResult:
        """Count rows whose column value lies outside the allowed set."""
        outside_allowed = ~pl.col(self.column).is_in(self.allowed_values)
        violation_count = df.filter(outside_allowed).select(pl.len()).item()
        if violation_count == 0:
            status = RuleStatus.PASSED
            message = "column values match allowed set"
        else:
            status = RuleStatus.FAILED
            message = f"{violation_count} disallowed values detected"
        return RuleResult(
            rule_name=self.name,
            status=status,
            message=message,
            severity=self.severity,
            metrics={
                "violation_count": violation_count,
                "allowed_values": self.allowed_values,
            },
        )
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class RelationshipRule(ColumnRule):
    """Referential-integrity check: every value of ``column`` must exist
    in ``reference_column`` of ``reference_df``.

    The reference lookup set is built once at construction time; it was
    previously rebuilt from the reference frame on every ``evaluate``
    call even though the reference data never changes.
    """

    def __init__(
        self,
        column: str,
        reference_df: pl.DataFrame,
        reference_column: str,
        severity: RuleSeverity = RuleSeverity.ERROR,
    ) -> None:
        super().__init__(column=column, severity=severity)
        self._reference_df = reference_df.select(reference_column)
        self._reference_column = reference_column
        # Hoisted loop-invariant: materialize the lookup set once.
        self._reference_set = set(self._reference_df[reference_column].to_list())

    def evaluate(self, df: pl.DataFrame) -> RuleResult:
        """Count values in *df* that are absent from the reference set.

        NOTE(review): nulls in the checked column appear to be skipped
        (``~is_in`` yields null, which ``filter`` drops) — confirm this
        is the intended treatment of nulls.
        """
        reference_set = self._reference_set
        violations = (
            df.filter(~pl.col(self.column).is_in(reference_set)).select(pl.len()).item()
        )
        status = RuleStatus.PASSED if violations == 0 else RuleStatus.FAILED
        message = (
            "referential integrity holds"
            if status is RuleStatus.PASSED
            else f"{violations} values missing from reference {self._reference_column}"
        )
        return RuleResult(
            rule_name=self.name,
            status=status,
            message=message,
            severity=self.severity,
            metrics={
                "violation_count": violations,
                "reference_column": self._reference_column,
                "reference_size": len(reference_set),
            },
        )
|
aqualisys/cli.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
from .config import ValidationSuiteConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@click.group()
def cli() -> None:
    """CLI entry point for running data quality suites."""
    # Subcommands are attached via @cli.command (e.g. `validate`);
    # the group body itself intentionally does nothing.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@cli.command("validate")
@click.argument(
    "config_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
def validate_command(config_path: Path) -> None:
    """Run the configured validation suite and emit a JSON summary."""

    suite = ValidationSuiteConfig.from_yaml(config_path)
    frame = suite.load_dataframe()
    report = suite.build_checker().run(frame, dataset_name=suite.dataset_name)

    failed_names = [result.rule_name for result in report.failed_rules]
    summary = {
        "run_id": report.run_id,
        "dataset": report.dataset_name,
        "passed": report.passed,
        "failed_rules": failed_names,
    }
    click.echo(json.dumps(summary, indent=2))

    # Non-zero exit status signals CI that at least one rule failed.
    if not report.passed:
        raise SystemExit(1)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def run() -> None:
    """Console-script entry point; invokes the click group with a fixed
    program name so help text reads `aqualisys` regardless of argv[0]."""
    cli(prog_name="aqualisys")


if __name__ == "__main__":  # pragma: no cover - script entry point
    run()
|
aqualisys/config.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from collections.abc import Callable, Mapping
from dataclasses import dataclass
from pathlib import Path
from typing import Any, ClassVar

import yaml

try:
    import polars as pl
except ModuleNotFoundError:
    # pragma: no cover - optional dependency
    pl = None  # type: ignore

from .checker import DataQualityChecker
from .checks.base import BaseRule, RuleSeverity
from .checks.rules import AcceptedValuesRule, NotNullRule, RelationshipRule, UniqueRule
from .logging.sqlite import SQLiteRunLogger
|
|
18
|
+
|
|
19
|
+
# Signature shared by all rule builders: raw config mapping -> constructed rule.
RuleFactory = Callable[[Mapping[str, Any]], BaseRule]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _build_not_null(config: Mapping[str, Any]) -> BaseRule:
    """Build a not-null rule from a config mapping.

    Honors an optional ``severity`` key ("error" | "warn"); previously the
    key was silently ignored and every rule defaulted to ERROR.
    """
    return NotNullRule(
        column=config["column"],
        severity=RuleSeverity(config.get("severity", RuleSeverity.ERROR)),
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_unique(config: Mapping[str, Any]) -> BaseRule:
    """Build a uniqueness rule from a config mapping.

    Honors an optional ``severity`` key ("error" | "warn"); previously the
    key was silently ignored and every rule defaulted to ERROR.
    """
    return UniqueRule(
        column=config["column"],
        severity=RuleSeverity(config.get("severity", RuleSeverity.ERROR)),
    )
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _build_accepted(config: Mapping[str, Any]) -> BaseRule:
    """Build an accepted-values rule from a config mapping.

    Requires ``column`` and ``allowed_values``; honors an optional
    ``severity`` key ("error" | "warn"), which was previously ignored.
    """
    return AcceptedValuesRule(
        column=config["column"],
        allowed_values=config["allowed_values"],
        severity=RuleSeverity(config.get("severity", RuleSeverity.ERROR)),
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _build_relationship(config: Mapping[str, Any]) -> BaseRule:
    """Build a referential-integrity rule from a config mapping.

    Eagerly loads the reference dataset (parquet or csv) and honors an
    optional ``severity`` key ("error" | "warn"), which was previously
    ignored. Raises ``RuntimeError`` without polars and ``ValueError``
    for an unsupported reference format.
    """
    if pl is None:
        # pragma: no cover - config is still valid without runtime Polars
        raise RuntimeError("polars is required for relationship rules")
    reference = config["reference"]
    ref_path = Path(reference["path"])
    ref_format = reference.get("format", "parquet")
    if ref_format == "parquet":
        reference_df = pl.read_parquet(ref_path)
    elif ref_format == "csv":
        reference_df = pl.read_csv(ref_path)
    else:  # pragma: no cover - validated elsewhere
        raise ValueError(f"unsupported reference format: {ref_format}")
    return RelationshipRule(
        column=config["column"],
        reference_df=reference_df,
        reference_column=reference["column"],
        severity=RuleSeverity(config.get("severity", RuleSeverity.ERROR)),
    )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Registry mapping a YAML rule `type` string to its builder callable;
# ValidationSuiteConfig.build_rules dispatches through this table.
RULE_BUILDERS: dict[str, RuleFactory] = {
    "not_null": _build_not_null,
    "unique": _build_unique,
    "accepted_values": _build_accepted,
    "relationship": _build_relationship,
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass(slots=True)
class ValidationSuiteConfig:
    """Declarative description of a validation suite loaded from YAML.

    Drives the CLI: where the dataset lives, which rules to run, and
    where the SQLite run history is persisted.
    """

    dataset_name: str
    dataset_path: Path
    dataset_format: str = "parquet"
    fail_fast: bool = False
    rules: list[Mapping[str, Any]] | None = None
    logger_path: Path = Path("aqualisys_runs.db")

    # Dataset formats load_dataframe() knows how to read.
    SUPPORTED_FORMATS: ClassVar[set[str]] = {"parquet", "csv"}

    @classmethod
    def from_yaml(cls, path: str | Path) -> "ValidationSuiteConfig":
        """Parse a YAML suite definition into a config object.

        Raises ``KeyError`` when required keys (``dataset.name``,
        ``dataset.path``) are missing.
        """
        # Explicit encoding: read_text() otherwise uses the platform
        # default, which breaks non-ASCII configs on e.g. Windows.
        data = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
        dataset = data["dataset"]
        return cls(
            dataset_name=dataset["name"],
            dataset_path=Path(dataset["path"]),
            dataset_format=dataset.get("format", "parquet"),
            fail_fast=data.get("fail_fast", False),
            rules=data.get("rules", []),
            logger_path=Path(data.get("logger", {}).get("path", "aqualisys_runs.db")),
        )

    def load_dataframe(self) -> "pl.DataFrame":
        """Read the configured dataset into a Polars dataframe.

        Raises ``RuntimeError`` without polars and ``ValueError`` for a
        format outside SUPPORTED_FORMATS.
        """
        if pl is None:  # pragma: no cover - guard for environments lacking polars
            raise RuntimeError("polars is required to load dataframes")
        if self.dataset_format not in self.SUPPORTED_FORMATS:
            raise ValueError(f"unsupported dataset format: {self.dataset_format}")
        if self.dataset_format == "parquet":
            return pl.read_parquet(self.dataset_path)
        return pl.read_csv(self.dataset_path)

    def build_rules(self) -> list[BaseRule]:
        """Instantiate every configured rule via the RULE_BUILDERS table.

        Raises ``ValueError`` for an unrecognized rule ``type``.
        """
        all_rules: list[BaseRule] = []
        for config in self.rules or []:
            rule_type = config["type"]
            builder = RULE_BUILDERS.get(rule_type)
            if builder is None:
                raise ValueError(f"unknown rule type: {rule_type}")
            all_rules.append(builder(config))
        return all_rules

    def build_checker(self) -> DataQualityChecker:
        """Assemble a checker wired to a SQLite run logger."""
        return DataQualityChecker(
            rules=self.build_rules(),
            logger=SQLiteRunLogger(self.logger_path),
            fail_fast=self.fail_fast,
        )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
|
|
6
|
+
from ..checks.base import RuleContext, RuleResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(slots=True)
|
|
10
|
+
class RunSummary:
|
|
11
|
+
run_id: str
|
|
12
|
+
dataset_name: str
|
|
13
|
+
started_at: datetime
|
|
14
|
+
finished_at: datetime = field(default_factory=lambda: datetime.now(tz=UTC))
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class RunLogger(ABC):
    """Interface for persisting run metadata + rule outcomes.

    Lifecycle per run (see ``DataQualityChecker.run``): one
    ``log_run_started`` call, then ``log_rule_result`` once per evaluated
    rule, then a final ``log_run_completed``.
    """

    @abstractmethod
    def log_run_started(self, context: RuleContext) -> None:
        """Persist metadata that a run has started."""
        ...  # pragma: no cover - interface

    @abstractmethod
    def log_rule_result(self, context: RuleContext, result: RuleResult) -> None:
        """Persist the outcome of a single rule execution."""
        ...  # pragma: no cover - interface

    @abstractmethod
    def log_run_completed(
        self,
        context: RuleContext,
        results: Iterable[RuleResult],
    ) -> None:
        """Persist that a run finished, including summary counts."""
        ...  # pragma: no cover
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import json
import sqlite3
from collections.abc import Iterable
from contextlib import closing
from datetime import UTC, datetime
from pathlib import Path

from ..checks.base import RuleContext, RuleResult
from .base import RunLogger
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SQLiteRunLogger(RunLogger):
    """Persists run + rule records to a lightweight SQLite database.

    Each call opens a short-lived connection wrapped in ``closing``:
    sqlite3's connection context manager only manages the transaction
    and never closes the handle, so the former
    ``with self._connect() as conn:`` pattern leaked one open
    connection per logging call.
    """

    def __init__(self, db_path: str | Path = "aqualisys_runs.db") -> None:
        self.db_path = Path(db_path)
        self._ensure_schema()

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection with WAL journaling enabled.

        Callers are responsible for closing it (use ``closing``).
        """
        conn = sqlite3.connect(self.db_path)
        conn.execute("PRAGMA journal_mode=WAL;")
        return conn

    def _ensure_schema(self) -> None:
        """Create the runs and rule_results tables when absent."""
        with closing(self._connect()) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS runs
                (
                    run_id TEXT PRIMARY KEY,
                    dataset_name TEXT NOT NULL,
                    started_at TEXT NOT NULL,
                    finished_at TEXT,
                    total_rules INTEGER DEFAULT 0,
                    failed_rules INTEGER DEFAULT 0
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS rule_results
                (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    run_id TEXT NOT NULL,
                    rule_name TEXT NOT NULL,
                    status TEXT NOT NULL,
                    severity TEXT NOT NULL,
                    message TEXT NOT NULL,
                    metrics TEXT,
                    recorded_at TEXT NOT NULL,
                    FOREIGN KEY (run_id) REFERENCES runs (run_id)
                )
            """)
            conn.commit()

    def log_run_started(self, context: RuleContext) -> None:
        """Insert (or replace) the run row with its start timestamp."""
        with closing(self._connect()) as conn:
            conn.execute(
                """
                INSERT OR REPLACE INTO runs(run_id, dataset_name, started_at)
                VALUES (?, ?, ?)
                """,
                (context.run_id, context.dataset_name, context.executed_at.isoformat()),
            )
            conn.commit()

    def log_rule_result(self, context: RuleContext, result: RuleResult) -> None:
        """Append one rule-outcome row; metrics are JSON-encoded."""
        with closing(self._connect()) as conn:
            conn.execute(
                """
                INSERT INTO rule_results(
                    run_id,
                    rule_name,
                    status,
                    severity,
                    message,
                    metrics,
                    recorded_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    context.run_id,
                    result.rule_name,
                    result.status.value,
                    result.severity.value,
                    result.message,
                    # default=str keeps non-JSON-native metric values loggable
                    json.dumps(result.metrics or {}, default=str),
                    datetime.now(tz=UTC).isoformat(),
                ),
            )
            conn.commit()

    def log_run_completed(
        self,
        context: RuleContext,
        results: Iterable[RuleResult],
    ) -> None:
        """Stamp the run row with finish time and pass/fail counts."""
        results_list = list(results)
        failed = sum(1 for result in results_list if not result.passed)
        with closing(self._connect()) as conn:
            conn.execute(
                """
                UPDATE runs
                SET finished_at = ?,
                    total_rules = ?,
                    failed_rules = ?
                WHERE run_id = ?
                """,
                (
                    datetime.now(tz=UTC).isoformat(),
                    len(results_list),
                    failed,
                    context.run_id,
                ),
            )
            conn.commit()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aqualisys
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Polars-first data-quality and data-validation toolkit.
|
|
5
|
+
Author-email: Aqualisys Maintainers <maintainers@aqualisys.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: click>=8.1.7
|
|
10
|
+
Requires-Dist: polars>=0.20.0
|
|
11
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: black>=24.3.0; extra == 'dev'
|
|
14
|
+
Requires-Dist: mypy>=1.8.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest>=7.4.4; extra == 'dev'
|
|
17
|
+
Requires-Dist: ruff>=0.2.1; extra == 'dev'
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Aqualisys
|
|
21
|
+
|
|
22
|
+
Polars-first data-quality toolkit delivering deterministic validation, structured logging, and a composable rule registry.
|
|
23
|
+
|
|
24
|
+
## Why Aqualisys?
|
|
25
|
+
- **Declarative rules**: ship reusable expectations such as not-null, uniqueness, accepted-values, and referential checks.
|
|
26
|
+
- **Deterministic logging**: every run is persisted to SQLite (JSON-friendly) for audits and debugging.
|
|
27
|
+
- **Pipeline-ready**: run from Python code or via `aqualisys validate configs/orders.yml` in CI.
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
```bash
|
|
31
|
+
python -m venv .venv && source .venv/bin/activate
|
|
32
|
+
pip install -e .[dev]
|
|
33
|
+
pytest
|
|
34
|
+
aqualisys validate configs/orders.yml
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage Example
|
|
38
|
+
```python
|
|
39
|
+
import polars as pl
|
|
40
|
+
from aqualisys import DataQualityChecker, NotNullRule, UniqueRule, SQLiteRunLogger
|
|
41
|
+
|
|
42
|
+
df = pl.DataFrame({"order_id": [1, 2, 3], "status": ["pending", "shipped", "shipped"]})
|
|
43
|
+
checker = DataQualityChecker(
|
|
44
|
+
rules=[NotNullRule("order_id"), UniqueRule("order_id")],
|
|
45
|
+
logger=SQLiteRunLogger("artifacts/example_runs.db"),
|
|
46
|
+
)
|
|
47
|
+
report = checker.run(df, dataset_name="orders")
|
|
48
|
+
assert report.passed
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Project Structure
|
|
52
|
+
- `src/aqualisys/`: library source (rules, checker, logging, CLI).
|
|
53
|
+
- `tests/`: pytest suites (unit + integration).
|
|
54
|
+
- `configs/`: sample validation suite definitions.
|
|
55
|
+
- `docs/`: roadmap and design notes.
|
|
56
|
+
|
|
57
|
+
See `docs/PUBLISHING.md` for uv-based build and release steps once you are ready to publish a new version.
|
|
58
|
+
|
|
59
|
+
See `docs/ROADMAP.md` for the multi-week implementation plan inspired by the Start Data Engineering guide.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
aqualisys/__init__.py,sha256=uoq8i4KvvZP8LE42T0bWcAyIGYiZBB5QhAWNQptcQmI,483
|
|
2
|
+
aqualisys/checker.py,sha256=2-t8c-kuTxeEimMzWoXgPc_sOw3Pz5MtQVs-XZ2ocEY,2774
|
|
3
|
+
aqualisys/cli.py,sha256=muaadBuLCIPLxuIaPM0Bo4bVh8gjhFaMATukyUnTm_4,1082
|
|
4
|
+
aqualisys/config.py,sha256=qL9VLXv7ROTZrzfi3tQBaBsrugwVr_54fkR2Jqrl81I,3893
|
|
5
|
+
aqualisys/checks/__init__.py,sha256=LgjTJqFLU0cjrSfbsz9WArlz1zFhUusYsIBDlfry-8g,164
|
|
6
|
+
aqualisys/checks/base.py,sha256=TKjpufvSWwR12JtW63ubWMOFcSf3wyS5MTKQlo76eg8,1204
|
|
7
|
+
aqualisys/checks/rules.py,sha256=sBQWtWNbzSqFiuElcZAjqQWHw3ax07B3nHf-mcJTJk8,4303
|
|
8
|
+
aqualisys/logging/__init__.py,sha256=4dQpKjbidZeB6yLBP1TvcpcNQ8q4r2vZOnaptpx9nis,67
|
|
9
|
+
aqualisys/logging/base.py,sha256=nSIzV1uu9MVuKBEHqI0wQwgiWBVl1QaVFUSFdGp2yUY,1111
|
|
10
|
+
aqualisys/logging/sqlite.py,sha256=Zrcp-myNrpPqJeb1330fwGqHcP5uMbucbWQvWoPHNaA,3782
|
|
11
|
+
aqualisys-0.1.0.dist-info/METADATA,sha256=B18N61ONqgz3anSXYxIW30ZBGPa55HnEueIKEmXXLEc,2121
|
|
12
|
+
aqualisys-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
13
|
+
aqualisys-0.1.0.dist-info/entry_points.txt,sha256=FN2StKUy9iH4Q1-dLZk_tF4uWeFqjJ8-LRLhw8jCEmQ,48
|
|
14
|
+
aqualisys-0.1.0.dist-info/licenses/LICENSE,sha256=nPSYvbzst5Xo16pBTGo5Ju5BwC32lsEmKh8IPT4hxxA,1067
|
|
15
|
+
aqualisys-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Absolentia
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|