qualis 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qualis/__init__.py +22 -0
- qualis/adapters/__init__.py +0 -0
- qualis/adapters/console.py +144 -0
- qualis/adapters/duckdb/__init__.py +0 -0
- qualis/adapters/duckdb/adapter.py +208 -0
- qualis/adapters/duckdb/sql_templates.py +77 -0
- qualis/adapters/in_memory/__init__.py +0 -0
- qualis/adapters/in_memory/adapter.py +146 -0
- qualis/adapters/in_memory/reference_data.py +25 -0
- qualis/adapters/postgres/__init__.py +0 -0
- qualis/adapters/postgres/adapter.py +256 -0
- qualis/adapters/postgres/sql_templates.py +84 -0
- qualis/bootstrap.py +51 -0
- qualis/cli/__init__.py +1 -0
- qualis/cli/main.py +619 -0
- qualis/cli/review_cmd.py +58 -0
- qualis/config/__init__.py +0 -0
- qualis/config/context_loader.py +69 -0
- qualis/config/loader.py +193 -0
- qualis/config/settings.py +20 -0
- qualis/config/standards.py +39 -0
- qualis/discover/__init__.py +0 -0
- qualis/discover/evidence_builder.py +32 -0
- qualis/discover/profiler.py +152 -0
- qualis/discover/suggester.py +252 -0
- qualis/discover/writer.py +83 -0
- qualis/domain/__init__.py +0 -0
- qualis/domain/checks.py +10 -0
- qualis/domain/context.py +72 -0
- qualis/domain/enums.py +65 -0
- qualis/domain/evidence.py +50 -0
- qualis/domain/models.py +72 -0
- qualis/domain/params.py +78 -0
- qualis/domain/rule_engine.py +312 -0
- qualis/domain/scoring.py +88 -0
- qualis/domain/standards.py +47 -0
- qualis/engine/__init__.py +0 -0
- qualis/engine/checker.py +85 -0
- qualis/engine/diff.py +96 -0
- qualis/github/__init__.py +7 -0
- qualis/github/__main__.py +32 -0
- qualis/github/comment.py +76 -0
- qualis/ports/__init__.py +0 -0
- qualis/ports/database.py +72 -0
- qualis/ports/notifier.py +10 -0
- qualis/ports/reference_data.py +22 -0
- qualis/py.typed +0 -0
- qualis/report/__init__.py +0 -0
- qualis/report/loader.py +55 -0
- qualis/report/scorecard.py +166 -0
- qualis/report/template.html.j2 +327 -0
- qualis/review/__init__.py +0 -0
- qualis/review/state_machine.py +81 -0
- qualis-0.3.1.dist-info/METADATA +38 -0
- qualis-0.3.1.dist-info/RECORD +58 -0
- qualis-0.3.1.dist-info/WHEEL +4 -0
- qualis-0.3.1.dist-info/entry_points.txt +2 -0
- qualis-0.3.1.dist-info/licenses/LICENSE +176 -0
qualis/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Qualis — Data quality framework that tells you WHAT failed."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.3.1"
|
|
6
|
+
|
|
7
|
+
from qualis.domain.enums import DQDimension, RunStatus, Severity
|
|
8
|
+
from qualis.domain.models import DatasetScore, Rule, Violation
|
|
9
|
+
from qualis.ports.database import DatabasePort
|
|
10
|
+
from qualis.ports.notifier import NotifierPort
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
"DQDimension",
|
|
14
|
+
"DatabasePort",
|
|
15
|
+
"DatasetScore",
|
|
16
|
+
"NotifierPort",
|
|
17
|
+
"Rule",
|
|
18
|
+
"RunStatus",
|
|
19
|
+
"Severity",
|
|
20
|
+
"Violation",
|
|
21
|
+
"__version__",
|
|
22
|
+
]
|
|
File without changes
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from qualis.domain.models import DatasetScore
|
|
11
|
+
from qualis.engine.diff import ScoreDiff
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def print_score(score: DatasetScore, console: Console | None = None) -> None:
    """Render *score* as a terminal scorecard using ``rich``.

    Parameters
    ----------
    score:
        The ``DatasetScore`` whose aggregate score, per-dimension scores and
        violation counts are displayed.
    console:
        ``rich.console.Console`` to print to. When omitted, a default
        ``Console`` is created.
    """
    out = console if console else Console()

    # Overall banner: the aggregate is truncated to a whole percentage and
    # bucketed into green / yellow / red.
    pct = int(score.aggregate_score * 100)
    overall_bands = ((90, ("green", "PASSING")), (70, ("yellow", "WARNING")))
    color, status = next(
        (look for floor, look in overall_bands if pct >= floor),
        ("red", "FAILING"),
    )

    banner = "\n".join(
        (
            "[bold magenta]QUALIS[/] · Data Quality Report",
            "",
            f" Score: [bold {color}]{pct} / 100[/]",
            f" Status: [{color}]● {status}[/]",
        )
    )
    out.print(Panel(banner, border_style="bright_black", padding=(1, 4)))

    # Per-dimension breakdown.
    table = Table(show_header=True, header_style="bold")
    column_specs = (
        ("Dimension", {"style": "cyan", "min_width": 16}),
        ("Score", {"justify": "right", "min_width": 8}),
        ("Checks", {"justify": "right", "min_width": 10}),
        ("", {"min_width": 3}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    dimension_bands = ((0.9, "[green]✓[/]"), (0.7, "[yellow]⚠[/]"))
    for ds in score.dimension_scores:
        pct_dim = int(ds.score * 100)
        indicator = next(
            (mark for floor, mark in dimension_bands if ds.score >= floor),
            "[red]✗[/]",
        )
        table.add_row(
            ds.dimension.value.capitalize(),
            f"{pct_dim}%",
            f"{ds.passed}/{ds.total_checks}",
            indicator,
        )

    out.print(table)

    # The violation summary is only shown when there is something to report.
    if score.total_violations > 0:
        summary = (
            f"\n[bold]{score.total_violations} violation(s)[/] "
            f"({score.critical_violations} critical)"
        )
        out.print(summary)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def print_diff(diff: ScoreDiff, console: Console | None = None) -> None:
    """Render the change between two score snapshots using ``rich``.

    Parameters
    ----------
    diff:
        The ``ScoreDiff`` holding before/after aggregates, per-dimension
        deltas and before/after violation counts.
    console:
        ``rich.console.Console`` to print to. When omitted, a default
        ``Console`` is created.
    """
    out = console if console else Console()

    # Aggregates are truncated to whole percentages before being compared.
    before_pct = int(diff.before_aggregate * 100)
    after_pct = int(diff.after_aggregate * 100)
    change = after_pct - before_pct

    if change == 0:
        delta_str = "[dim]—[/]"
    elif change > 0:
        delta_str = f"[green]↑ +{change}[/]"
    else:
        delta_str = f"[red]↓ {change}[/]"

    banner = "\n".join(
        (
            "[bold magenta]QUALIS[/] · Score Diff",
            "",
            f" Before: [bold]{before_pct} / 100[/]",
            f" After: [bold]{after_pct} / 100[/]",
            f" Delta: {delta_str}",
        )
    )
    out.print(Panel(banner, border_style="bright_black", padding=(1, 4)))

    table = Table(show_header=True, header_style="bold")
    column_specs = (
        ("Dimension", {"style": "cyan", "min_width": 16}),
        ("Before", {"justify": "right", "min_width": 8}),
        ("After", {"justify": "right", "min_width": 8}),
        ("Delta", {"justify": "right", "min_width": 8}),
        ("", {"min_width": 3}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    for dd in diff.dimension_deltas:
        # A missing side of the comparison is shown as a dash.
        before_val = "—" if dd.before_score is None else f"{int(dd.before_score * 100)}%"
        after_val = "—" if dd.after_score is None else f"{int(dd.after_score * 100)}%"

        dim_delta = int(dd.delta * 100)
        if dim_delta == 0:
            delta_cell = "[dim]—[/]"
        elif dim_delta > 0:
            delta_cell = f"[green]↑ +{dim_delta}[/]"
        else:
            delta_cell = f"[red]↓ {dim_delta}[/]"
        # Only a regression gets the red cross; unchanged counts as OK.
        indicator = "[red]✗[/]" if dim_delta < 0 else "[green]✓[/]"

        table.add_row(
            dd.dimension.value.capitalize(),
            before_val,
            after_val,
            delta_cell,
            indicator,
        )

    out.print(table)

    viol_delta = diff.after_violations - diff.before_violations
    if viol_delta == 0:
        return

    # More violations is bad (red, explicit "+"); fewer is good (green).
    increased = viol_delta > 0
    sign = "+" if increased else ""
    color = "red" if increased else "green"
    out.print(
        f"\nViolations: [bold]{diff.before_violations}[/] → "
        f"[bold]{diff.after_violations}[/] "
        f"([{color}]{sign}{viol_delta}[/])"
    )
|
|
File without changes
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any
|
|
4
|
+
|
|
5
|
+
import duckdb
|
|
6
|
+
|
|
7
|
+
from qualis.adapters.duckdb.sql_templates import (
|
|
8
|
+
BETWEEN_SQL,
|
|
9
|
+
IN_SET_SQL,
|
|
10
|
+
NOT_NEGATIVE_SQL,
|
|
11
|
+
NOT_NULL_SQL,
|
|
12
|
+
REFERENCE_LOOKUP_SQL,
|
|
13
|
+
REGEX_SQL,
|
|
14
|
+
ROW_COUNT_SQL,
|
|
15
|
+
TABLE_EXISTS_SQL,
|
|
16
|
+
UNIQUE_SQL,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from collections.abc import Iterator
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _qualified(schema: str, table: str) -> str:
|
|
24
|
+
"""Return a qualified table reference for use inside SQL strings."""
|
|
25
|
+
if schema:
|
|
26
|
+
return f'"{schema}"."{table}"'
|
|
27
|
+
return f'"{table}"'
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class DuckDBAdapter:
|
|
31
|
+
"""DuckDB-backed implementation of ``DatabasePort``.
|
|
32
|
+
|
|
33
|
+
Parameters
|
|
34
|
+
----------
|
|
35
|
+
database:
|
|
36
|
+
Path to a persistent DuckDB file, or ``":memory:"`` (default) for an
|
|
37
|
+
in-process, session-only database.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
def __init__(self, database: str = ":memory:") -> None:
|
|
41
|
+
self._con = duckdb.connect(database)
|
|
42
|
+
|
|
43
|
+
# ------------------------------------------------------------------
|
|
44
|
+
# Registration helpers
|
|
45
|
+
# ------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
def register_csv(self, table_name: str, path: str) -> None:
|
|
48
|
+
"""Create a table from a CSV file using DuckDB's auto-detection."""
|
|
49
|
+
self._con.execute(
|
|
50
|
+
f'CREATE TABLE "{table_name}" AS SELECT * FROM read_csv_auto(\'{path}\')'
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
def register_parquet(self, table_name: str, path: str) -> None:
|
|
54
|
+
"""Create a table from a Parquet file."""
|
|
55
|
+
self._con.execute(
|
|
56
|
+
f'CREATE TABLE "{table_name}" AS SELECT * FROM read_parquet(\'{path}\')'
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# ------------------------------------------------------------------
|
|
60
|
+
# DatabasePort implementation
|
|
61
|
+
# ------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
def query(self, sql: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
|
|
64
|
+
"""Execute *sql* and return a list of dicts keyed by column name."""
|
|
65
|
+
rel = self._con.execute(sql)
|
|
66
|
+
columns = [desc[0] for desc in rel.description or []]
|
|
67
|
+
return [dict(zip(columns, row, strict=True)) for row in rel.fetchall()]
|
|
68
|
+
|
|
69
|
+
def execute(self, sql: str, params: dict[str, Any] | None = None) -> int:
|
|
70
|
+
"""Execute a DML statement; return the number of rows affected."""
|
|
71
|
+
self._con.execute(sql)
|
|
72
|
+
# DuckDB does not expose rowcount directly; return 0 as a safe default
|
|
73
|
+
return 0
|
|
74
|
+
|
|
75
|
+
def stream(
|
|
76
|
+
self,
|
|
77
|
+
sql: str,
|
|
78
|
+
params: dict[str, Any] | None = None,
|
|
79
|
+
chunk_size: int = 10_000,
|
|
80
|
+
) -> Iterator[list[dict[str, Any]]]:
|
|
81
|
+
"""Stream query results in chunks of *chunk_size* rows."""
|
|
82
|
+
rel = self._con.execute(sql)
|
|
83
|
+
columns = [desc[0] for desc in rel.description or []]
|
|
84
|
+
while True:
|
|
85
|
+
rows = rel.fetchmany(chunk_size)
|
|
86
|
+
if not rows:
|
|
87
|
+
break
|
|
88
|
+
yield [dict(zip(columns, row, strict=True)) for row in rows]
|
|
89
|
+
|
|
90
|
+
def table_exists(self, schema: str, table: str) -> bool:
|
|
91
|
+
"""Return True when the named table is registered in the database."""
|
|
92
|
+
result = self._con.execute(
|
|
93
|
+
TABLE_EXISTS_SQL.format(table=table)
|
|
94
|
+
).fetchone()
|
|
95
|
+
return bool(result and result[0] > 0)
|
|
96
|
+
|
|
97
|
+
# ------------------------------------------------------------------
|
|
98
|
+
# Check methods
|
|
99
|
+
# ------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
def check_not_null(self, schema: str, table: str, column: str) -> dict[str, int]:
|
|
102
|
+
table_ref = _qualified(schema, table)
|
|
103
|
+
sql = NOT_NULL_SQL.format(column=column, table=table_ref)
|
|
104
|
+
row = self._con.execute(sql).fetchone()
|
|
105
|
+
null_count, total_count = (row[0], row[1]) if row else (0, 0)
|
|
106
|
+
return {"null_count": int(null_count), "total_count": int(total_count)}
|
|
107
|
+
|
|
108
|
+
def check_unique(self, schema: str, table: str, column: str) -> dict[str, int]:
|
|
109
|
+
table_ref = _qualified(schema, table)
|
|
110
|
+
sql = UNIQUE_SQL.format(column=column, table=table_ref)
|
|
111
|
+
row = self._con.execute(sql).fetchone()
|
|
112
|
+
if row is None:
|
|
113
|
+
# No duplicate groups at all — zero duplicates
|
|
114
|
+
total_row = self._con.execute(
|
|
115
|
+
f"SELECT COUNT(*) FROM {table_ref}"
|
|
116
|
+
).fetchone()
|
|
117
|
+
total = int(total_row[0]) if total_row else 0
|
|
118
|
+
return {"duplicate_count": 0, "total_count": total}
|
|
119
|
+
dup_count = int(row[0])
|
|
120
|
+
# Re-query total so it is accurate regardless of UNIQUE_SQL shape
|
|
121
|
+
total_row = self._con.execute(
|
|
122
|
+
f"SELECT COUNT(*) FROM {table_ref}"
|
|
123
|
+
).fetchone()
|
|
124
|
+
total = int(total_row[0]) if total_row else 0
|
|
125
|
+
return {"duplicate_count": dup_count, "total_count": total}
|
|
126
|
+
|
|
127
|
+
def check_between(
|
|
128
|
+
self,
|
|
129
|
+
schema: str,
|
|
130
|
+
table: str,
|
|
131
|
+
column: str,
|
|
132
|
+
min_val: str,
|
|
133
|
+
max_val: str,
|
|
134
|
+
) -> dict[str, int]:
|
|
135
|
+
table_ref = _qualified(schema, table)
|
|
136
|
+
sql = BETWEEN_SQL.format(
|
|
137
|
+
column=column, table=table_ref, min_val=min_val, max_val=max_val
|
|
138
|
+
)
|
|
139
|
+
row = self._con.execute(sql).fetchone()
|
|
140
|
+
out_of_range, total = (row[0], row[1]) if row else (0, 0)
|
|
141
|
+
return {"out_of_range_count": int(out_of_range), "total_count": int(total)}
|
|
142
|
+
|
|
143
|
+
def check_regex(
|
|
144
|
+
self,
|
|
145
|
+
schema: str,
|
|
146
|
+
table: str,
|
|
147
|
+
column: str,
|
|
148
|
+
pattern: str,
|
|
149
|
+
) -> dict[str, int]:
|
|
150
|
+
table_ref = _qualified(schema, table)
|
|
151
|
+
sql = REGEX_SQL.format(column=column, table=table_ref, pattern=pattern)
|
|
152
|
+
row = self._con.execute(sql).fetchone()
|
|
153
|
+
non_matching, total = (row[0], row[1]) if row else (0, 0)
|
|
154
|
+
return {"non_matching_count": int(non_matching), "total_count": int(total)}
|
|
155
|
+
|
|
156
|
+
def check_in_set(
|
|
157
|
+
self,
|
|
158
|
+
schema: str,
|
|
159
|
+
table: str,
|
|
160
|
+
column: str,
|
|
161
|
+
values: list[str],
|
|
162
|
+
) -> dict[str, int]:
|
|
163
|
+
table_ref = _qualified(schema, table)
|
|
164
|
+
escaped = [v.replace("'", "''") for v in values]
|
|
165
|
+
value_list = ", ".join(f"'{v}'" for v in escaped)
|
|
166
|
+
sql = IN_SET_SQL.format(column=column, table=table_ref, value_list=value_list)
|
|
167
|
+
row = self._con.execute(sql).fetchone()
|
|
168
|
+
invalid, total = (row[0], row[1]) if row else (0, 0)
|
|
169
|
+
return {"invalid_count": int(invalid), "total_count": int(total)}
|
|
170
|
+
|
|
171
|
+
def check_row_count(self, schema: str, table: str) -> dict[str, int]:
|
|
172
|
+
table_ref = _qualified(schema, table)
|
|
173
|
+
sql = ROW_COUNT_SQL.format(table=table_ref)
|
|
174
|
+
row = self._con.execute(sql).fetchone()
|
|
175
|
+
count = int(row[0]) if row else 0
|
|
176
|
+
return {"row_count": count}
|
|
177
|
+
|
|
178
|
+
def check_not_negative(self, schema: str, table: str, column: str) -> dict[str, int]:
|
|
179
|
+
table_ref = _qualified(schema, table)
|
|
180
|
+
sql = NOT_NEGATIVE_SQL.format(column=column, table=table_ref)
|
|
181
|
+
row = self._con.execute(sql).fetchone()
|
|
182
|
+
negative, total = (row[0], row[1]) if row else (0, 0)
|
|
183
|
+
return {"negative_count": int(negative), "total_count": int(total)}
|
|
184
|
+
|
|
185
|
+
def check_reference_lookup(
|
|
186
|
+
self,
|
|
187
|
+
schema: str,
|
|
188
|
+
table: str,
|
|
189
|
+
column: str,
|
|
190
|
+
valid_values: list[str],
|
|
191
|
+
) -> dict[str, int]:
|
|
192
|
+
table_ref = _qualified(schema, table)
|
|
193
|
+
escaped = [v.replace("'", "''") for v in valid_values]
|
|
194
|
+
value_list = ", ".join(f"'{v}'" for v in escaped) or "NULL"
|
|
195
|
+
sql = REFERENCE_LOOKUP_SQL.format(
|
|
196
|
+
column=column, table=table_ref, value_list=value_list,
|
|
197
|
+
)
|
|
198
|
+
row = self._con.execute(sql).fetchone()
|
|
199
|
+
invalid, total = (row[0], row[1]) if row else (0, 0)
|
|
200
|
+
return {"invalid_count": int(invalid), "total_count": int(total)}
|
|
201
|
+
|
|
202
|
+
# ------------------------------------------------------------------
|
|
203
|
+
# Lifecycle
|
|
204
|
+
# ------------------------------------------------------------------
|
|
205
|
+
|
|
206
|
+
def close(self) -> None:
|
|
207
|
+
"""Close the underlying DuckDB connection."""
|
|
208
|
+
self._con.close()
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
# ---------------------------------------------------------------------------
|
|
4
|
+
# DuckDB SQL templates
|
|
5
|
+
# ---------------------------------------------------------------------------
|
|
6
|
+
# All column references use double-quoted identifiers to handle names that
|
|
7
|
+
# are reserved keywords or contain special characters.
|
|
8
|
+
|
|
9
|
+
# Each template is filled with ``str.format``. ``{table}`` receives an
# already-quoted table reference; ``{column}`` receives a bare column name
# that the template wraps in double quotes.

# Counts NULLs in the column alongside the table's total row count.
NOT_NULL_SQL = (
    'SELECT COUNT(*) FILTER (WHERE "{column}" IS NULL) AS null_count, '
    "COUNT(*) AS total_count "
    "FROM {table}"
)

# ``duplicate_count`` is the number of distinct column values that occur more
# than once (one per GROUP BY group), not the number of surplus rows.
# ``total_count`` is the table's row count.
UNIQUE_SQL = (
    "SELECT COUNT(*) AS duplicate_count, "
    "(SELECT COUNT(*) FROM {table}) AS total_count "
    "FROM ("
    ' SELECT "{column}" FROM {table} '
    ' GROUP BY "{column}" HAVING COUNT(*) > 1'
    ") dup"
)

# NOTE(review): both sides are compared as VARCHAR, so the range test is
# lexicographic (e.g. '9' sorts after '10') — confirm this is intended for
# numeric columns.
BETWEEN_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE CAST("{column}" AS VARCHAR) < \'{min_val}\' '
    ' OR CAST("{column}" AS VARCHAR) > \'{max_val}\''
    " ) AS out_of_range_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# NULLs count as non-matching.
REGEX_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NULL '
    ' OR NOT regexp_matches(CAST("{column}" AS VARCHAR), \'{pattern}\')'
    " ) AS non_matching_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# NULLs count as invalid. ``{value_list}`` is a comma-separated list of
# quoted SQL literals.
IN_SET_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NULL '
    ' OR CAST("{column}" AS VARCHAR) NOT IN ({value_list})'
    " ) AS invalid_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

ROW_COUNT_SQL = "SELECT COUNT(*) AS row_count FROM {table}"

# NULLs are not counted as negative.
NOT_NEGATIVE_SQL = (
    "SELECT "
    ' COUNT(*) FILTER (WHERE "{column}" IS NOT NULL AND "{column}" < 0) '
    " AS negative_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# Unlike IN_SET_SQL, NULLs are skipped rather than counted as invalid.
REFERENCE_LOOKUP_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NOT NULL '
    ' AND CAST("{column}" AS VARCHAR) NOT IN ({value_list})'
    " ) AS invalid_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# ``{table}`` is placed inside a single-quoted literal here, not an identifier.
TABLE_EXISTS_SQL = (
    "SELECT COUNT(*) AS cnt FROM information_schema.tables "
    "WHERE table_name = '{table}'"
)
|
|
File without changes
|
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from collections.abc import Iterator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class InMemoryAdapter:
|
|
11
|
+
def __init__(self) -> None:
|
|
12
|
+
self._tables: dict[str, list[dict[str, Any]]] = {}
|
|
13
|
+
|
|
14
|
+
def add_table(self, schema: str, table: str, rows: list[dict[str, Any]]) -> None:
|
|
15
|
+
self._tables[f"{schema}.{table}"] = list(rows)
|
|
16
|
+
|
|
17
|
+
def _get_rows(self, schema: str, table: str) -> list[dict[str, Any]]:
|
|
18
|
+
key = f"{schema}.{table}"
|
|
19
|
+
if key not in self._tables:
|
|
20
|
+
raise ValueError(f"Table {key} not found")
|
|
21
|
+
return self._tables[key]
|
|
22
|
+
|
|
23
|
+
def query(self, sql: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
|
|
24
|
+
table_key = self._extract_table_from_sql(sql)
|
|
25
|
+
if table_key and table_key in self._tables:
|
|
26
|
+
return list(self._tables[table_key])
|
|
27
|
+
return []
|
|
28
|
+
|
|
29
|
+
def execute(self, sql: str, params: dict[str, Any] | None = None) -> int:
|
|
30
|
+
return 0
|
|
31
|
+
|
|
32
|
+
def stream(
|
|
33
|
+
self,
|
|
34
|
+
sql: str,
|
|
35
|
+
params: dict[str, Any] | None = None,
|
|
36
|
+
chunk_size: int = 10_000,
|
|
37
|
+
) -> Iterator[list[dict[str, Any]]]:
|
|
38
|
+
table_key = self._extract_table_from_sql(sql)
|
|
39
|
+
if table_key and table_key in self._tables:
|
|
40
|
+
rows = self._tables[table_key]
|
|
41
|
+
for i in range(0, len(rows), chunk_size):
|
|
42
|
+
yield rows[i : i + chunk_size]
|
|
43
|
+
|
|
44
|
+
def table_exists(self, schema: str, table: str) -> bool:
|
|
45
|
+
return f"{schema}.{table}" in self._tables
|
|
46
|
+
|
|
47
|
+
def check_not_null(self, schema: str, table: str, column: str) -> dict[str, int]:
|
|
48
|
+
rows = self._get_rows(schema, table)
|
|
49
|
+
null_count = sum(1 for r in rows if r.get(column) is None)
|
|
50
|
+
return {"null_count": null_count, "total_count": len(rows)}
|
|
51
|
+
|
|
52
|
+
def check_unique(self, schema: str, table: str, column: str) -> dict[str, int]:
|
|
53
|
+
rows = self._get_rows(schema, table)
|
|
54
|
+
values = [r.get(column) for r in rows if r.get(column) is not None]
|
|
55
|
+
duplicate_count = len(values) - len(set(values))
|
|
56
|
+
return {"duplicate_count": duplicate_count, "total_count": len(rows)}
|
|
57
|
+
|
|
58
|
+
def check_between(
|
|
59
|
+
self,
|
|
60
|
+
schema: str,
|
|
61
|
+
table: str,
|
|
62
|
+
column: str,
|
|
63
|
+
min_val: str,
|
|
64
|
+
max_val: str,
|
|
65
|
+
) -> dict[str, int]:
|
|
66
|
+
rows = self._get_rows(schema, table)
|
|
67
|
+
out_count = 0
|
|
68
|
+
checked = 0
|
|
69
|
+
for r in rows:
|
|
70
|
+
val = r.get(column)
|
|
71
|
+
if val is None:
|
|
72
|
+
continue
|
|
73
|
+
checked += 1
|
|
74
|
+
if str(val) < min_val or str(val) > max_val:
|
|
75
|
+
out_count += 1
|
|
76
|
+
return {"out_of_range_count": out_count, "total_count": len(rows), "checked": checked}
|
|
77
|
+
|
|
78
|
+
def check_regex(
|
|
79
|
+
self,
|
|
80
|
+
schema: str,
|
|
81
|
+
table: str,
|
|
82
|
+
column: str,
|
|
83
|
+
pattern: str,
|
|
84
|
+
) -> dict[str, int]:
|
|
85
|
+
rows = self._get_rows(schema, table)
|
|
86
|
+
compiled = re.compile(pattern)
|
|
87
|
+
non_matching = sum(
|
|
88
|
+
1
|
|
89
|
+
for r in rows
|
|
90
|
+
if r.get(column) is None or not compiled.match(str(r.get(column)))
|
|
91
|
+
)
|
|
92
|
+
return {"non_matching_count": non_matching, "total_count": len(rows)}
|
|
93
|
+
|
|
94
|
+
def check_in_set(
|
|
95
|
+
self,
|
|
96
|
+
schema: str,
|
|
97
|
+
table: str,
|
|
98
|
+
column: str,
|
|
99
|
+
values: list[str],
|
|
100
|
+
) -> dict[str, int]:
|
|
101
|
+
rows = self._get_rows(schema, table)
|
|
102
|
+
allowed = set(values)
|
|
103
|
+
invalid_count = sum(
|
|
104
|
+
1
|
|
105
|
+
for r in rows
|
|
106
|
+
if r.get(column) is None or str(r.get(column)) not in allowed
|
|
107
|
+
)
|
|
108
|
+
return {"invalid_count": invalid_count, "total_count": len(rows)}
|
|
109
|
+
|
|
110
|
+
def check_row_count(self, schema: str, table: str) -> dict[str, int]:
|
|
111
|
+
rows = self._get_rows(schema, table)
|
|
112
|
+
return {"row_count": len(rows)}
|
|
113
|
+
|
|
114
|
+
def check_not_negative(
|
|
115
|
+
self,
|
|
116
|
+
schema: str,
|
|
117
|
+
table: str,
|
|
118
|
+
column: str,
|
|
119
|
+
) -> dict[str, int]:
|
|
120
|
+
rows = self._get_rows(schema, table)
|
|
121
|
+
negative_count = sum(
|
|
122
|
+
1
|
|
123
|
+
for r in rows
|
|
124
|
+
if r.get(column) is not None and (r[column]) < 0
|
|
125
|
+
)
|
|
126
|
+
return {"negative_count": negative_count, "total_count": len(rows)}
|
|
127
|
+
|
|
128
|
+
def check_reference_lookup(
|
|
129
|
+
self,
|
|
130
|
+
schema: str,
|
|
131
|
+
table: str,
|
|
132
|
+
column: str,
|
|
133
|
+
valid_values: list[str],
|
|
134
|
+
) -> dict[str, int]:
|
|
135
|
+
rows = self._get_rows(schema, table)
|
|
136
|
+
valid_set = set(valid_values)
|
|
137
|
+
invalid_count = sum(
|
|
138
|
+
1
|
|
139
|
+
for r in rows
|
|
140
|
+
if r.get(column) is not None and r.get(column) not in valid_set
|
|
141
|
+
)
|
|
142
|
+
return {"invalid_count": invalid_count, "total_count": len(rows)}
|
|
143
|
+
|
|
144
|
+
def _extract_table_from_sql(self, sql: str) -> str | None:
|
|
145
|
+
match = re.search(r"FROM\s+(\S+)", sql, re.IGNORECASE)
|
|
146
|
+
return match.group(1) if match else None
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""In-memory ReferenceDataPort -- dict-backed, for testing and small datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class InMemoryReferenceData:
    """Dict-backed ``ReferenceDataPort``.

    Values are stored per ``(reference, key_column)`` pair: load them with
    ``register(reference, key_column, values)`` and read them back with
    ``load_values(reference, key_column)``.
    """

    def __init__(self) -> None:
        self._data: dict[tuple[str, str], list[str]] = {}

    def register(self, reference: str, key_column: str, values: list[str]) -> None:
        """Store a copy of *values* for the given reference and key column."""
        snapshot = list(values)
        self._data[(reference, key_column)] = snapshot

    def load_values(self, reference: str, key_column: str) -> list[str]:
        """Return a copy of the registered values.

        Raises
        ------
        KeyError
            When nothing was registered for ``(reference, key_column)``.
        """
        stored = self._data.get((reference, key_column))
        # ``register`` always stores a list, so ``None`` means "not registered".
        if stored is None:
            raise KeyError(
                f"Reference {reference!r} with key column {key_column!r} not registered"
            )
        return list(stored)
|
|
File without changes
|