qualis 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. qualis/__init__.py +22 -0
  2. qualis/adapters/__init__.py +0 -0
  3. qualis/adapters/console.py +144 -0
  4. qualis/adapters/duckdb/__init__.py +0 -0
  5. qualis/adapters/duckdb/adapter.py +208 -0
  6. qualis/adapters/duckdb/sql_templates.py +77 -0
  7. qualis/adapters/in_memory/__init__.py +0 -0
  8. qualis/adapters/in_memory/adapter.py +146 -0
  9. qualis/adapters/in_memory/reference_data.py +25 -0
  10. qualis/adapters/postgres/__init__.py +0 -0
  11. qualis/adapters/postgres/adapter.py +256 -0
  12. qualis/adapters/postgres/sql_templates.py +84 -0
  13. qualis/bootstrap.py +51 -0
  14. qualis/cli/__init__.py +1 -0
  15. qualis/cli/main.py +619 -0
  16. qualis/cli/review_cmd.py +58 -0
  17. qualis/config/__init__.py +0 -0
  18. qualis/config/context_loader.py +69 -0
  19. qualis/config/loader.py +193 -0
  20. qualis/config/settings.py +20 -0
  21. qualis/config/standards.py +39 -0
  22. qualis/discover/__init__.py +0 -0
  23. qualis/discover/evidence_builder.py +32 -0
  24. qualis/discover/profiler.py +152 -0
  25. qualis/discover/suggester.py +252 -0
  26. qualis/discover/writer.py +83 -0
  27. qualis/domain/__init__.py +0 -0
  28. qualis/domain/checks.py +10 -0
  29. qualis/domain/context.py +72 -0
  30. qualis/domain/enums.py +65 -0
  31. qualis/domain/evidence.py +50 -0
  32. qualis/domain/models.py +72 -0
  33. qualis/domain/params.py +78 -0
  34. qualis/domain/rule_engine.py +312 -0
  35. qualis/domain/scoring.py +88 -0
  36. qualis/domain/standards.py +47 -0
  37. qualis/engine/__init__.py +0 -0
  38. qualis/engine/checker.py +85 -0
  39. qualis/engine/diff.py +96 -0
  40. qualis/github/__init__.py +7 -0
  41. qualis/github/__main__.py +32 -0
  42. qualis/github/comment.py +76 -0
  43. qualis/ports/__init__.py +0 -0
  44. qualis/ports/database.py +72 -0
  45. qualis/ports/notifier.py +10 -0
  46. qualis/ports/reference_data.py +22 -0
  47. qualis/py.typed +0 -0
  48. qualis/report/__init__.py +0 -0
  49. qualis/report/loader.py +55 -0
  50. qualis/report/scorecard.py +166 -0
  51. qualis/report/template.html.j2 +327 -0
  52. qualis/review/__init__.py +0 -0
  53. qualis/review/state_machine.py +81 -0
  54. qualis-0.3.1.dist-info/METADATA +38 -0
  55. qualis-0.3.1.dist-info/RECORD +58 -0
  56. qualis-0.3.1.dist-info/WHEEL +4 -0
  57. qualis-0.3.1.dist-info/entry_points.txt +2 -0
  58. qualis-0.3.1.dist-info/licenses/LICENSE +176 -0
qualis/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """Qualis — Data quality framework that tells you WHAT failed."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.3.1"
6
+
7
+ from qualis.domain.enums import DQDimension, RunStatus, Severity
8
+ from qualis.domain.models import DatasetScore, Rule, Violation
9
+ from qualis.ports.database import DatabasePort
10
+ from qualis.ports.notifier import NotifierPort
11
+
12
+ __all__ = [
13
+ "DQDimension",
14
+ "DatabasePort",
15
+ "DatasetScore",
16
+ "NotifierPort",
17
+ "Rule",
18
+ "RunStatus",
19
+ "Severity",
20
+ "Violation",
21
+ "__version__",
22
+ ]
File without changes
@@ -0,0 +1,144 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from rich.console import Console
6
+ from rich.panel import Panel
7
+ from rich.table import Table
8
+
9
+ if TYPE_CHECKING:
10
+ from qualis.domain.models import DatasetScore
11
+ from qualis.engine.diff import ScoreDiff
12
+
13
+
14
def print_score(score: DatasetScore, console: Console | None = None) -> None:
    """Render *score* as a rich terminal scorecard.

    The output consists of a header panel (overall score and status), a
    per-dimension table and, when there are violations, a one-line summary.

    Parameters
    ----------
    score:
        The ``DatasetScore`` to render.
    console:
        Optional ``rich.console.Console`` to print to. When omitted (or
        falsy) a fresh default ``Console`` is created.
    """
    out = console or Console()

    # The overall score is truncated (not rounded) to a whole percentage and
    # the status band is chosen from that truncated value.
    overall = int(score.aggregate_score * 100)
    if overall < 70:
        color, status = "red", "FAILING"
    elif overall < 90:
        color, status = "yellow", "WARNING"
    else:
        color, status = "green", "PASSING"

    banner = "\n".join(
        [
            "[bold magenta]QUALIS[/] · Data Quality Report",
            "",
            f" Score: [bold {color}]{overall} / 100[/]",
            f" Status: [{color}]● {status}[/]",
        ]
    )
    out.print(Panel(banner, border_style="bright_black", padding=(1, 4)))

    table = Table(show_header=True, header_style="bold")
    column_specs = (
        ("Dimension", {"style": "cyan", "min_width": 16}),
        ("Score", {"justify": "right", "min_width": 8}),
        ("Checks", {"justify": "right", "min_width": 10}),
        ("", {"min_width": 3}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    for dim in score.dimension_scores:
        dim_pct = int(dim.score * 100)
        # Unlike the header, the per-dimension marker is banded on the raw
        # fractional score rather than on the truncated percentage.
        if dim.score >= 0.9:
            marker = "[green]✓[/]"
        elif dim.score >= 0.7:
            marker = "[yellow]⚠[/]"
        else:
            marker = "[red]✗[/]"
        table.add_row(
            dim.dimension.value.capitalize(),
            f"{dim_pct}%",
            f"{dim.passed}/{dim.total_checks}",
            marker,
        )

    out.print(table)

    if score.total_violations > 0:
        summary = (
            f"\n[bold]{score.total_violations} violation(s)[/] "
            f"({score.critical_violations} critical)"
        )
        out.print(summary)
70
+
71
+
72
def print_diff(diff: ScoreDiff, console: Console | None = None) -> None:
    """Render the change between two score snapshots as rich terminal output.

    Prints a header panel with the before/after aggregate scores, a table
    with one row per dimension delta and, when the violation count changed,
    a one-line violations summary.

    Parameters
    ----------
    diff:
        The ``ScoreDiff`` to render.
    console:
        Optional ``rich.console.Console`` to print to. When omitted (or
        falsy) a fresh default ``Console`` is created.
    """

    def delta_markup(points: int) -> str:
        # Shared markup for a whole-point delta: green up-arrow for gains,
        # red down-arrow for losses, dim dash for no change.
        if points > 0:
            return f"[green]↑ +{points}[/]"
        if points < 0:
            return f"[red]↓ {points}[/]"
        return "[dim]—[/]"

    out = console or Console()

    # Aggregates are truncated to whole percentages before being compared.
    before_total = int(diff.before_aggregate * 100)
    after_total = int(diff.after_aggregate * 100)
    total_delta = after_total - before_total

    banner = "\n".join(
        [
            "[bold magenta]QUALIS[/] · Score Diff",
            "",
            f" Before: [bold]{before_total} / 100[/]",
            f" After: [bold]{after_total} / 100[/]",
            f" Delta: {delta_markup(total_delta)}",
        ]
    )
    out.print(Panel(banner, border_style="bright_black", padding=(1, 4)))

    table = Table(show_header=True, header_style="bold")
    table.add_column("Dimension", style="cyan", min_width=16)
    for heading in ("Before", "After", "Delta"):
        table.add_column(heading, justify="right", min_width=8)
    table.add_column("", min_width=3)

    for entry in diff.dimension_deltas:
        # A missing side of the comparison is rendered as a dash.
        if entry.before_score is not None:
            before_cell = f"{int(entry.before_score * 100)}%"
        else:
            before_cell = "—"
        if entry.after_score is not None:
            after_cell = f"{int(entry.after_score * 100)}%"
        else:
            after_cell = "—"

        points = int(entry.delta * 100)
        # Only a regression gets the red cross; unchanged counts as OK.
        marker = "[red]✗[/]" if points < 0 else "[green]✓[/]"

        table.add_row(
            entry.dimension.value.capitalize(),
            before_cell,
            after_cell,
            delta_markup(points),
            marker,
        )

    out.print(table)

    violation_change = diff.after_violations - diff.before_violations
    if violation_change == 0:
        return

    # More violations is bad (red, explicit plus sign); fewer is good.
    if violation_change > 0:
        sign, color = "+", "red"
    else:
        sign, color = "", "green"
    out.print(
        f"\nViolations: [bold]{diff.before_violations}[/] → "
        f"[bold]{diff.after_violations}[/] "
        f"([{color}]{sign}{violation_change}[/])"
    )
File without changes
@@ -0,0 +1,208 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ import duckdb
6
+
7
+ from qualis.adapters.duckdb.sql_templates import (
8
+ BETWEEN_SQL,
9
+ IN_SET_SQL,
10
+ NOT_NEGATIVE_SQL,
11
+ NOT_NULL_SQL,
12
+ REFERENCE_LOOKUP_SQL,
13
+ REGEX_SQL,
14
+ ROW_COUNT_SQL,
15
+ TABLE_EXISTS_SQL,
16
+ UNIQUE_SQL,
17
+ )
18
+
19
+ if TYPE_CHECKING:
20
+ from collections.abc import Iterator
21
+
22
+
23
+ def _qualified(schema: str, table: str) -> str:
24
+ """Return a qualified table reference for use inside SQL strings."""
25
+ if schema:
26
+ return f'"{schema}"."{table}"'
27
+ return f'"{table}"'
28
+
29
+
30
+ class DuckDBAdapter:
31
+ """DuckDB-backed implementation of ``DatabasePort``.
32
+
33
+ Parameters
34
+ ----------
35
+ database:
36
+ Path to a persistent DuckDB file, or ``":memory:"`` (default) for an
37
+ in-process, session-only database.
38
+ """
39
+
40
+ def __init__(self, database: str = ":memory:") -> None:
41
+ self._con = duckdb.connect(database)
42
+
43
+ # ------------------------------------------------------------------
44
+ # Registration helpers
45
+ # ------------------------------------------------------------------
46
+
47
+ def register_csv(self, table_name: str, path: str) -> None:
48
+ """Create a table from a CSV file using DuckDB's auto-detection."""
49
+ self._con.execute(
50
+ f'CREATE TABLE "{table_name}" AS SELECT * FROM read_csv_auto(\'{path}\')'
51
+ )
52
+
53
+ def register_parquet(self, table_name: str, path: str) -> None:
54
+ """Create a table from a Parquet file."""
55
+ self._con.execute(
56
+ f'CREATE TABLE "{table_name}" AS SELECT * FROM read_parquet(\'{path}\')'
57
+ )
58
+
59
+ # ------------------------------------------------------------------
60
+ # DatabasePort implementation
61
+ # ------------------------------------------------------------------
62
+
63
+ def query(self, sql: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
64
+ """Execute *sql* and return a list of dicts keyed by column name."""
65
+ rel = self._con.execute(sql)
66
+ columns = [desc[0] for desc in rel.description or []]
67
+ return [dict(zip(columns, row, strict=True)) for row in rel.fetchall()]
68
+
69
+ def execute(self, sql: str, params: dict[str, Any] | None = None) -> int:
70
+ """Execute a DML statement; return the number of rows affected."""
71
+ self._con.execute(sql)
72
+ # DuckDB does not expose rowcount directly; return 0 as a safe default
73
+ return 0
74
+
75
+ def stream(
76
+ self,
77
+ sql: str,
78
+ params: dict[str, Any] | None = None,
79
+ chunk_size: int = 10_000,
80
+ ) -> Iterator[list[dict[str, Any]]]:
81
+ """Stream query results in chunks of *chunk_size* rows."""
82
+ rel = self._con.execute(sql)
83
+ columns = [desc[0] for desc in rel.description or []]
84
+ while True:
85
+ rows = rel.fetchmany(chunk_size)
86
+ if not rows:
87
+ break
88
+ yield [dict(zip(columns, row, strict=True)) for row in rows]
89
+
90
+ def table_exists(self, schema: str, table: str) -> bool:
91
+ """Return True when the named table is registered in the database."""
92
+ result = self._con.execute(
93
+ TABLE_EXISTS_SQL.format(table=table)
94
+ ).fetchone()
95
+ return bool(result and result[0] > 0)
96
+
97
+ # ------------------------------------------------------------------
98
+ # Check methods
99
+ # ------------------------------------------------------------------
100
+
101
+ def check_not_null(self, schema: str, table: str, column: str) -> dict[str, int]:
102
+ table_ref = _qualified(schema, table)
103
+ sql = NOT_NULL_SQL.format(column=column, table=table_ref)
104
+ row = self._con.execute(sql).fetchone()
105
+ null_count, total_count = (row[0], row[1]) if row else (0, 0)
106
+ return {"null_count": int(null_count), "total_count": int(total_count)}
107
+
108
+ def check_unique(self, schema: str, table: str, column: str) -> dict[str, int]:
109
+ table_ref = _qualified(schema, table)
110
+ sql = UNIQUE_SQL.format(column=column, table=table_ref)
111
+ row = self._con.execute(sql).fetchone()
112
+ if row is None:
113
+ # No duplicate groups at all — zero duplicates
114
+ total_row = self._con.execute(
115
+ f"SELECT COUNT(*) FROM {table_ref}"
116
+ ).fetchone()
117
+ total = int(total_row[0]) if total_row else 0
118
+ return {"duplicate_count": 0, "total_count": total}
119
+ dup_count = int(row[0])
120
+ # Re-query total so it is accurate regardless of UNIQUE_SQL shape
121
+ total_row = self._con.execute(
122
+ f"SELECT COUNT(*) FROM {table_ref}"
123
+ ).fetchone()
124
+ total = int(total_row[0]) if total_row else 0
125
+ return {"duplicate_count": dup_count, "total_count": total}
126
+
127
+ def check_between(
128
+ self,
129
+ schema: str,
130
+ table: str,
131
+ column: str,
132
+ min_val: str,
133
+ max_val: str,
134
+ ) -> dict[str, int]:
135
+ table_ref = _qualified(schema, table)
136
+ sql = BETWEEN_SQL.format(
137
+ column=column, table=table_ref, min_val=min_val, max_val=max_val
138
+ )
139
+ row = self._con.execute(sql).fetchone()
140
+ out_of_range, total = (row[0], row[1]) if row else (0, 0)
141
+ return {"out_of_range_count": int(out_of_range), "total_count": int(total)}
142
+
143
+ def check_regex(
144
+ self,
145
+ schema: str,
146
+ table: str,
147
+ column: str,
148
+ pattern: str,
149
+ ) -> dict[str, int]:
150
+ table_ref = _qualified(schema, table)
151
+ sql = REGEX_SQL.format(column=column, table=table_ref, pattern=pattern)
152
+ row = self._con.execute(sql).fetchone()
153
+ non_matching, total = (row[0], row[1]) if row else (0, 0)
154
+ return {"non_matching_count": int(non_matching), "total_count": int(total)}
155
+
156
+ def check_in_set(
157
+ self,
158
+ schema: str,
159
+ table: str,
160
+ column: str,
161
+ values: list[str],
162
+ ) -> dict[str, int]:
163
+ table_ref = _qualified(schema, table)
164
+ escaped = [v.replace("'", "''") for v in values]
165
+ value_list = ", ".join(f"'{v}'" for v in escaped)
166
+ sql = IN_SET_SQL.format(column=column, table=table_ref, value_list=value_list)
167
+ row = self._con.execute(sql).fetchone()
168
+ invalid, total = (row[0], row[1]) if row else (0, 0)
169
+ return {"invalid_count": int(invalid), "total_count": int(total)}
170
+
171
+ def check_row_count(self, schema: str, table: str) -> dict[str, int]:
172
+ table_ref = _qualified(schema, table)
173
+ sql = ROW_COUNT_SQL.format(table=table_ref)
174
+ row = self._con.execute(sql).fetchone()
175
+ count = int(row[0]) if row else 0
176
+ return {"row_count": count}
177
+
178
+ def check_not_negative(self, schema: str, table: str, column: str) -> dict[str, int]:
179
+ table_ref = _qualified(schema, table)
180
+ sql = NOT_NEGATIVE_SQL.format(column=column, table=table_ref)
181
+ row = self._con.execute(sql).fetchone()
182
+ negative, total = (row[0], row[1]) if row else (0, 0)
183
+ return {"negative_count": int(negative), "total_count": int(total)}
184
+
185
+ def check_reference_lookup(
186
+ self,
187
+ schema: str,
188
+ table: str,
189
+ column: str,
190
+ valid_values: list[str],
191
+ ) -> dict[str, int]:
192
+ table_ref = _qualified(schema, table)
193
+ escaped = [v.replace("'", "''") for v in valid_values]
194
+ value_list = ", ".join(f"'{v}'" for v in escaped) or "NULL"
195
+ sql = REFERENCE_LOOKUP_SQL.format(
196
+ column=column, table=table_ref, value_list=value_list,
197
+ )
198
+ row = self._con.execute(sql).fetchone()
199
+ invalid, total = (row[0], row[1]) if row else (0, 0)
200
+ return {"invalid_count": int(invalid), "total_count": int(total)}
201
+
202
+ # ------------------------------------------------------------------
203
+ # Lifecycle
204
+ # ------------------------------------------------------------------
205
+
206
+ def close(self) -> None:
207
+ """Close the underlying DuckDB connection."""
208
+ self._con.close()
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ # ---------------------------------------------------------------------------
4
+ # DuckDB SQL templates
5
+ # ---------------------------------------------------------------------------
6
+ # All column references use double-quoted identifiers to handle names that
7
+ # are reserved keywords or contain special characters.
8
+
9
# Placeholders: {column} is a bare column name (the templates add the double
# quotes); {table} is an already-quoted table reference; {min_val},
# {max_val}, {pattern} and {table} in TABLE_EXISTS_SQL are placed inside
# single quotes by the templates; {value_list} is a ready-made list of
# quoted literals.

# NULL count and total row count for one column.
NOT_NULL_SQL = (
    'SELECT COUNT(*) FILTER (WHERE "{column}" IS NULL) AS null_count, '
    "COUNT(*) AS total_count "
    "FROM {table}"
)

# duplicate_count is the number of distinct values that occur more than
# once (one per duplicated group); total_count is the table's row count.
UNIQUE_SQL = (
    "SELECT COUNT(*) AS duplicate_count, "
    "(SELECT COUNT(*) FROM {table}) AS total_count "
    "FROM ("
    ' SELECT "{column}" FROM {table} '
    ' GROUP BY "{column}" HAVING COUNT(*) > 1'
    ") dup"
)

# NOTE(review): both sides are compared as VARCHAR, i.e. lexicographically,
# so numeric bounds of different widths do not order numerically.
BETWEEN_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE CAST("{column}" AS VARCHAR) < \'{min_val}\' '
    ' OR CAST("{column}" AS VARCHAR) > \'{max_val}\''
    " ) AS out_of_range_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# NULLs count as non-matching.
REGEX_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NULL '
    ' OR NOT regexp_matches(CAST("{column}" AS VARCHAR), \'{pattern}\')'
    " ) AS non_matching_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# NULLs count as invalid.
IN_SET_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NULL '
    ' OR CAST("{column}" AS VARCHAR) NOT IN ({value_list})'
    " ) AS invalid_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

ROW_COUNT_SQL = "SELECT COUNT(*) AS row_count FROM {table}"

# NULLs are not counted as negative.
NOT_NEGATIVE_SQL = (
    "SELECT "
    ' COUNT(*) FILTER (WHERE "{column}" IS NOT NULL AND "{column}" < 0) '
    " AS negative_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# Unlike IN_SET_SQL, NULLs are skipped rather than counted as invalid.
REFERENCE_LOOKUP_SQL = (
    "SELECT "
    " COUNT(*) FILTER ("
    ' WHERE "{column}" IS NOT NULL '
    ' AND CAST("{column}" AS VARCHAR) NOT IN ({value_list})'
    " ) AS invalid_count, "
    " COUNT(*) AS total_count "
    "FROM {table}"
)

# Matches on table name only; no schema filter is applied.
TABLE_EXISTS_SQL = (
    "SELECT COUNT(*) AS cnt FROM information_schema.tables "
    "WHERE table_name = '{table}'"
)
File without changes
@@ -0,0 +1,146 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ if TYPE_CHECKING:
7
+ from collections.abc import Iterator
8
+
9
+
10
+ class InMemoryAdapter:
11
+ def __init__(self) -> None:
12
+ self._tables: dict[str, list[dict[str, Any]]] = {}
13
+
14
+ def add_table(self, schema: str, table: str, rows: list[dict[str, Any]]) -> None:
15
+ self._tables[f"{schema}.{table}"] = list(rows)
16
+
17
+ def _get_rows(self, schema: str, table: str) -> list[dict[str, Any]]:
18
+ key = f"{schema}.{table}"
19
+ if key not in self._tables:
20
+ raise ValueError(f"Table {key} not found")
21
+ return self._tables[key]
22
+
23
+ def query(self, sql: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]:
24
+ table_key = self._extract_table_from_sql(sql)
25
+ if table_key and table_key in self._tables:
26
+ return list(self._tables[table_key])
27
+ return []
28
+
29
+ def execute(self, sql: str, params: dict[str, Any] | None = None) -> int:
30
+ return 0
31
+
32
+ def stream(
33
+ self,
34
+ sql: str,
35
+ params: dict[str, Any] | None = None,
36
+ chunk_size: int = 10_000,
37
+ ) -> Iterator[list[dict[str, Any]]]:
38
+ table_key = self._extract_table_from_sql(sql)
39
+ if table_key and table_key in self._tables:
40
+ rows = self._tables[table_key]
41
+ for i in range(0, len(rows), chunk_size):
42
+ yield rows[i : i + chunk_size]
43
+
44
+ def table_exists(self, schema: str, table: str) -> bool:
45
+ return f"{schema}.{table}" in self._tables
46
+
47
+ def check_not_null(self, schema: str, table: str, column: str) -> dict[str, int]:
48
+ rows = self._get_rows(schema, table)
49
+ null_count = sum(1 for r in rows if r.get(column) is None)
50
+ return {"null_count": null_count, "total_count": len(rows)}
51
+
52
+ def check_unique(self, schema: str, table: str, column: str) -> dict[str, int]:
53
+ rows = self._get_rows(schema, table)
54
+ values = [r.get(column) for r in rows if r.get(column) is not None]
55
+ duplicate_count = len(values) - len(set(values))
56
+ return {"duplicate_count": duplicate_count, "total_count": len(rows)}
57
+
58
+ def check_between(
59
+ self,
60
+ schema: str,
61
+ table: str,
62
+ column: str,
63
+ min_val: str,
64
+ max_val: str,
65
+ ) -> dict[str, int]:
66
+ rows = self._get_rows(schema, table)
67
+ out_count = 0
68
+ checked = 0
69
+ for r in rows:
70
+ val = r.get(column)
71
+ if val is None:
72
+ continue
73
+ checked += 1
74
+ if str(val) < min_val or str(val) > max_val:
75
+ out_count += 1
76
+ return {"out_of_range_count": out_count, "total_count": len(rows), "checked": checked}
77
+
78
+ def check_regex(
79
+ self,
80
+ schema: str,
81
+ table: str,
82
+ column: str,
83
+ pattern: str,
84
+ ) -> dict[str, int]:
85
+ rows = self._get_rows(schema, table)
86
+ compiled = re.compile(pattern)
87
+ non_matching = sum(
88
+ 1
89
+ for r in rows
90
+ if r.get(column) is None or not compiled.match(str(r.get(column)))
91
+ )
92
+ return {"non_matching_count": non_matching, "total_count": len(rows)}
93
+
94
+ def check_in_set(
95
+ self,
96
+ schema: str,
97
+ table: str,
98
+ column: str,
99
+ values: list[str],
100
+ ) -> dict[str, int]:
101
+ rows = self._get_rows(schema, table)
102
+ allowed = set(values)
103
+ invalid_count = sum(
104
+ 1
105
+ for r in rows
106
+ if r.get(column) is None or str(r.get(column)) not in allowed
107
+ )
108
+ return {"invalid_count": invalid_count, "total_count": len(rows)}
109
+
110
+ def check_row_count(self, schema: str, table: str) -> dict[str, int]:
111
+ rows = self._get_rows(schema, table)
112
+ return {"row_count": len(rows)}
113
+
114
+ def check_not_negative(
115
+ self,
116
+ schema: str,
117
+ table: str,
118
+ column: str,
119
+ ) -> dict[str, int]:
120
+ rows = self._get_rows(schema, table)
121
+ negative_count = sum(
122
+ 1
123
+ for r in rows
124
+ if r.get(column) is not None and (r[column]) < 0
125
+ )
126
+ return {"negative_count": negative_count, "total_count": len(rows)}
127
+
128
+ def check_reference_lookup(
129
+ self,
130
+ schema: str,
131
+ table: str,
132
+ column: str,
133
+ valid_values: list[str],
134
+ ) -> dict[str, int]:
135
+ rows = self._get_rows(schema, table)
136
+ valid_set = set(valid_values)
137
+ invalid_count = sum(
138
+ 1
139
+ for r in rows
140
+ if r.get(column) is not None and r.get(column) not in valid_set
141
+ )
142
+ return {"invalid_count": invalid_count, "total_count": len(rows)}
143
+
144
+ def _extract_table_from_sql(self, sql: str) -> str | None:
145
+ match = re.search(r"FROM\s+(\S+)", sql, re.IGNORECASE)
146
+ return match.group(1) if match else None
@@ -0,0 +1,25 @@
1
+ """In-memory ReferenceDataPort -- dict-backed, for testing and small datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class InMemoryReferenceData:
    """Dictionary-backed ``ReferenceDataPort``.

    Reference values are stored per ``(reference, key_column)`` pair. Load
    them with ``register(reference, key_column, values)`` and read them back
    with ``load_values(reference, key_column)``. Both directions copy the
    list, so callers never share a list object with the store.
    """

    def __init__(self) -> None:
        self._data: dict[tuple[str, str], list[str]] = {}

    def register(self, reference: str, key_column: str, values: list[str]) -> None:
        """Store a copy of *values*, replacing any earlier registration."""
        self._data[reference, key_column] = [*values]

    def load_values(self, reference: str, key_column: str) -> list[str]:
        """Return a copy of the values registered for the given pair.

        Raises
        ------
        KeyError
            If nothing was registered for ``(reference, key_column)``.
        """
        stored = self._data.get((reference, key_column))
        if stored is None:
            raise KeyError(
                f"Reference {reference!r} with key column {key_column!r} not registered"
            )
        return stored.copy()
File without changes