semql-validate-db 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ """Pre-deploy drift checker for semql catalogues.
2
+
3
+ ``validate_against_db(catalog, connection=...)`` runs cheap probe
4
+ queries against a live database and returns a list of
5
+ ``DbValidationError`` findings — one per cube / field / join that
6
+ broke. Use it as a CI gate before promoting a catalogue change.
7
+
8
+ This package is driver-agnostic: the ``connection`` argument is any
9
+ DB-API 2.0 connection. Wire your own ``psycopg.connect`` /
10
+ ``clickhouse_connect.dbapi.connect`` / ``duckdb.connect`` and pass it in.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from semql_validate_db._validate import (
16
+ DbValidationCode,
17
+ DbValidationError,
18
+ validate_against_db,
19
+ )
20
+
21
+ __all__ = ["DbValidationCode", "DbValidationError", "validate_against_db"]
@@ -0,0 +1,315 @@
1
+ """Catalogue ↔ database drift checker.
2
+
3
+ The compiler is intentionally pure (no I/O). This package picks up the
4
+ class of bugs the compiler can't see: an upstream column rename, a
5
+ dropped table, a join predicate that suddenly compares incompatible
6
+ types. Suitable as a pre-deploy CI gate.
7
+
8
+ Strategy: for each cube, run cheap ``LIMIT 0`` probes against the
9
+ target database — one for the table itself, one per measure /
10
+ dimension SQL fragment, one for ``base_predicate``. Any SQL error
11
+ gets translated into a ``DbValidationError`` naming the cube and the
12
+ field that broke; the function collects all of them rather than
13
+ short-circuiting so a single run surfaces the full drift picture.
14
+
15
+ Transport: accepts any DB-API 2.0 connection. Callers bring their
16
+ own driver (psycopg, clickhouse-connect, duckdb) so this package
17
+ stays driver-agnostic.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import contextlib
23
+ import re
24
+ from dataclasses import dataclass
25
+ from typing import Literal, Protocol
26
+
27
+ from semql.catalog import Catalog
28
+ from semql.introspect import iter_cubes, iter_fields, iter_joins
29
+ from semql.model import Cube, Dimension, Join, Measure, TimeDimension
30
+
31
# Closed set of machine-readable finding codes carried by
# ``DbValidationError.code``.
DbValidationCode = Literal[
    "missing_table",  # the cube's table probe failed
    "missing_column",  # a measure / dimension / time-dimension fragment probe failed
    "base_predicate_invalid",  # the cube's ``base_predicate`` probe failed
    "join_predicate_invalid",  # a join's ``on`` predicate probe failed
    "required_filter_dimension_missing",  # static catalogue check; no probe involved
]
38
+
39
+
40
@dataclass(frozen=True)
class DbValidationError:
    """A single drift finding reported by the validator.

    Instances are immutable (``frozen=True``). Despite the name this is
    a plain record that gets *returned* in a list, not an exception that
    gets raised.
    """

    # Machine-readable category of the finding.
    code: DbValidationCode
    # Name of the cube the probe ran against; always set.
    cube: str
    # Measure / dimension / join target that was being checked, or ``None``
    # for cube-level findings (``missing_table``, ``base_predicate_invalid``).
    field: str | None
    # Human-readable description of what failed.
    message: str
    # The database's own error text, so callers can show it to the user
    # without re-parsing ``message``. ``None`` when no probe was involved.
    detail: str | None = None
57
+
58
+
59
class _Cursor(Protocol):
    """Structural type for the slice of a DB-API 2.0 cursor used here.

    Only ``execute`` and ``close`` are called, so nothing else is
    required of the driver's cursor object.
    """

    def execute(self, sql: str, /) -> object:
        """Run ``sql``.

        Drivers disagree on what ``execute`` returns; the result is
        never inspected, hence the deliberately loose ``object``.
        """
        ...

    def close(self) -> None:
        """Release the cursor."""
        ...
65
+
66
+
67
class _Connection(Protocol):
    """Structural type for the connection handed in by the caller.

    The only requirement is a ``cursor()`` factory.
    """

    def cursor(self) -> _Cursor:
        """Return a fresh cursor for running one probe."""
        ...
69
+
70
+
71
# ``{key}`` where ``key`` is a lowercase identifier: a lowercase letter or
# underscore, followed by lowercase letters, digits or underscores.
_PLACEHOLDER_RE = re.compile(r"\{([a-z_][a-z0-9_]*)\}")


def _resolve_placeholders(sql: str, lookup: dict[str, str]) -> str:
    """Replace each ``{key}`` in ``sql`` with ``lookup[key]``.

    Intended to mirror ``semql.compile._resolve_sql``: ``{alias}`` and
    ``{cube_name}`` map to the alias, and other keys come from the
    caller-supplied context. A placeholder whose key is absent from
    ``lookup`` is kept verbatim; the probe built from it then fails at
    the database, and that failure is the drift signal.
    """
    return _PLACEHOLDER_RE.sub(
        lambda match: lookup.get(match.group(1), match.group(0)),
        sql,
    )
89
+
90
+
91
def _probe(
    connection: _Connection,
    sql: str,
) -> tuple[bool, str | None]:
    """Run one SQL probe on a fresh cursor and report whether it executed.

    Args:
        connection: Object exposing ``cursor()``. If it also exposes a
            callable ``rollback`` (DB-API 2.0 connections usually do),
            that is invoked after a failed probe.
        sql: Statement to execute verbatim. Adding ``LIMIT 0`` is the
            caller's responsibility.

    Returns:
        ``(True, None)`` when ``execute`` returned normally, otherwise
        ``(False, str(exc))`` for whatever exception it raised.

    Note:
        On transactional drivers a failed statement can leave the
        connection's transaction aborted, so every later probe would
        fail too and be misreported as drift. The best-effort rollback
        after a failure resets that state. It also discards any
        uncommitted work the caller had pending on this connection, so
        hand the validator a dedicated (or autocommit) connection.
    """
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
    except Exception as exc:  # noqa: BLE001 — any DB-API error counts as drift
        failure: str | None = str(exc)
    else:
        failure = None
    finally:
        # Driver cleanup quirks vary; a flaky close() must not mask a
        # real drift finding, so swallow whatever the driver throws here.
        with contextlib.suppress(Exception):
            cursor.close()

    if failure is None:
        return True, None

    # ``rollback`` is optional in DB-API 2.0 and absent on some client
    # objects, so look it up defensively and never let it raise.
    rollback = getattr(connection, "rollback", None)
    if callable(rollback):
        with contextlib.suppress(Exception):
            rollback()
    return False, failure
112
+
113
+
114
def _cube_lookup(cube: Cube, context: dict[str, str]) -> dict[str, str]:
    """Build the placeholder table for one cube's SQL fragments.

    Both the cube's alias and its name resolve to the alias. ``context``
    entries are applied last, so a context key equal to the alias or
    the name replaces that entry.
    """
    substitutions: dict[str, str] = {cube.alias: cube.alias}
    substitutions[cube.name] = cube.alias
    substitutions.update(context)
    return substitutions
121
+
122
+
123
def _validate_required_filters(cube: Cube) -> list[DbValidationError]:
    """Report every ``required_filters`` entry that names no dimension.

    Purely static: it compares ``cube.required_filters`` with the names
    in ``cube.dimensions`` and never touches the database. It runs with
    the live checks so a pre-deploy gate still catches a typo that an
    earlier static-validation pass missed.

    Returns one ``required_filter_dimension_missing`` finding per
    unmatched entry, in declaration order; an empty list when all match.
    """
    known = {dimension.name for dimension in cube.dimensions}
    return [
        DbValidationError(
            code="required_filter_dimension_missing",
            cube=cube.name,
            field=missing,
            message=(
                f"Cube {cube.name!r} declares required_filters=[{missing!r}] "
                f"but has no dimension by that name. Known: {sorted(known)}."
            ),
        )
        for missing in cube.required_filters
        if missing not in known
    ]
146
+
147
+
148
def _validate_cube(
    cube: Cube,
    connection: _Connection,
    context: dict[str, str],
) -> list[DbValidationError]:
    """Run every per-cube probe and return the findings.

    Probes run in a fixed order: the table, then each field's SQL
    fragment, then ``base_predicate``. A table that fails its probe
    yields a single ``missing_table`` finding and ends the cube's run,
    because every later probe selects from that same table and would
    only repeat the error. Failures after that point accumulate so one
    run reports everything that is wrong with the cube.
    """
    substitutions = _cube_lookup(cube, context)
    table = _resolve_placeholders(cube.table, substitutions)
    # Shared FROM target for every probe below.
    relation = f"{table} AS {cube.alias}"

    table_ok, table_detail = _probe(connection, f"SELECT * FROM {relation} LIMIT 0")
    if not table_ok:
        return [
            DbValidationError(
                code="missing_table",
                cube=cube.name,
                field=None,
                message=(
                    f"Cube {cube.name!r}: table {table!r} did not respond to a "
                    "trivial SELECT — likely missing, renamed, or "
                    "inaccessible to the connection's role."
                ),
                detail=table_detail,
            )
        ]

    findings: list[DbValidationError] = []

    for member in iter_fields(cube):
        if isinstance(member, Measure):
            # A ratio measure has no fragment of its own, and ``count(*)``
            # is already covered by the table probe. The ratio test comes
            # first so ``member.sql`` is never touched for ratio measures.
            if member.agg == "ratio" or (
                member.agg == "count" and member.sql.strip() == "*"
            ):
                continue
            kind = "measure"
        elif isinstance(member, TimeDimension):
            # Checked ahead of ``Dimension``; the order of these two
            # branches is deliberate and must be kept.
            kind = "time_dimension"
        elif isinstance(member, Dimension):
            kind = "dimension"
        else:
            # Anything else ``iter_fields`` yields (segments): those
            # are predicates rather than projections, so they get no
            # fragment-as-SELECT probe.
            continue

        projection = _resolve_placeholders(member.sql, substitutions)
        field_ok, field_detail = _probe(
            connection, f"SELECT {projection} FROM {relation} LIMIT 0"
        )
        if field_ok:
            continue
        findings.append(
            DbValidationError(
                code="missing_column",
                cube=cube.name,
                field=member.name,
                message=(
                    f"Cube {cube.name!r}, {kind} {member.name!r}: SQL "
                    f"fragment {member.sql!r} did not execute against the table."
                ),
                detail=field_detail,
            )
        )

    if cube.base_predicate:
        condition = _resolve_placeholders(cube.base_predicate, substitutions)
        predicate_ok, predicate_detail = _probe(
            connection,
            f"SELECT 1 FROM {relation} WHERE {condition} LIMIT 0",
        )
        if not predicate_ok:
            findings.append(
                DbValidationError(
                    code="base_predicate_invalid",
                    cube=cube.name,
                    field=None,
                    message=(
                        f"Cube {cube.name!r}: base_predicate {cube.base_predicate!r} "
                        "did not execute against the table."
                    ),
                    detail=predicate_detail,
                )
            )

    return findings
239
+
240
+
241
def _validate_join(
    source: Cube,
    join: Join,
    target: Cube,
    connection: _Connection,
    context: dict[str, str],
) -> list[DbValidationError]:
    """Check one join by probing ``source LEFT JOIN target ON <join.on>``.

    The probe carries ``LIMIT 0``, so it passes when the database
    accepts the predicate against the real columns on both sides.
    Unknown columns and incompatible types are reported as a single
    ``join_predicate_invalid`` finding attributed to the source cube,
    with ``field`` set to ``join.to``. Returns an empty list when the
    probe succeeds.
    """
    source_keys = _cube_lookup(source, context)
    target_keys = _cube_lookup(target, context)
    left_table = _resolve_placeholders(source.table, source_keys)
    right_table = _resolve_placeholders(target.table, target_keys)
    # The predicate may reference either cube; on a key clash the
    # target's entry wins because it is merged last.
    predicate = _resolve_placeholders(join.on, source_keys | target_keys)

    probe_sql = " ".join(
        (
            f"SELECT 1 FROM {left_table} AS {source.alias}",
            f"LEFT JOIN {right_table} AS {target.alias} ON {predicate} LIMIT 0",
        )
    )
    ok, detail = _probe(connection, probe_sql)
    if not ok:
        return [
            DbValidationError(
                code="join_predicate_invalid",
                cube=source.name,
                field=join.to,
                message=(
                    f"Cube {source.name!r} → {join.to!r}: join predicate "
                    f"{join.on!r} did not execute against the joined tables."
                ),
                detail=detail,
            )
        ]
    return []
278
+
279
+
280
def validate_against_db(
    catalog: Catalog,
    *,
    connection: _Connection,
    context: dict[str, str] | None = None,
) -> list[DbValidationError]:
    """Check every cube and join in ``catalog`` against a live database.

    Args:
        catalog: The catalogue to validate.
        connection: Caller-owned connection; only ``cursor()`` is
            required of it.
        context: Values for ``{key}`` placeholders in catalogue SQL,
            e.g. ``{"schema": "analytics"}`` for a cube whose ``table``
            is ``"{schema}.orders"``. ``None`` (or an empty mapping)
            means no extra substitutions.

    Returns:
        All findings, cubes first and joins after; empty when nothing
        drifted. Each finding names the cube / field involved and, for
        probe failures, carries the database's own message in
        ``detail`` for routing into a CI log.

    META reflection cubes are not validated: ``iter_cubes`` leaves them
    out by default because they live in memory rather than in the
    physical database.
    """
    substitutions = context or {}
    findings: list[DbValidationError] = []

    for cube in iter_cubes(catalog):
        # The static check runs first, then the live probes for the cube.
        findings += _validate_required_filters(cube)
        findings += _validate_cube(cube, connection, substitutions)

    for source, join, target in iter_joins(catalog):
        findings += _validate_join(source, join, target, connection, substitutions)

    return findings
309
+
310
+
311
+ __all__ = [
312
+ "DbValidationCode",
313
+ "DbValidationError",
314
+ "validate_against_db",
315
+ ]
File without changes
@@ -0,0 +1,122 @@
1
+ Metadata-Version: 2.4
2
+ Name: semql-validate-db
3
+ Version: 0.1.0
4
+ Summary: Validate a semql Catalog against a live database — catches missing tables, dropped columns, and broken join predicates before a deploy.
5
+ Author: Nikhil Pallamreddy
6
+ Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
7
+ License-Expression: BSD-3-Clause
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Database
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Classifier: Topic :: Software Development :: Quality Assurance
18
+ Classifier: Typing :: Typed
19
+ Requires-Dist: semql>=0.1.0,<0.2
20
+ Requires-Python: >=3.12
21
+ Project-URL: Homepage, https://github.com/npalladium/semql
22
+ Project-URL: Repository, https://github.com/npalladium/semql
23
+ Project-URL: Issues, https://github.com/npalladium/semql/issues
24
+ Description-Content-Type: text/markdown
25
+
26
+ # semql-validate-db
27
+
28
+ Pre-deploy drift checker for [`semql`](../semql) catalogues. Runs cheap
29
+ probe queries against a live database and surfaces the class of bugs
30
+ the compiler can't see — missing tables, dropped columns, broken join
31
+ predicates, base-predicate drift.
32
+
33
+ `semql` is intentionally pure (PHILOSOPHY: "the compiler has no I/O").
34
+ That keeps the compiler simple, but it also means a catalog can pass
35
+ every compile-time check and still blow up at query time because
36
+ upstream renamed a column. `semql-validate-db` is the out-of-band
37
+ gate that catches it.
38
+
39
+ ## Install
40
+
41
+ ```sh
42
+ pip install semql-validate-db
43
+ ```
44
+
45
+ The package is driver-agnostic. Bring your own DB-API 2.0 connection:
46
+
47
+ ```sh
48
+ pip install psycopg # Postgres
49
+ pip install clickhouse-connect # ClickHouse
50
+ pip install duckdb # DuckDB
51
+ ```
52
+
53
+ ## Quick start
54
+
55
+ ```python
56
+ import duckdb
57
+ from semql import Backend, Catalog, Cube, Dimension, Measure, TimeDimension
58
+ from semql_validate_db import validate_against_db
59
+
60
+ orders = Cube(
61
+ name="orders",
62
+ backend=Backend.DUCKDB,
63
+ table="orders",
64
+ alias="o",
65
+ measures=[Measure(name="revenue", sql="{o}.amount", agg="sum")],
66
+ dimensions=[Dimension(name="region", sql="{o}.region", type="string")],
67
+ time_dimensions=[TimeDimension(name="created_at", sql="{o}.created_at")],
68
+ )
69
+ catalog = Catalog([orders])
70
+
71
+ conn = duckdb.connect(":memory:")
72
+ conn.execute(
73
+ "CREATE TABLE orders (amount DOUBLE, region TEXT, created_at TIMESTAMP)"
74
+ )
75
+
76
+ errors = validate_against_db(catalog, connection=conn)
77
+ for e in errors:
78
+ print(f"{e.code}: {e.cube}.{e.field or ''} — {e.message}")
79
+ ```
80
+
81
+ A clean run returns an empty list. Drift (a missing column, a renamed
82
+ table) yields one `DbValidationError` per finding so a single run
83
+ gives the full picture instead of bailing on the first failure.
84
+
85
+ ## What it catches
86
+
87
+ - `missing_table` — `cube.table` doesn't exist or the connection's
88
+ role can't see it.
89
+ - `missing_column` — a measure / dimension / time-dimension SQL
90
+ fragment references a column that no longer exists.
91
+ - `base_predicate_invalid` — `cube.base_predicate` doesn't execute.
92
+ - `join_predicate_invalid` — a `Join.on` predicate references columns
93
+ that aren't there, or compares incompatible types.
94
+ - `required_filter_dimension_missing` — static catalog check; the
95
+ named `required_filters` entry has no matching `Dimension`.
96
+
97
+ ## What it doesn't catch
98
+
99
+ - Semantic drift (a column exists but means something different now).
100
+ Schema is necessary, not sufficient.
101
+ - Cross-table referential integrity. The probes are `LIMIT 0`; they
102
+ parse, they don't sample.
103
+ - Backend-specific feature drift (a function got dialect-renamed).
104
+ Use the compiler's snapshot tests for that.
105
+
106
+ ## Why `LIMIT 0`?
107
+
108
+ Every probe runs `SELECT … LIMIT 0`. The query planner type-checks
109
+ identifiers and predicates but does no row work, so the cost is
110
+ microseconds per probe — fine for a per-cube fan-out in CI. The
111
+ trade-off is that purely runtime drift (e.g. an `enum` value that
112
+ got dropped from a check constraint) won't surface here.
113
+
114
+ ## CLI
115
+
116
+ The package is library-first and ships no CLI of its own. Call
117
+ `validate_against_db` from your deploy script, where the connection / DSN / role are already known.
118
+
119
+ ## Status
120
+
121
+ Phase A: probe-by-fragment shape. Drift findings are accurate;
122
+ performance is "fine for CI, not for runtime gates."
@@ -0,0 +1,7 @@
1
+ semql_validate_db/__init__.py,sha256=LN-tMWlMBusTROLnJUpn63ky3CAOcTyi7Nc2QFAE7Fc,733
2
+ semql_validate_db/_validate.py,sha256=C4Hq5JjqH5y0ED1rpOvn_8fCyjbxbg8T2Mc7_6Yt9e8,11102
3
+ semql_validate_db/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ semql_validate_db-0.1.0.dist-info/licenses/LICENSE,sha256=AdcAzanKVr3cVSrhBpG6gytjG0Ss1SBTQDAavLe0CRc,1505
5
+ semql_validate_db-0.1.0.dist-info/WHEEL,sha256=wXwAVsgVaOZ_pwDFqQm5Rd6PID-Fc74nkLc8X8gHiDo,81
6
+ semql_validate_db-0.1.0.dist-info/METADATA,sha256=dbLNm8BrmM056AStkUjxYWqwdf0ukiBbEEtbKaTBO08,4439
7
+ semql_validate_db-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.19
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Nikhil Pallamreddy
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.