PyPI - semql-validate-db - Versions diffs - 0.1.0__py3-none-any.whl - Mend

semql-validate-db 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

semql_validate_db/__init__.py +21 -0
semql_validate_db/_validate.py +315 -0
semql_validate_db/py.typed +0 -0
semql_validate_db-0.1.0.dist-info/METADATA +122 -0
semql_validate_db-0.1.0.dist-info/RECORD +7 -0
semql_validate_db-0.1.0.dist-info/WHEEL +4 -0
semql_validate_db-0.1.0.dist-info/licenses/LICENSE +28 -0

semql_validate_db/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""Pre-deploy drift checker for semql catalogues.
+``validate_against_db(catalog, connection=...)`` runs cheap probe
+queries against a live database and returns a list of
+``DbValidationError`` findings — one per cube / field / join that
+broke. Use it as a CI gate before promoting a catalogue change.
+This package is driver-agnostic: the ``connection`` argument is any
+DB-API 2.0 connection. Wire your own ``psycopg.connect`` /
+``clickhouse_connect.dbapi.connect`` / ``duckdb.connect`` and pass it in.
+"""
+from __future__ import annotations
+from semql_validate_db._validate import (
+    DbValidationCode,
+    DbValidationError,
+    validate_against_db,
+)
+# Public API of the package; every name is re-exported from the private
+# ``semql_validate_db._validate`` module imported above.
+__all__ = ["DbValidationCode", "DbValidationError", "validate_against_db"]

semql_validate_db/_validate.py ADDED Viewed

@@ -0,0 +1,315 @@
+"""Catalogue ↔ database drift checker.
+The compiler is intentionally pure (no I/O). This package picks up the
+class of bugs the compiler can't see: an upstream column rename, a
+dropped table, a join predicate that suddenly compares incompatible
+types. Suitable as a pre-deploy CI gate.
+Strategy: for each cube, run cheap ``LIMIT 0`` probes against the
+target database — one for the table itself, one per measure /
+dimension / time-dimension SQL fragment, one for ``base_predicate`` —
+then one per join predicate. Any SQL error gets translated into a
+``DbValidationError`` naming the cube and the field that broke. Findings
+are collected rather than short-circuiting so a single run surfaces the
+full drift picture; the one exception is a cube whose table probe
+fails, whose remaining probes are skipped.
+Transport: accepts any DB-API 2.0 connection. Callers bring their
+own driver (psycopg, clickhouse-connect, duckdb) so this package
+stays driver-agnostic.
+"""
+from __future__ import annotations
+import contextlib
+import re
+from dataclasses import dataclass
+from typing import Literal, Protocol
+from semql.catalog import Catalog
+from semql.introspect import iter_cubes, iter_fields, iter_joins
+from semql.model import Cube, Dimension, Join, Measure, TimeDimension
+# Closed set of finding codes carried in ``DbValidationError.code``.
+DbValidationCode = Literal[
+    # The cube's ``SELECT * ... LIMIT 0`` table probe failed.
+    "missing_table",
+    # A measure / dimension / time-dimension SQL fragment probe failed.
+    "missing_column",
+    # The cube's ``base_predicate`` failed as a WHERE clause.
+    "base_predicate_invalid",
+    # A join's ``on`` predicate failed in a LEFT JOIN probe.
+    "join_predicate_invalid",
+    # Static check (no query): a ``required_filters`` entry names no dimension.
+    "required_filter_dimension_missing",
+]
+@dataclass(frozen=True)
+class DbValidationError:
+    """One drift finding.
+    Despite the name this is a plain frozen dataclass, not an
+    ``Exception`` subclass: findings are returned in a list, not raised.
+    ``cube`` always names the cube the probe ran against. ``field`` is
+    the measure / dimension / join target the probe was checking (or
+    the offending ``required_filters`` entry), or ``None`` for
+    cube-level findings (``missing_table``,
+    ``base_predicate_invalid``). ``message`` is the human-readable
+    summary built by this package. ``detail`` carries the database's
+    own error message so the caller can route the message at the user
+    without re-parsing; it stays ``None`` for findings that ran no
+    query (``required_filter_dimension_missing``).
+    """
+    # Which kind of drift was detected.
+    code: DbValidationCode
+    # Name of the cube the finding belongs to (the source cube for joins).
+    cube: str
+    # Field / join target / filter name, or None for cube-level findings.
+    field: str | None
+    # Summary text composed by this package.
+    message: str
+    # Stringified driver exception, when a probe query produced one.
+    detail: str | None = None
+class _Cursor(Protocol):
+    """Structural type for the part of a DB-API 2.0 cursor the probes use.
+    Only ``execute`` (with the SQL passed positionally) and ``close``
+    are required; results are never fetched.
+    """
+    # DB-API 2.0 cursors return ``self``-like objects from execute, but the
+    # exact shape varies by driver. We only care about side-effects + close,
+    # so the return type is intentionally untyped via ``object``.
+    def execute(self, sql: str, /) -> object: ...
+    def close(self) -> None: ...
+class _Connection(Protocol):
+    """Structural type for a connection: anything exposing a ``cursor()`` factory.
+    Being a ``Protocol``, it is satisfied by any driver object with a
+    matching method; no inheritance is needed.
+    """
+    def cursor(self) -> _Cursor: ...
+_PLACEHOLDER_RE = re.compile(r"\{([a-z_][a-z0-9_]*)\}")
+def _resolve_placeholders(sql: str, lookup: dict[str, str]) -> str:
+    """Substitute ``{key}`` placeholders the same way the compiler does.
+    Mirrors ``semql.compile._resolve_sql``'s behaviour: ``{alias}`` and
+    ``{cube_name}`` resolve to the alias; ``{ctx_key}`` resolves from
+    the caller-supplied ``context``. Unknown placeholders are left
+    in place — the resulting query will fail at the database and that
+    failure is the signal we want.
+    """
+    def _repl(m: re.Match[str]) -> str:
+        key = m.group(1)
+        return lookup.get(key, m.group(0))
+    return _PLACEHOLDER_RE.sub(_repl, sql)
+def _probe(
+    connection: _Connection,
+    sql: str,
+) -> tuple[bool, str | None]:
+    """Run a SQL probe under the connection's cursor.
+    Returns ``(True, None)`` on success and ``(False, error_message)``
+    on any failure. ``LIMIT 0`` is the caller's responsibility — this
+    function just runs whatever it's handed and reports the outcome.
+    """
+    cursor = connection.cursor()
+    try:
+        cursor.execute(sql)
+    except Exception as exc:  # noqa: BLE001 — any DB-API error counts as drift
+        return False, str(exc)
+    finally:
+        # Driver cleanup quirks vary; we don't want a flaky close() to mask
+        # a real drift finding so swallow whatever the driver throws here.
+        with contextlib.suppress(Exception):
+            cursor.close()
+    return True, None
+def _cube_lookup(cube: Cube, context: dict[str, str]) -> dict[str, str]:
+    """The substitution table the cube's SQL fragments resolve against."""
+    return {
+        cube.alias: cube.alias,
+        cube.name: cube.alias,
+        **context,
+    }
+def _validate_required_filters(cube: Cube) -> list[DbValidationError]:
+    """Static check: every ``required_filters`` entry must name a real
+    dimension on the cube.
+    This is technically catalogue-internal (no DB query needed), but
+    pre-deploy is the right surface for it — surfacing here means a CI
+    gate catches the typo even when the static-validation pass missed."""
+    dim_names = {d.name for d in cube.dimensions}
+    out: list[DbValidationError] = []
+    for req in cube.required_filters:
+        if req not in dim_names:
+            out.append(
+                DbValidationError(
+                    code="required_filter_dimension_missing",
+                    cube=cube.name,
+                    field=req,
+                    message=(
+                        f"Cube {cube.name!r} declares required_filters=[{req!r}] "
+                        f"but has no dimension by that name. Known: {sorted(dim_names)}."
+                    ),
+                )
+            )
+    return out
+def _validate_cube(
+    cube: Cube,
+    connection: _Connection,
+    context: dict[str, str],
+) -> list[DbValidationError]:
+    """Probe a single cube. Order: table → fragments → base_predicate.
+    Stops on ``missing_table`` (every subsequent probe would just
+    repeat the same error). All other findings accumulate so a single
+    run reports the whole picture for the cube."""
+    errors: list[DbValidationError] = []
+    lookup = _cube_lookup(cube, context)
+    table = _resolve_placeholders(cube.table, lookup)
+    alias = cube.alias
+    ok, detail = _probe(connection, f"SELECT * FROM {table} AS {alias} LIMIT 0")
+    if not ok:
+        errors.append(
+            DbValidationError(
+                code="missing_table",
+                cube=cube.name,
+                field=None,
+                message=(
+                    f"Cube {cube.name!r}: table {table!r} did not respond to a "
+                    "trivial SELECT — likely missing, renamed, or "
+                    "inaccessible to the connection's role."
+                ),
+                detail=detail,
+            )
+        )
+        return errors
+    # Walk every addressable field on the cube and probe its SQL
+    # fragment. ``iter_fields`` yields measures, dimensions, time
+    # dimensions, segments — we skip segments here (their fragments
+    # are predicates, not projections, and probe via base_predicate's
+    # path if needed). Ratio measures and ``count(*)`` are also
+    # skipped: ratio has no fragment of its own, ``count(*)`` is
+    # covered by the table probe.
+    for field in iter_fields(cube):
+        if isinstance(field, Measure):
+            if field.agg == "ratio":
+                continue
+            if field.agg == "count" and field.sql.strip() == "*":
+                continue
+            kind = "measure"
+        elif isinstance(field, TimeDimension):
+            kind = "time_dimension"
+        elif isinstance(field, Dimension):
+            kind = "dimension"
+        else:
+            # Segment — predicate, not projection; covered by
+            # base_predicate semantics. No fragment-as-SELECT probe.
+            continue
+        fragment = _resolve_placeholders(field.sql, lookup)
+        ok, detail = _probe(connection, f"SELECT {fragment} FROM {table} AS {alias} LIMIT 0")
+        if not ok:
+            errors.append(
+                DbValidationError(
+                    code="missing_column",
+                    cube=cube.name,
+                    field=field.name,
+                    message=(
+                        f"Cube {cube.name!r}, {kind} {field.name!r}: SQL "
+                        f"fragment {field.sql!r} did not execute against the table."
+                    ),
+                    detail=detail,
+                )
+            )
+    if cube.base_predicate:
+        predicate = _resolve_placeholders(cube.base_predicate, lookup)
+        ok, detail = _probe(
+            connection,
+            f"SELECT 1 FROM {table} AS {alias} WHERE {predicate} LIMIT 0",
+        )
+        if not ok:
+            errors.append(
+                DbValidationError(
+                    code="base_predicate_invalid",
+                    cube=cube.name,
+                    field=None,
+                    message=(
+                        f"Cube {cube.name!r}: base_predicate {cube.base_predicate!r} "
+                        "did not execute against the table."
+                    ),
+                    detail=detail,
+                )
+            )
+    return errors
+def _validate_join(
+    source: Cube,
+    join: Join,
+    target: Cube,
+    connection: _Connection,
+    context: dict[str, str],
+) -> list[DbValidationError]:
+    """Probe ``source LEFT JOIN target ON <join.on> LIMIT 0``.
+    A successful run means the predicate parses against the actual
+    column types on both sides. Type-mismatch and unknown-column errors
+    surface here under ``join_predicate_invalid``."""
+    src_lookup = _cube_lookup(source, context)
+    tgt_lookup = _cube_lookup(target, context)
+    src_table = _resolve_placeholders(source.table, src_lookup)
+    tgt_table = _resolve_placeholders(target.table, tgt_lookup)
+    on_clause = _resolve_placeholders(join.on, {**src_lookup, **tgt_lookup})
+    sql = (
+        f"SELECT 1 FROM {src_table} AS {source.alias} "
+        f"LEFT JOIN {tgt_table} AS {target.alias} ON {on_clause} LIMIT 0"
+    )
+    ok, detail = _probe(connection, sql)
+    if ok:
+        return []
+    return [
+        DbValidationError(
+            code="join_predicate_invalid",
+            cube=source.name,
+            field=join.to,
+            message=(
+                f"Cube {source.name!r} → {join.to!r}: join predicate "
+                f"{join.on!r} did not execute against the joined tables."
+            ),
+            detail=detail,
+        )
+    ]
+def validate_against_db(
+    catalog: Catalog,
+    *,
+    connection: _Connection,
+    context: dict[str, str] | None = None,
+) -> list[DbValidationError]:
+    """Validate every cube and join in ``catalog`` against a live DB.
+    Returns the full list of findings (empty on success). Each finding
+    names the cube / field that drifted and carries the database's
+    own error message in ``detail`` for routing into a CI log.
+    ``context`` substitutes ``{key}`` placeholders inside catalog SQL
+    (e.g. ``{"schema": "analytics"}`` for cubes whose ``table`` is
+    ``"{schema}.orders"``).
+    META reflection cubes are skipped — they don't live in the
+    physical database.
+    """
+    ctx = context or {}
+    errors: list[DbValidationError] = []
+    # ``iter_cubes`` skips META reflection cubes by default — they
+    # live in-memory and aren't real database tables.
+    for cube in iter_cubes(catalog):
+        errors.extend(_validate_required_filters(cube))
+        errors.extend(_validate_cube(cube, connection, ctx))
+    for source, join, target in iter_joins(catalog):
+        errors.extend(_validate_join(source, join, target, connection, ctx))
+    return errors
+# Names exported by ``from semql_validate_db._validate import *``; the
+# underscore-prefixed helpers and protocols above stay private.
+__all__ = [
+    "DbValidationCode",
+    "DbValidationError",
+    "validate_against_db",
+]

semql_validate_db/py.typed ADDED Viewed

File without changes

semql_validate_db-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,122 @@
+Metadata-Version: 2.4
+Name: semql-validate-db
+Version: 0.1.0
+Summary: Validate a semql Catalog against a live database — catches missing tables, dropped columns, and broken join predicates before a deploy.
+Author: Nikhil Pallamreddy
+Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
+License-Expression: BSD-3-Clause
+License-File: LICENSE
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Software Development :: Quality Assurance
+Classifier: Typing :: Typed
+Requires-Dist: semql>=0.1.0,<0.2
+Requires-Python: >=3.12
+Project-URL: Homepage, https://github.com/npalladium/semql
+Project-URL: Repository, https://github.com/npalladium/semql
+Project-URL: Issues, https://github.com/npalladium/semql/issues
+Description-Content-Type: text/markdown
+# semql-validate-db
+Pre-deploy drift checker for [`semql`](../semql) catalogues. Runs cheap
+probe queries against a live database and surfaces the class of bugs
+the compiler can't see — missing tables, dropped columns, broken join
+predicates, base-predicate drift.
+`semql` is intentionally pure (PHILOSOPHY: "the compiler has no I/O").
+That keeps the compiler simple, but it also means a catalog can pass
+every compile-time check and still blow up at query time because
+upstream renamed a column. `semql-validate-db` is the out-of-band
+gate that catches it.
+## Install
+```sh
+pip install semql-validate-db
+```
+The package is driver-agnostic. Bring your own DB-API 2.0 connection:
+```sh
+pip install psycopg              # Postgres
+pip install clickhouse-connect   # ClickHouse
+pip install duckdb               # DuckDB
+```
+## Quick start
+```python
+import duckdb
+from semql import Backend, Catalog, Cube, Dimension, Measure, TimeDimension
+from semql_validate_db import validate_against_db
+orders = Cube(
+    name="orders",
+    backend=Backend.DUCKDB,
+    table="orders",
+    alias="o",
+    measures=[Measure(name="revenue", sql="{o}.amount", agg="sum")],
+    dimensions=[Dimension(name="region", sql="{o}.region", type="string")],
+    time_dimensions=[TimeDimension(name="created_at", sql="{o}.created_at")],
+)
+catalog = Catalog([orders])
+conn = duckdb.connect(":memory:")
+conn.execute(
+    "CREATE TABLE orders (amount DOUBLE, region TEXT, created_at TIMESTAMP)"
+)
+errors = validate_against_db(catalog, connection=conn)
+for e in errors:
+    print(f"{e.code}: {e.cube}.{e.field or ''} — {e.message}")
+```
+A clean run returns an empty list. Drift (a missing column, a renamed
+table) yields one `DbValidationError` per finding so a single run
+gives the full picture instead of bailing on the first failure.
+## What it catches
+- `missing_table` — `cube.table` doesn't exist or the connection's
+  role can't see it.
+- `missing_column` — a measure / dimension / time-dimension SQL
+  fragment references a column that no longer exists.
+- `base_predicate_invalid` — `cube.base_predicate` doesn't execute.
+- `join_predicate_invalid` — a `Join.on` predicate references columns
+  that aren't there, or compares incompatible types.
+- `required_filter_dimension_missing` — static catalog check; the
+  named `required_filters` entry has no matching `Dimension`.
+## What it doesn't catch
+- Semantic drift (a column exists but means something different now).
+  Schema is necessary, not sufficient.
+- Cross-table referential integrity. The probes are `LIMIT 0`; they
+  parse, they don't sample.
+- Backend-specific feature drift (a function got dialect-renamed).
+  Use the compiler's snapshot tests for that.
+## Why `LIMIT 0`?
+Every probe runs `SELECT … LIMIT 0`. The query planner type-checks
+identifiers and predicates but does no row work, so the cost is
+microseconds per probe — fine for a per-cube fan-out in CI. The
+trade-off is that purely runtime drift (e.g. an `enum` value that
+got dropped from a check constraint) won't surface here.
+## CLI
+The package is library-first and ships no CLI of its own. Call
+`validate_against_db` from your deploy script, where the
+connection / DSN / role are already known.
+## Status
+Phase A: probe-by-fragment shape. Drift findings are accurate;
+performance is "fine for CI, not for runtime gates."

semql_validate_db-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+semql_validate_db/__init__.py,sha256=LN-tMWlMBusTROLnJUpn63ky3CAOcTyi7Nc2QFAE7Fc,733
+semql_validate_db/_validate.py,sha256=C4Hq5JjqH5y0ED1rpOvn_8fCyjbxbg8T2Mc7_6Yt9e8,11102
+semql_validate_db/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+semql_validate_db-0.1.0.dist-info/licenses/LICENSE,sha256=AdcAzanKVr3cVSrhBpG6gytjG0Ss1SBTQDAavLe0CRc,1505
+semql_validate_db-0.1.0.dist-info/WHEEL,sha256=wXwAVsgVaOZ_pwDFqQm5Rd6PID-Fc74nkLc8X8gHiDo,81
+semql_validate_db-0.1.0.dist-info/METADATA,sha256=dbLNm8BrmM056AStkUjxYWqwdf0ukiBbEEtbKaTBO08,4439
+semql_validate_db-0.1.0.dist-info/RECORD,,

semql_validate_db-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: uv 0.11.19
+Root-Is-Purelib: true
+Tag: py3-none-any

semql_validate_db-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2026, Nikhil Pallamreddy
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.