PyPI - semql-introspect - Versions diffs - 0.2.1__tar.gz - Mend

semql-introspect 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

semql_introspect-0.2.1/LICENSE +28 -0
semql_introspect-0.2.1/PKG-INFO +65 -0
semql_introspect-0.2.1/README.md +40 -0
semql_introspect-0.2.1/pyproject.toml +41 -0
semql_introspect-0.2.1/src/semql_introspect/__init__.py +105 -0
semql_introspect-0.2.1/src/semql_introspect/__main__.py +121 -0
semql_introspect-0.2.1/src/semql_introspect/_emit.py +148 -0
semql_introspect-0.2.1/src/semql_introspect/_heuristics.py +217 -0
semql_introspect-0.2.1/src/semql_introspect/_introspect.py +200 -0
semql_introspect-0.2.1/src/semql_introspect/_probe.py +207 -0
semql_introspect-0.2.1/src/semql_introspect/py.typed +0 -0

semql_introspect-0.2.1/LICENSE ADDED Viewed

@@ -0,0 +1,28 @@
+BSD 3-Clause License
+Copyright (c) 2026, Nikhil Pallamreddy
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

semql_introspect-0.2.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,65 @@
+Metadata-Version: 2.4
+Name: semql-introspect
+Version: 0.2.1
+Summary: Bootstrap a semql Catalog from a live database — emits Python cube stubs from Information Schema with heuristic measure/dimension inference.
+Author: Nikhil Pallamreddy
+Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
+License-Expression: BSD-3-Clause
+License-File: LICENSE
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Software Development :: Code Generators
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Typing :: Typed
+Requires-Dist: semql>=0.2.1,<0.3
+Requires-Python: >=3.12
+Project-URL: Homepage, https://github.com/npalladium/semql
+Project-URL: Repository, https://github.com/npalladium/semql
+Project-URL: Issues, https://github.com/npalladium/semql/issues
+Description-Content-Type: text/markdown
+# semql-introspect
+Bootstrap a [semql](../semql) `Catalog` from a live database.
+Reads Information Schema, emits Python `Cube` stubs with heuristic
+measure / dimension / time-dimension inference and foreign-key derived
+joins. Designed for greenfield adoption — a team with 200 tables can
+generate the mechanical 80% of a catalog in seconds, then hand-edit
+the heuristic guesses.
+## Usage
+```python
+import duckdb
+from semql.model import Backend
+from semql_introspect import introspect_to_python
+con = duckdb.connect("warehouse.db")
+print(introspect_to_python(con, backend=Backend.DUCKDB, schema="main"))
+```
+Or via CLI:
+```sh
+semql-introspect --backend duckdb --schema main --conn "warehouse.db"
+```
+## Heuristics
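+- Numeric columns whose name contains one of the `_`-separated tokens
+  `amount` / `price` / `revenue` / `cost` / `total` / `value` / `qty` /
+  `quantity` / `spend` / `fee` / `balance` (most also matched in plural
+  form) → `Measure(agg="sum")`.
+- Remaining columns ending in `_id` that are neither the primary key nor a
+  foreign key → `Measure(agg="count_distinct")` (the table's cardinality is
+  usually interesting).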
+- `date` / `timestamp` columns → `TimeDimension`.
+- Foreign keys → `Join(relationship="many_to_one")` plus the foreign-side
+  `Dimension(foreign_key=...)`.
+- Everything else → `Dimension` typed by the column's SQL type.
+Heuristic guesses get a `# TODO: review` comment so the diff makes the
+inference choices reviewable.

semql_introspect-0.2.1/README.md ADDED Viewed

@@ -0,0 +1,40 @@
+# semql-introspect
+Bootstrap a [semql](../semql) `Catalog` from a live database.
+Reads Information Schema, emits Python `Cube` stubs with heuristic
+measure / dimension / time-dimension inference and foreign-key derived
+joins. Designed for greenfield adoption — a team with 200 tables can
+generate the mechanical 80% of a catalog in seconds, then hand-edit
+the heuristic guesses.
+## Usage
+```python
+import duckdb
+from semql.model import Backend
+from semql_introspect import introspect_to_python
+con = duckdb.connect("warehouse.db")
+print(introspect_to_python(con, backend=Backend.DUCKDB, schema="main"))
+```
+Or via CLI:
+```sh
+semql-introspect --backend duckdb --schema main --conn "warehouse.db"
+```
+## Heuristics
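+- Numeric columns whose name contains one of the `_`-separated tokens
+  `amount` / `price` / `revenue` / `cost` / `total` / `value` / `qty` /
+  `quantity` / `spend` / `fee` / `balance` (most also matched in plural
+  form) → `Measure(agg="sum")`.
+- Remaining columns ending in `_id` that are neither the primary key nor a
+  foreign key → `Measure(agg="count_distinct")` (the table's cardinality is
+  usually interesting).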
+- `date` / `timestamp` columns → `TimeDimension`.
+- Foreign keys → `Join(relationship="many_to_one")` plus the foreign-side
+  `Dimension(foreign_key=...)`.
+- Everything else → `Dimension` typed by the column's SQL type.
+Heuristic guesses get a `# TODO: review` comment so the diff makes the
+inference choices reviewable.

semql_introspect-0.2.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,41 @@
+[project]
+name = "semql-introspect"
+version = "0.2.1"
+description = "Bootstrap a semql Catalog from a live database — emits Python cube stubs from Information Schema with heuristic measure/dimension inference."
+readme = "README.md"
+license = "BSD-3-Clause"
+license-files = ["LICENSE"]
+authors = [
+    { name = "Nikhil Pallamreddy", email = "nikhil.pallamreddy+git@gmail.com" }
+]
+requires-python = ">=3.12"
+dependencies = [
+    "semql>=0.2.1,<0.3",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Database",
+    "Topic :: Software Development :: Code Generators",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Typing :: Typed",
+]
+[project.scripts]
+semql-introspect = "semql_introspect.__main__:main"
+[project.urls]
+Homepage = "https://github.com/npalladium/semql"
+Repository = "https://github.com/npalladium/semql"
+Issues = "https://github.com/npalladium/semql/issues"
+[build-system]
+requires = ["uv_build>=0.11.19,<0.12.0"]
+build-backend = "uv_build"
+[tool.uv.sources]
+semql = { workspace = true, editable = true }

semql_introspect-0.2.1/src/semql_introspect/__init__.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""Bootstrap a semql Catalog from a live database.
+Top-level entrypoints — ``introspect_catalog`` returns
+``list[Cube]`` for programmatic callers, ``introspect_to_python``
+returns an importable Python source string for review-then-commit
+workflows. Both layer over :class:`semql_introspect.InformationSchemaProbe`
+or a caller-supplied :class:`SchemaProbe` for non-ANSI dialects.
+"""
+from __future__ import annotations
+from typing import Any
+from semql.model import Backend, Cube
+from semql_introspect._emit import emit_python
+from semql_introspect._introspect import (
+    HeuristicAnnotation,
+    IntrospectionResult,
+    introspect,
+)
+from semql_introspect._probe import (
+    ColumnInfo,
+    ForeignKeyInfo,
+    InformationSchemaProbe,
+    SchemaProbe,
+    TableInfo,
+)
+def introspect_catalog(
+    connection: Any,  # noqa: ANN401 — any DB-API 2.0 conn
+    *,
+    backend: Backend,
+    schema: str,
+    include_tables: list[str] | None = None,
+    exclude_tables: list[str] | None = None,
+) -> list[Cube]:
+    """Convenience wrapper that builds the ANSI probe + runs ``introspect``.
+    Returns the cube list only; for the parallel heuristic annotations
+    call :func:`introspect_to_result` instead. Pass a custom
+    :class:`SchemaProbe` plus :func:`introspect` directly for non-ANSI
+    dialects (ClickHouse, BigQuery)."""
+    return introspect_to_result(
+        connection,
+        backend=backend,
+        schema=schema,
+        include_tables=include_tables,
+        exclude_tables=exclude_tables,
+    ).cubes
+def introspect_to_result(
+    connection: Any,  # noqa: ANN401
+    *,
+    backend: Backend,
+    schema: str,
+    include_tables: list[str] | None = None,
+    exclude_tables: list[str] | None = None,
+) -> IntrospectionResult:
+    """Full result envelope (cubes + heuristic annotations)."""
+    probe = InformationSchemaProbe(
+        connection,
+        schema=schema,
+        include_tables=include_tables,
+        exclude_tables=exclude_tables,
+    )
+    return introspect(probe, backend=backend)
+def introspect_to_python(
+    connection: Any,  # noqa: ANN401
+    *,
+    backend: Backend,
+    schema: str,
+    include_tables: list[str] | None = None,
+    exclude_tables: list[str] | None = None,
+    header: str | None = None,
+) -> str:
+    """Return a self-contained Python module string with the inferred cubes."""
+    result = introspect_to_result(
+        connection,
+        backend=backend,
+        schema=schema,
+        include_tables=include_tables,
+        exclude_tables=exclude_tables,
+    )
+    return emit_python(result, header=header)
+# Public API of the package: the probe data types, the result types, and the
+# introspection / emission entrypoints imported or defined above.
+__all__ = [
+    "ColumnInfo",
+    "ForeignKeyInfo",
+    "HeuristicAnnotation",
+    "InformationSchemaProbe",
+    "IntrospectionResult",
+    "SchemaProbe",
+    "TableInfo",
+    "emit_python",
+    "introspect",
+    "introspect_catalog",
+    "introspect_to_python",
+    "introspect_to_result",
+]

semql_introspect-0.2.1/src/semql_introspect/__main__.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""CLI entrypoint — ``python -m semql_introspect`` / ``semql-introspect``.
+Connects to a database via the matching DB-API driver, runs the
+introspector, and writes the emitted Python to stdout. Driver imports
+happen lazily so a user introspecting DuckDB doesn't need ``psycopg``
+installed.
+"""
+from __future__ import annotations
+import argparse
+import sys
+from typing import Any, cast
+from semql.model import Backend
+from semql_introspect import introspect_to_python
+# Lookup from each ``Backend`` member's ``.value`` to the member itself; its
+# keys are the accepted ``--backend`` choices.
+_BACKENDS_BY_NAME = {b.value: b for b in Backend}
+def _connect(backend_name: str, conn_string: str) -> Any:  # noqa: ANN401
+    """Lazy-import a DB-API driver and open a connection.
+    Drivers are imported inline so the CLI launches even when only
+    one backend's driver is installed. Connection-string syntax is
+    driver-native (DSNs for psycopg, file paths for DuckDB)."""
+    if backend_name == "postgres":
+        # psycopg lacks py.typed; cast the module so attribute access yields Any.
+        import psycopg  # type: ignore[import-not-found]
+        return cast(Any, psycopg).connect(conn_string)
+    if backend_name == "duckdb":
+        import duckdb
+        return cast(Any, duckdb).connect(conn_string)
+    if backend_name == "snowflake":
+        # snowflake-connector-python lacks py.typed.
+        from snowflake import connector as sf_connector  # type: ignore[import-not-found]
+        # Snowflake takes kwargs; expect a key=value;... string.
+        kwargs: dict[str, str] = {}
+        for pair in conn_string.split(";"):
+            if not pair:
+                continue
+            k, _, v = pair.partition("=")
+            kwargs[k.strip()] = v.strip()
+        return cast(Any, sf_connector).connect(**kwargs)
+    raise SystemExit(
+        f"semql-introspect: no driver wired up for backend {backend_name!r}. "
+        "Supported via the CLI: postgres, duckdb, snowflake. For other "
+        "backends, call ``introspect_to_python`` from Python with a "
+        "connection you opened yourself."
+    )
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(
+        prog="semql-introspect",
+        description=(
+            "Generate a semql cube catalog from a live database. "
+            "Reads Information Schema, applies heuristic measure / "
+            "dimension inference, and emits Python."
+        ),
+    )
+    parser.add_argument(
+        "--backend",
+        required=True,
+        choices=sorted(_BACKENDS_BY_NAME),
+        help="semql Backend tag stamped onto every emitted cube.",
+    )
+    parser.add_argument(
+        "--schema",
+        required=True,
+        help="information_schema table_schema to scan.",
+    )
+    parser.add_argument(
+        "--conn",
+        required=True,
+        help=(
+            "Driver-native connection string (DSN for psycopg, file "
+            "path for DuckDB, key=value;... for Snowflake)."
+        ),
+    )
+    parser.add_argument(
+        "--include",
+        action="append",
+        default=None,
+        metavar="TABLE",
+        help="Only introspect these tables (repeat for multiple).",
+    )
+    parser.add_argument(
+        "--exclude",
+        action="append",
+        default=None,
+        metavar="TABLE",
+        help="Skip these tables (repeat for multiple).",
+    )
+    parser.add_argument(
+        "--header",
+        default=None,
+        help="Custom docstring for the emitted module (defaults to a TODO-review reminder).",
+    )
+    args = parser.parse_args(argv)
+    backend = _BACKENDS_BY_NAME[args.backend]
+    connection = _connect(args.backend, args.conn)
+    src = introspect_to_python(
+        connection,
+        backend=backend,
+        schema=args.schema,
+        include_tables=args.include,
+        exclude_tables=args.exclude,
+        header=args.header,
+    )
+    sys.stdout.write(src)
+    return 0
+# Script entry: propagate ``main()``'s return value as the process exit code.
+if __name__ == "__main__":
+    raise SystemExit(main())

semql_introspect-0.2.1/src/semql_introspect/_emit.py ADDED Viewed

@@ -0,0 +1,148 @@
+"""Render a :class:`IntrospectionResult` as Python source code.
+The output is a self-contained module that ``import``s from
+``semql.model`` and defines a ``CUBES: list[Cube]`` constant. The dev
+review loop is: run the introspector, open the file, search for
+``# TODO: review`` comments, edit by hand. Heuristic annotations
+get rendered inline so the dev sees *why* the tool picked each guess.
+"""
+from __future__ import annotations
+import textwrap
+from semql.model import Cube, Join
+from semql_introspect._introspect import IntrospectionResult
+def emit_python(result: IntrospectionResult, *, header: str | None = None) -> str:
+    """Render the introspection result as a single Python module.
+    ``header`` overrides the default docstring at the top — useful for
+    CLI invocations that want to record the connection string / schema
+    in the file so the diff carries provenance."""
+    annotations_by_field: dict[tuple[str, str], list[str]] = {}
+    for ann in result.annotations:
+        annotations_by_field.setdefault((ann.cube, ann.field), []).append(ann.reason)
+    lines: list[str] = []
+    lines.append(_header(header))
+    lines.append("from __future__ import annotations\n")
+    lines.append("")
+    lines.append("from semql.model import (")
+    lines.append("    Backend,")
+    lines.append("    Cube,")
+    lines.append("    Dimension,")
+    lines.append("    Join,")
+    lines.append("    Measure,")
+    lines.append("    TimeDimension,")
+    lines.append(")")
+    lines.append("")
+    lines.append("")
+    for cube in result.cubes:
+        lines.append(_render_cube(cube, annotations_by_field))
+        lines.append("")
+    lines.append("CUBES: list[Cube] = [")
+    for cube in result.cubes:
+        lines.append(f"    {_cube_var(cube.name)},")
+    lines.append("]")
+    return "\n".join(lines) + "\n"
+def _header(custom: str | None) -> str:
+    if custom is not None:
+        return f'"""{custom}"""\n\n'
+    return (
+        '"""Auto-generated by semql-introspect.\n\n'
+        "Review every ``# TODO: review`` comment before promoting this\n"
+        "file to a production catalog. The heuristic-inferred measures\n"
+        "and dimensions are educated guesses, not authoritative — the\n"
+        "introspector errs on the side of surfacing structure so a human\n"
+        "review pass can prune.\n"
+        '"""\n\n'
+    )
+def _cube_var(cube_name: str) -> str:
+    """Variable name for a cube in the emitted module."""
+    return f"{cube_name}_cube"
+def _render_cube(
+    cube: Cube,
+    annotations: dict[tuple[str, str], list[str]],
+) -> str:
+    var = _cube_var(cube.name)
+    lines: list[str] = []
+    lines.append(f"{var} = Cube(")
+    lines.append(f"    name={cube.name!r},")
+    lines.append(f"    backend=Backend.{cube.backend.name},")
+    lines.append(f"    table={cube.table!r},")
+    lines.append(f"    alias={cube.alias!r},")
+    if cube.primary_key is not None:
+        lines.append(f"    primary_key={cube.primary_key!r},")
+    if cube.measures:
+        lines.append("    measures=[")
+        for m in cube.measures:
+            todo = annotations.get((cube.name, m.name))
+            if todo:
+                lines.append(_indent_comment(todo, 8))
+            lines.append(f"        Measure(name={m.name!r}, sql={m.sql!r}, agg={m.agg!r}),")
+        lines.append("    ],")
+    if cube.dimensions:
+        lines.append("    dimensions=[")
+        for d in cube.dimensions:
+            todo = annotations.get((cube.name, d.name))
+            if todo:
+                lines.append(_indent_comment(todo, 8))
+            inner = f"name={d.name!r}, sql={d.sql!r}, type={d.type!r}"
+            if d.foreign_key is not None:
+                inner += f", foreign_key={d.foreign_key!r}"
+            lines.append(f"        Dimension({inner}),")
+        lines.append("    ],")
+    if cube.time_dimensions:
+        lines.append("    time_dimensions=[")
+        for td in cube.time_dimensions:
+            lines.append(f"        TimeDimension(name={td.name!r}, sql={td.sql!r}),")
+        lines.append("    ],")
+    if cube.joins:
+        lines.append("    joins=[")
+        for j in cube.joins:
+            lines.append(_render_join(j))
+        lines.append("    ],")
+    # S7 — emit placeholder grounding fields with TODO comments so the
+    # introspect → review flow is explicit. The user fills these in
+    # (or runs `semql suggest` to draft them via LLM).
+    lines.append("    # TODO: review — questions users might literally ask of this cube.")
+    lines.append("    questions=[],")
+    lines.append("    # TODO: review — keywords for lexical retrieval (acronyms preserved).")
+    lines.append("    keywords=[],")
+    lines.append(")")
+    return "\n".join(lines)
+def _render_join(j: Join) -> str:
+    return f"        Join(to={j.to!r}, relationship={j.relationship!r}, on={j.on!r}),"
+def _indent_comment(reasons: list[str], indent: int) -> str:
+    prefix = " " * indent + "# TODO: review — "
+    wrapper = textwrap.TextWrapper(
+        initial_indent=prefix,
+        subsequent_indent=" " * indent + "#   ",
+        width=88,
+    )
+    lines: list[str] = []
+    for reason in reasons:
+        lines.append(wrapper.fill(reason))
+    return "\n".join(lines)
+# Only the renderer entrypoint is public; the ``_``-prefixed helpers are internal.
+__all__ = ["emit_python"]

semql_introspect-0.2.1/src/semql_introspect/_heuristics.py ADDED Viewed

@@ -0,0 +1,217 @@
+"""Column → field-kind classification.
+Pure functions over :class:`ColumnInfo`. Heuristic choices live here so
+the orchestrator can stay glue and tests can exercise each rule in
+isolation. Every guess that isn't a hard rule emits a
+``heuristic_reason`` the emitter can surface as a ``# TODO: review``
+comment — the dev reviewing the diff sees *why* the tool picked
+``count_distinct`` over a plain dimension.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+from semql.model import DimTypeLiteral
+from semql_introspect._probe import ColumnInfo
+# Closed set of field kinds a column can be classified as.
+FieldKind = Literal[
+    "measure_sum",
+    "measure_count_distinct",
+    "time_dimension",
+    "dimension",
+]
+@dataclass(frozen=True)
+class Classification:
+    """How a single column should be modelled in the catalog.
+    ``dim_type`` is set only for ``kind == "dimension"`` — it's the
+    semql ``DimTypeLiteral`` the dimension should declare. Other
+    kinds don't carry a dim type.
+    ``heuristic_reason`` is non-empty when the classification came
+    from a guess rather than a hard rule; the emitter renders it as a
+    ``# TODO: review`` comment next to the field.
+    """
+    # Which catalog construct the column maps onto.
+    kind: FieldKind
+    # Dimension type; stays ``None`` for measure and time-dimension kinds.
+    dim_type: DimTypeLiteral | None = None
+    # Human-readable explanation of a heuristic guess; ``""`` for hard rules.
+    heuristic_reason: str = ""
+# Column-name tokens that mark a numeric column as an additive measure.
+# Plural / singular variants are both listed where they exist — the rule
+# fires when any ``_``-separated token of the lower-cased column name is in
+# this set, not on a full-string match.
+_MEASURE_NAME_TOKENS = frozenset(
+    {
+        "amount",
+        "amounts",
+        "price",
+        "prices",
+        "revenue",
+        "revenues",
+        "cost",
+        "costs",
+        "total",
+        "totals",
+        "value",
+        "values",
+        "qty",
+        "quantity",
+        "quantities",
+        "spend",
+        "fee",
+        "fees",
+        "balance",
+        "balances",
+    }
+)
+def _normalize_type(data_type: str) -> str:
+    """Strip qualifiers/parameters from a SQL type string.
+    ``"timestamp without time zone"`` → ``"timestamp"``;
+    ``"VARCHAR(255)"`` → ``"varchar"``;
+    ``"numeric(18,2)"`` → ``"numeric"``.
+    """
+    t = data_type.lower().strip()
+    if "(" in t:
+        t = t.split("(", 1)[0].strip()
+    if " " in t:
+        t = t.split(" ", 1)[0]
+    return t
+_NUMERIC_TYPES = frozenset(
+    {
+        "smallint",
+        "integer",
+        "int",
+        "int2",
+        "int4",
+        "int8",
+        "bigint",
+        "decimal",
+        "numeric",
+        "real",
+        "float",
+        "float4",
+        "float8",
+        "double",
+        "money",
+    }
+)
+_DATE_TYPES = frozenset(
+    {
+        "date",
+        "timestamp",
+        "timestamptz",
+        "datetime",
+        "time",
+        "timetz",
+    }
+)
+_BOOL_TYPES = frozenset({"boolean", "bool"})
+def _is_numeric(data_type: str) -> bool:
+    return _normalize_type(data_type) in _NUMERIC_TYPES
+def _is_date(data_type: str) -> bool:
+    return _normalize_type(data_type) in _DATE_TYPES
+def _is_bool(data_type: str) -> bool:
+    return _normalize_type(data_type) in _BOOL_TYPES
+def _dim_type_for(data_type: str) -> DimTypeLiteral:
+    """Map a SQL type string onto a semql ``DimTypeLiteral``.
+    Falls back to ``"string"`` for unknown types — the catalog author
+    can refine it post-emission. ``"string"`` is the safer default
+    than ``"number"`` because misclassifying a numeric ID as a number
+    invites accidental aggregation."""
+    if _is_numeric(data_type):
+        return "number"
+    if _is_date(data_type):
+        return "time"
+    if _is_bool(data_type):
+        return "bool"
+    return "string"
+def classify_column(col: ColumnInfo, *, is_fk: bool, is_pk: bool) -> Classification:
+    """Pick the field kind + (where applicable) dimension type for a column.
+    Rules, in priority order:
+    1. **Date / timestamp** → ``time_dimension``. Hard rule.
+    2. **Foreign-key columns** → ``dimension`` (with ``foreign_key=``
+       wired up by the orchestrator). Numeric-FK heuristic *does not*
+       apply — FKs are identifiers, not measurements.
+    3. **Primary-key columns** → ``dimension``. Hard rule; the cube's
+       ``primary_key`` field tracks identity separately.
+    4. **Numeric columns whose name matches a measure-name token**
+       (amount / price / revenue / ...) → ``measure_sum``.
+    5. **Columns ending in ``_id``** → ``measure_count_distinct``. The
+       table's distinct identifier count is a useful default measure
+       even when an ID column isn't a measure proper.
+    6. Otherwise → ``dimension`` typed by the column's SQL type.
+    """
+    name = col.name.lower()
+    if _is_date(col.data_type):
+        return Classification(kind="time_dimension")
+    if is_pk:
+        return Classification(
+            kind="dimension",
+            dim_type=_dim_type_for(col.data_type),
+        )
+    if is_fk:
+        return Classification(
+            kind="dimension",
+            dim_type=_dim_type_for(col.data_type),
+        )
+    if _is_numeric(col.data_type):
+        tokens = set(name.split("_"))
+        if tokens & _MEASURE_NAME_TOKENS:
+            return Classification(
+                kind="measure_sum",
+                heuristic_reason=(
+                    f"numeric column named {col.name!r} matched a measure-name "
+                    "token (amount/price/revenue/...) — confirm this should be a "
+                    "summable measure rather than a dimension."
+                ),
+            )
+    if name.endswith("_id"):
+        return Classification(
+            kind="measure_count_distinct",
+            heuristic_reason=(
+                f"column ends in ``_id`` ({col.name!r}); inferred a "
+                "``count_distinct`` measure. Drop the measure if this column "
+                "isn't useful as a count, or move it to a foreign-key "
+                "dimension if it should link to another cube."
+            ),
+        )
+    return Classification(
+        kind="dimension",
+        dim_type=_dim_type_for(col.data_type),
+    )
+# Public surface of the heuristics module; type sets and predicates stay private.
+__all__ = ["Classification", "FieldKind", "classify_column"]

semql_introspect-0.2.1/src/semql_introspect/_introspect.py ADDED Viewed

@@ -0,0 +1,200 @@
+"""Orchestrator — turn a :class:`SchemaProbe` into a list of ``Cube``s.
+Reads tables / FKs / PKs from the probe, classifies each column via
+:mod:`._heuristics`, and assembles ``Cube`` / ``Measure`` / ``Dimension``
+/ ``TimeDimension`` instances. The result is a pure
+:class:`semql.model.Cube` list — ready to wrap in a
+:class:`semql.Catalog` or to round-trip through the emitter.
+Per-column heuristic reasons (the ``# TODO: review`` hints) ride along
+on a parallel :class:`HeuristicAnnotation` list, indexed by
+``(cube_name, field_name)``. The emitter renders them as inline
+comments; programmatic callers can ignore them.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from semql.model import (
+    Backend,
+    Cube,
+    Dimension,
+    Join,
+    Measure,
+    TimeDimension,
+)
+from semql_introspect._heuristics import classify_column
+from semql_introspect._probe import SchemaProbe
+@dataclass(frozen=True)
+class HeuristicAnnotation:
+    """One ``# TODO: review`` hint surfaced by the introspector."""
+    # Name of the cube the hint belongs to.
+    cube: str
+    # Name of the measure within that cube the hint is attached to.
+    field: str
+    # Explanation of why the heuristic made this guess.
+    reason: str
+@dataclass(frozen=True)
+class IntrospectionResult:
+    """Cubes + the heuristic-guess annotations parallel to them.
+    The cubes alone can be plugged straight into a ``Catalog``; the
+    annotations are presentation metadata for emitters / reports."""
+    # One cube per introspected table.
+    cubes: list[Cube]
+    # Review hints, each addressed by cube name and field name.
+    annotations: list[HeuristicAnnotation]
+def _alias_for(table: str) -> str:
+    """Single-or-two-letter alias derived from the table name.
+    semql cube aliases must match ``[a-z_][a-z0-9_]*``. The introspect
+    convention: take the initial letters of ``_``-separated tokens
+    (``user_events`` → ``ue``); fall back to the first letter for
+    single-token names (``orders`` → ``o``). Aliases get
+    deduplicated at the catalog assembly step."""
+    tokens = [t for t in table.lower().split("_") if t]
+    if not tokens:
+        return "t"
+    if len(tokens) == 1:
+        return tokens[0][0]
+    return "".join(t[0] for t in tokens)
+def _dedupe_aliases(cubes: list[Cube]) -> list[Cube]:
+    """Ensure every cube has a unique alias.
+    Two ``user_events`` / ``user_eligibility`` tables both alias to
+    ``ue`` by default — append a suffix to all but the first
+    collider."""
+    seen: dict[str, int] = {}
+    out: list[Cube] = []
+    for cube in cubes:
+        alias = cube.alias
+        n = seen.get(alias, 0)
+        if n > 0:
+            new_alias = f"{alias}{n + 1}"
+            seen[alias] = n + 1
+            cube = cube.model_copy(update={"alias": new_alias})
+        else:
+            seen[alias] = 1
+        out.append(cube)
+    return out
+def introspect(probe: SchemaProbe, *, backend: Backend) -> IntrospectionResult:
+    """Build a catalog from a probe + a target backend.
+    ``backend`` is the semql ``Backend`` enum tag stamped onto every
+    emitted cube — the introspector doesn't try to detect the dialect
+    itself because a single probe shape often spans multiple backends
+    (ANSI ``information_schema`` works against PG, DuckDB, and
+    Snowflake)."""
+    tables = probe.list_tables()
+    fks = probe.list_foreign_keys()
+    pks = probe.list_primary_keys()
+    table_names = {t.name for t in tables}
+    # ``{from_table.from_column → to_table}`` so the per-column loop
+    # knows which dim should carry ``foreign_key=``.
+    fk_by_source: dict[tuple[str, str], str] = {
+        (fk.from_table, fk.from_column): fk.to_table for fk in fks if fk.to_table in table_names
+    }
+    cubes: list[Cube] = []
+    annotations: list[HeuristicAnnotation] = []
+    for tbl in tables:
+        alias = _alias_for(tbl.name)
+        pk_col = pks.get(tbl.name)
+        measures: list[Measure] = []
+        dimensions: list[Dimension] = []
+        time_dims: list[TimeDimension] = []
+        cube_pk: str | None = None
+        for col in tbl.columns:
+            is_fk = (tbl.name, col.name) in fk_by_source
+            is_pk = pk_col == col.name
+            cls = classify_column(col, is_fk=is_fk, is_pk=is_pk)
+            sql = f"{{{alias}}}.{col.name}"
+            if cls.kind == "time_dimension":
+                time_dims.append(TimeDimension(name=col.name, sql=sql))
+            elif cls.kind == "measure_sum":
+                measures.append(Measure(name=col.name, sql=sql, agg="sum"))
+                if cls.heuristic_reason:
+                    annotations.append(
+                        HeuristicAnnotation(
+                            cube=tbl.name, field=col.name, reason=cls.heuristic_reason
+                        )
+                    )
+            elif cls.kind == "measure_count_distinct":
+                # Distinct-count measure named ``distinct_<col>`` so the
+                # output catalog reads cleanly — ``orders.distinct_customer_id``.
+                m_name = f"distinct_{col.name}"
+                measures.append(Measure(name=m_name, sql=sql, agg="count_distinct"))
+                if cls.heuristic_reason:
+                    annotations.append(
+                        HeuristicAnnotation(
+                            cube=tbl.name, field=m_name, reason=cls.heuristic_reason
+                        )
+                    )
+            else:
+                # Plain dimension. FK targets get ``foreign_key=...``;
+                # everything else is just a typed dimension.
+                assert cls.dim_type is not None  # classifier invariant
+                fk_target = fk_by_source.get((tbl.name, col.name))
+                dimensions.append(
+                    Dimension(
+                        name=col.name,
+                        sql=sql,
+                        type=cls.dim_type,
+                        foreign_key=fk_target,
+                    )
+                )
+                if is_pk:
+                    cube_pk = col.name
+        # Auto-derived joins: for every FK source on this table whose
+        # target also got introspected, emit a ``many_to_one`` Join.
+        # The semql catalog will further auto-derive from
+        # ``Dimension.foreign_key`` at construction time, but spelling
+        # the joins out in the generated source makes them visible at
+        # review.
+        joins: list[Join] = []
+        for fk in fks:
+            if fk.from_table != tbl.name or fk.to_table not in table_names:
+                continue
+            target_alias = _alias_for(fk.to_table)
+            joins.append(
+                Join(
+                    to=fk.to_table,
+                    relationship="many_to_one",
+                    on=f"{{{alias}}}.{fk.from_column} = {{{target_alias}}}.{fk.to_column}",
+                )
+            )
+        cubes.append(
+            Cube(
+                name=tbl.name,
+                backend=backend,
+                table=tbl.name,
+                alias=alias,
+                primary_key=cube_pk,
+                measures=measures,
+                dimensions=dimensions,
+                time_dimensions=time_dims,
+                joins=joins,
+            )
+        )
+    cubes = _dedupe_aliases(cubes)
+    return IntrospectionResult(cubes=cubes, annotations=annotations)
+# Public surface of the orchestrator module; alias helpers stay private.
+__all__ = ["HeuristicAnnotation", "IntrospectionResult", "introspect"]

semql_introspect-0.2.1/src/semql_introspect/_probe.py ADDED Viewed

@@ -0,0 +1,207 @@
+"""Schema probes — read tables / columns / FKs from a live database.
+The probe abstracts the dialect-specific information_schema layout:
+PG / DuckDB / Snowflake mostly share the ANSI ``information_schema``
+shape; ClickHouse uses ``system.columns`` + ``system.tables``; BigQuery
+adds a ``project_id.dataset`` prefix. v1 ships
+:class:`InformationSchemaProbe` which covers the ANSI dialects;
+non-ANSI dialects can implement :class:`SchemaProbe` directly without
+touching the rest of the package.
+"""
+from __future__ import annotations
+from collections.abc import Iterable
+from dataclasses import dataclass
+from typing import Any, Protocol
+@dataclass(frozen=True)
+class ColumnInfo:
+    """One column on a table as returned by a probe.
+    Immutable value object (frozen dataclass). ``data_type`` is the raw
+    SQL type string the database returned (``"integer"``,
+    ``"timestamp without time zone"``, etc.). Heuristics walk this to
+    decide field kind / dimension type.
+    """
+    # Column name as reported by the probe.
+    name: str
+    # Raw SQL type string, not normalised by the probe.
+    data_type: str
+    # True when the database reports the column as nullable.
+    is_nullable: bool
+@dataclass(frozen=True)
+class ForeignKeyInfo:
+    """One FK edge — ``from_table.from_column → to_table.to_column``.
+    Immutable value object (frozen dataclass). A composite foreign key
+    is represented as one instance per column pair.
+    """
+    # Referencing (source) side of the edge.
+    from_table: str
+    from_column: str
+    # Referenced (target) side of the edge.
+    to_table: str
+    to_column: str
+@dataclass(frozen=True)
+class TableInfo:
+    """One table the catalog will materialise as a ``Cube``.
+    Immutable value object (frozen dataclass) pairing a table name with
+    the columns a probe found on it.
+    """
+    # Table name as reported by the probe.
+    name: str
+    # Columns of the table, in whatever order the probe returned them.
+    columns: tuple[ColumnInfo, ...]
+class SchemaProbe(Protocol):
+    """Read-only dialect adapter for introspection.
+    This is a structural interface (``typing.Protocol``): any object
+    providing the three methods below satisfies it, no inheritance
+    required.
+    Implementations should return tables / columns / foreign keys
+    scoped to whatever schema the caller asked about — the orchestrator
+    treats the result as authoritative and doesn't filter again."""
+    # Tables to introspect, each carrying its columns.
+    def list_tables(self) -> list[TableInfo]: ...
+    # Foreign-key edges, one ``ForeignKeyInfo`` per column pair.
+    def list_foreign_keys(self) -> list[ForeignKeyInfo]: ...
+    def list_primary_keys(self) -> dict[str, str]:
+        """``{table_name: primary_key_column}``.
+        Only one column per table fits in this mapping.
+        Probes that can't recover primary keys (some warehouses don't
+        ship constraints) should return ``{}`` — heuristics will fall
+        back to "first column named ``id``" detection."""
+        ...
+class InformationSchemaProbe:
+    """ANSI ``information_schema`` probe.
+    Works against any database that ships the standard
+    ``information_schema.columns`` / ``information_schema.table_constraints``
+    layout — Postgres, DuckDB, Snowflake (mostly), and SQL Server. The
+    schema argument scopes ``table_schema``; pass it explicitly so
+    catalog dumps don't accidentally span system schemas.
+    """
+    def __init__(
+        self,
+        connection: Any,  # noqa: ANN401 — any DB-API 2.0 conn
+        *,
+        schema: str,
+        include_tables: Iterable[str] | None = None,
+        exclude_tables: Iterable[str] | None = None,
+    ) -> None:
+        self._conn = connection
+        self._schema = schema
+        self._include = set(include_tables) if include_tables else None
+        self._exclude = set(exclude_tables or ())
+    def list_tables(self) -> list[TableInfo]:
+        cur = self._conn.cursor()
+        try:
+            cur.execute(
+                "SELECT table_name FROM information_schema.tables "
+                f"WHERE table_schema = '{self._schema}' "
+                "AND table_type = 'BASE TABLE' "
+                "ORDER BY table_name"
+            )
+            table_names = [row[0] for row in cur.fetchall()]
+        finally:
+            cur.close()
+        if self._include is not None:
+            table_names = [t for t in table_names if t in self._include]
+        table_names = [t for t in table_names if t not in self._exclude]
+        out: list[TableInfo] = []
+        for tbl in table_names:
+            out.append(TableInfo(name=tbl, columns=self._columns_for(tbl)))
+        return out
+    def _columns_for(self, table: str) -> tuple[ColumnInfo, ...]:
+        cur = self._conn.cursor()
+        try:
+            cur.execute(
+                "SELECT column_name, data_type, is_nullable "
+                "FROM information_schema.columns "
+                f"WHERE table_schema = '{self._schema}' "
+                f"AND table_name = '{table}' "
+                "ORDER BY ordinal_position"
+            )
+            rows = cur.fetchall()
+        finally:
+            cur.close()
+        return tuple(
+            ColumnInfo(
+                name=row[0],
+                data_type=str(row[1]),
+                is_nullable=(str(row[2]).upper() == "YES"),
+            )
+            for row in rows
+        )
+    def list_foreign_keys(self) -> list[ForeignKeyInfo]:
+        # ``information_schema.constraint_column_usage`` differs per
+        # backend: Postgres returns the referenced (target) table for
+        # FK constraints, DuckDB returns the source table. The more
+        # portable shape is ``referential_constraints`` → the PK
+        # constraint on the referenced side, joined via ``key_column_usage``
+        # on both ends with matching ordinal positions for composite
+        # keys. Works on PG / DuckDB / Snowflake unchanged.
+        cur = self._conn.cursor()
+        try:
+            cur.execute(
+                "SELECT kcu_from.table_name AS from_table, "
+                "       kcu_from.column_name AS from_column, "
+                "       kcu_to.table_name AS to_table, "
+                "       kcu_to.column_name AS to_column "
+                "FROM information_schema.referential_constraints rc "
+                "JOIN information_schema.key_column_usage kcu_from "
+                "  ON rc.constraint_name = kcu_from.constraint_name "
+                " AND rc.constraint_schema = kcu_from.constraint_schema "
+                "JOIN information_schema.key_column_usage kcu_to "
+                "  ON rc.unique_constraint_name = kcu_to.constraint_name "
+                " AND rc.unique_constraint_schema = kcu_to.constraint_schema "
+                " AND kcu_from.ordinal_position = kcu_to.ordinal_position "
+                f"WHERE rc.constraint_schema = '{self._schema}' "
+                "ORDER BY kcu_from.table_name, kcu_from.ordinal_position"
+            )
+            rows = cur.fetchall()
+        finally:
+            cur.close()
+        return [
+            ForeignKeyInfo(
+                from_table=row[0],
+                from_column=row[1],
+                to_table=row[2],
+                to_column=row[3],
+            )
+            for row in rows
+        ]
+    def list_primary_keys(self) -> dict[str, str]:
+        cur = self._conn.cursor()
+        try:
+            cur.execute(
+                "SELECT kcu.table_name, kcu.column_name "
+                "FROM information_schema.table_constraints tc "
+                "JOIN information_schema.key_column_usage kcu "
+                "  ON tc.constraint_name = kcu.constraint_name "
+                " AND tc.table_schema = kcu.table_schema "
+                "WHERE tc.constraint_type = 'PRIMARY KEY' "
+                f"  AND tc.table_schema = '{self._schema}' "
+                "ORDER BY kcu.table_name, kcu.ordinal_position"
+            )
+            rows = cur.fetchall()
+        finally:
+            cur.close()
+        # If a PK is composite, the first column wins — the catalog
+        # model only supports a single ``primary_key`` per cube. A
+        # ``# TODO: review`` will surface composite-key tables anyway.
+        out: dict[str, str] = {}
+        for row in rows:
+            out.setdefault(str(row[0]), str(row[1]))
+        return out
+# Public API of this module: the names exported by a star-import.
+__all__ = [
+    "ColumnInfo",
+    "ForeignKeyInfo",
+    "InformationSchemaProbe",
+    "SchemaProbe",
+    "TableInfo",
+]

semql_introspect-0.2.1/src/semql_introspect/py.typed ADDED Viewed

File without changes