semql-introspect 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2026, Nikhil Pallamreddy
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,65 @@
1
+ Metadata-Version: 2.4
2
+ Name: semql-introspect
3
+ Version: 0.2.1
4
+ Summary: Bootstrap a semql Catalog from a live database — emits Python cube stubs from Information Schema with heuristic measure/dimension inference.
5
+ Author: Nikhil Pallamreddy
6
+ Author-email: Nikhil Pallamreddy <nikhil.pallamreddy+git@gmail.com>
7
+ License-Expression: BSD-3-Clause
8
+ License-File: LICENSE
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Database
16
+ Classifier: Topic :: Software Development :: Code Generators
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Classifier: Typing :: Typed
19
+ Requires-Dist: semql>=0.2.1,<0.3
20
+ Requires-Python: >=3.12
21
+ Project-URL: Homepage, https://github.com/npalladium/semql
22
+ Project-URL: Repository, https://github.com/npalladium/semql
23
+ Project-URL: Issues, https://github.com/npalladium/semql/issues
24
+ Description-Content-Type: text/markdown
25
+
26
+ # semql-introspect
27
+
28
+ Bootstrap a [semql](https://github.com/npalladium/semql) `Catalog` from a live database.
29
+
30
+ Reads Information Schema, emits Python `Cube` stubs with heuristic
31
+ measure / dimension / time-dimension inference and foreign-key derived
32
+ joins. Designed for greenfield adoption — a team with 200 tables can
33
+ generate the mechanical 80% of a catalog in seconds, then hand-edit
34
+ the heuristic guesses.
35
+
36
+ ## Usage
37
+
38
+ ```python
39
+ import duckdb
40
+ from semql.model import Backend
41
+ from semql_introspect import introspect_to_python
42
+
43
+ con = duckdb.connect("warehouse.db")
44
+ print(introspect_to_python(con, backend=Backend.DUCKDB, schema="main"))
45
+ ```
46
+
47
+ Or via CLI:
48
+
49
+ ```sh
50
+ semql-introspect --backend duckdb --schema main --conn "warehouse.db"
51
+ ```
52
+
53
+ ## Heuristics
54
+
55
+ - Numeric columns whose name contains a token such as `amount` / `price` / `revenue` / `cost` / `total`
56
+ / `value` / `qty` / `quantity` / `spend` / `fee` / `balance` → `Measure(agg="sum")`.
57
+ - Columns ending in `_id` → `Measure(agg="count_distinct")` (the table's
58
+ cardinality is usually interesting).
59
+ - `date` / `timestamp` columns → `TimeDimension`.
60
+ - Foreign keys → `Join(relationship="many_to_one")` plus the foreign-side
61
+ `Dimension(foreign_key=...)`.
62
+ - Everything else → `Dimension` typed by the column's SQL type.
63
+
64
+ Heuristic guesses get a `# TODO: review` comment so the diff makes the
65
+ inference choices reviewable.
@@ -0,0 +1,40 @@
1
+ # semql-introspect
2
+
3
+ Bootstrap a [semql](https://github.com/npalladium/semql) `Catalog` from a live database.
4
+
5
+ Reads Information Schema, emits Python `Cube` stubs with heuristic
6
+ measure / dimension / time-dimension inference and foreign-key derived
7
+ joins. Designed for greenfield adoption — a team with 200 tables can
8
+ generate the mechanical 80% of a catalog in seconds, then hand-edit
9
+ the heuristic guesses.
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ import duckdb
15
+ from semql.model import Backend
16
+ from semql_introspect import introspect_to_python
17
+
18
+ con = duckdb.connect("warehouse.db")
19
+ print(introspect_to_python(con, backend=Backend.DUCKDB, schema="main"))
20
+ ```
21
+
22
+ Or via CLI:
23
+
24
+ ```sh
25
+ semql-introspect --backend duckdb --schema main --conn "warehouse.db"
26
+ ```
27
+
28
+ ## Heuristics
29
+
30
+ - Numeric columns whose name contains a token such as `amount` / `price` / `revenue` / `cost` / `total`
31
+ / `value` / `qty` / `quantity` / `spend` / `fee` / `balance` → `Measure(agg="sum")`.
32
+ - Columns ending in `_id` → `Measure(agg="count_distinct")` (the table's
33
+ cardinality is usually interesting).
34
+ - `date` / `timestamp` columns → `TimeDimension`.
35
+ - Foreign keys → `Join(relationship="many_to_one")` plus the foreign-side
36
+ `Dimension(foreign_key=...)`.
37
+ - Everything else → `Dimension` typed by the column's SQL type.
38
+
39
+ Heuristic guesses get a `# TODO: review` comment so the diff makes the
40
+ inference choices reviewable.
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "semql-introspect"
3
+ version = "0.2.1"
4
+ description = "Bootstrap a semql Catalog from a live database — emits Python cube stubs from Information Schema with heuristic measure/dimension inference."
5
+ readme = "README.md"
6
+ license = "BSD-3-Clause"
7
+ license-files = ["LICENSE"]
8
+ authors = [
9
+ { name = "Nikhil Pallamreddy", email = "nikhil.pallamreddy+git@gmail.com" }
10
+ ]
11
+ requires-python = ">=3.12"
12
+ dependencies = [
13
+ "semql>=0.2.1,<0.3",
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Database",
23
+ "Topic :: Software Development :: Code Generators",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ "Typing :: Typed",
26
+ ]
27
+
28
+ [project.scripts]
29
+ semql-introspect = "semql_introspect.__main__:main"
30
+
31
+ [project.urls]
32
+ Homepage = "https://github.com/npalladium/semql"
33
+ Repository = "https://github.com/npalladium/semql"
34
+ Issues = "https://github.com/npalladium/semql/issues"
35
+
36
+ [build-system]
37
+ requires = ["uv_build>=0.11.19,<0.12.0"]
38
+ build-backend = "uv_build"
39
+
40
+ [tool.uv.sources]
41
+ semql = { workspace = true, editable = true }
@@ -0,0 +1,105 @@
1
+ """Bootstrap a semql Catalog from a live database.
2
+
3
+ Top-level entrypoints — ``introspect_catalog`` returns
4
+ ``list[Cube]`` for programmatic callers, ``introspect_to_python``
5
+ returns an importable Python source string for review-then-commit
6
+ workflows. Both layer over :class:`semql_introspect.InformationSchemaProbe`
7
+ or a caller-supplied :class:`SchemaProbe` for non-ANSI dialects.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any
13
+
14
+ from semql.model import Backend, Cube
15
+
16
+ from semql_introspect._emit import emit_python
17
+ from semql_introspect._introspect import (
18
+ HeuristicAnnotation,
19
+ IntrospectionResult,
20
+ introspect,
21
+ )
22
+ from semql_introspect._probe import (
23
+ ColumnInfo,
24
+ ForeignKeyInfo,
25
+ InformationSchemaProbe,
26
+ SchemaProbe,
27
+ TableInfo,
28
+ )
29
+
30
+
31
def introspect_catalog(
    connection: Any,  # noqa: ANN401 — any DB-API 2.0 conn
    *,
    backend: Backend,
    schema: str,
    include_tables: list[str] | None = None,
    exclude_tables: list[str] | None = None,
) -> list[Cube]:
    """Introspect ``schema`` and return just the inferred cubes.

    Delegates to :func:`introspect_to_result` with the same arguments and
    drops the heuristic annotations from the envelope it returns. Call
    :func:`introspect_to_result` when the annotations are needed, or pair
    a custom :class:`SchemaProbe` with :func:`introspect` for non-ANSI
    dialects (ClickHouse, BigQuery)."""
    result = introspect_to_result(
        connection,
        backend=backend,
        schema=schema,
        include_tables=include_tables,
        exclude_tables=exclude_tables,
    )
    return result.cubes
52
+
53
+
54
def introspect_to_result(
    connection: Any,  # noqa: ANN401
    *,
    backend: Backend,
    schema: str,
    include_tables: list[str] | None = None,
    exclude_tables: list[str] | None = None,
) -> IntrospectionResult:
    """Run the ANSI probe and return cubes together with their annotations.

    Wraps ``connection`` in an :class:`InformationSchemaProbe` configured
    with the schema and table filters, then hands it to
    :func:`introspect` along with ``backend``."""
    return introspect(
        InformationSchemaProbe(
            connection,
            schema=schema,
            include_tables=include_tables,
            exclude_tables=exclude_tables,
        ),
        backend=backend,
    )
70
+
71
+
72
def introspect_to_python(
    connection: Any,  # noqa: ANN401
    *,
    backend: Backend,
    schema: str,
    include_tables: list[str] | None = None,
    exclude_tables: list[str] | None = None,
    header: str | None = None,
) -> str:
    """Introspect ``schema`` and render the outcome as Python source.

    Builds the result via :func:`introspect_to_result` and passes it,
    with the optional ``header``, to :func:`emit_python`."""
    return emit_python(
        introspect_to_result(
            connection,
            backend=backend,
            schema=schema,
            include_tables=include_tables,
            exclude_tables=exclude_tables,
        ),
        header=header,
    )
90
+
91
+
92
+ __all__ = [
93
+ "ColumnInfo",
94
+ "ForeignKeyInfo",
95
+ "HeuristicAnnotation",
96
+ "InformationSchemaProbe",
97
+ "IntrospectionResult",
98
+ "SchemaProbe",
99
+ "TableInfo",
100
+ "emit_python",
101
+ "introspect",
102
+ "introspect_catalog",
103
+ "introspect_to_python",
104
+ "introspect_to_result",
105
+ ]
@@ -0,0 +1,121 @@
1
+ """CLI entrypoint — ``python -m semql_introspect`` / ``semql-introspect``.
2
+
3
+ Connects to a database via the matching DB-API driver, runs the
4
+ introspector, and writes the emitted Python to stdout. Driver imports
5
+ happen lazily so a user introspecting DuckDB doesn't need ``psycopg``
6
+ installed.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import sys
13
+ from typing import Any, cast
14
+
15
+ from semql.model import Backend
16
+
17
+ from semql_introspect import introspect_to_python
18
+
19
+ _BACKENDS_BY_NAME = {b.value: b for b in Backend}
20
+
21
+
22
+ def _connect(backend_name: str, conn_string: str) -> Any: # noqa: ANN401
23
+ """Lazy-import a DB-API driver and open a connection.
24
+
25
+ Drivers are imported inline so the CLI launches even when only
26
+ one backend's driver is installed. Connection-string syntax is
27
+ driver-native (DSNs for psycopg, file paths for DuckDB)."""
28
+ if backend_name == "postgres":
29
+ # psycopg lacks py.typed; cast the module so attribute access yields Any.
30
+ import psycopg # type: ignore[import-not-found]
31
+
32
+ return cast(Any, psycopg).connect(conn_string)
33
+ if backend_name == "duckdb":
34
+ import duckdb
35
+
36
+ return cast(Any, duckdb).connect(conn_string)
37
+ if backend_name == "snowflake":
38
+ # snowflake-connector-python lacks py.typed.
39
+ from snowflake import connector as sf_connector # type: ignore[import-not-found]
40
+
41
+ # Snowflake takes kwargs; expect a key=value;... string.
42
+ kwargs: dict[str, str] = {}
43
+ for pair in conn_string.split(";"):
44
+ if not pair:
45
+ continue
46
+ k, _, v = pair.partition("=")
47
+ kwargs[k.strip()] = v.strip()
48
+ return cast(Any, sf_connector).connect(**kwargs)
49
+ raise SystemExit(
50
+ f"semql-introspect: no driver wired up for backend {backend_name!r}. "
51
+ "Supported via the CLI: postgres, duckdb, snowflake. For other "
52
+ "backends, call ``introspect_to_python`` from Python with a "
53
+ "connection you opened yourself."
54
+ )
55
+
56
+
57
def main(argv: list[str] | None = None) -> int:
    """Run the CLI: parse arguments, introspect, write Python to stdout.

    ``argv`` is forwarded to ``argparse``; ``None`` makes it read the
    process arguments. Returns ``0`` once the generated source has been
    written. The connection opened for the run is closed before
    returning, including when introspection raises."""
    parser = argparse.ArgumentParser(
        prog="semql-introspect",
        description=(
            "Generate a semql cube catalog from a live database. "
            "Reads Information Schema, applies heuristic measure / "
            "dimension inference, and emits Python."
        ),
    )
    parser.add_argument(
        "--backend",
        required=True,
        choices=sorted(_BACKENDS_BY_NAME),
        help="semql Backend tag stamped onto every emitted cube.",
    )
    parser.add_argument(
        "--schema",
        required=True,
        help="information_schema table_schema to scan.",
    )
    parser.add_argument(
        "--conn",
        required=True,
        help=(
            "Driver-native connection string (DSN for psycopg, file "
            "path for DuckDB, key=value;... for Snowflake)."
        ),
    )
    parser.add_argument(
        "--include",
        action="append",
        default=None,
        metavar="TABLE",
        help="Only introspect these tables (repeat for multiple).",
    )
    parser.add_argument(
        "--exclude",
        action="append",
        default=None,
        metavar="TABLE",
        help="Skip these tables (repeat for multiple).",
    )
    parser.add_argument(
        "--header",
        default=None,
        help="Custom docstring for the emitted module (defaults to a TODO-review reminder).",
    )
    args = parser.parse_args(argv)

    backend = _BACKENDS_BY_NAME[args.backend]
    connection = _connect(args.backend, args.conn)
    try:
        src = introspect_to_python(
            connection,
            backend=backend,
            schema=args.schema,
            include_tables=args.include,
            exclude_tables=args.exclude,
            header=args.header,
        )
    finally:
        # DB-API connections hold sockets / file locks; release them even
        # when introspection fails.
        connection.close()
    sys.stdout.write(src)
    return 0
118
+
119
+
120
+ if __name__ == "__main__":
121
+ raise SystemExit(main())
@@ -0,0 +1,148 @@
1
+ """Render a :class:`IntrospectionResult` as Python source code.
2
+
3
+ The output is a self-contained module that ``import``s from
4
+ ``semql.model`` and defines a ``CUBES: list[Cube]`` constant. The dev
5
+ review loop is: run the introspector, open the file, search for
6
+ ``# TODO: review`` comments, edit by hand. Heuristic annotations
7
+ get rendered inline so the dev sees *why* the tool picked each guess.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import textwrap
13
+
14
+ from semql.model import Cube, Join
15
+
16
+ from semql_introspect._introspect import IntrospectionResult
17
+
18
+
19
def emit_python(result: IntrospectionResult, *, header: str | None = None) -> str:
    """Turn an introspection result into the text of one Python module.

    The module starts with a docstring (``header`` when given, otherwise
    the default reminder), imports the model classes, defines one
    variable per cube and finishes with a ``CUBES`` list naming them.
    Annotation reasons are grouped per ``(cube, field)`` and passed to the
    cube renderer so they appear next to the field they explain."""
    reasons_by_field: dict[tuple[str, str], list[str]] = {}
    for note in result.annotations:
        reasons_by_field.setdefault((note.cube, note.field), []).append(note.reason)

    parts: list[str] = [
        _header(header),
        "from __future__ import annotations\n",
        "",
        "from semql.model import (",
        " Backend,",
        " Cube,",
        " Dimension,",
        " Join,",
        " Measure,",
        " TimeDimension,",
        ")",
        "",
        "",
    ]

    # Each cube definition is followed by one empty entry (a blank line).
    for cube in result.cubes:
        parts.extend((_render_cube(cube, reasons_by_field), ""))

    parts.append("CUBES: list[Cube] = [")
    parts.extend(f" {_cube_var(cube.name)}," for cube in result.cubes)
    parts.append("]")

    return "\n".join(parts) + "\n"
54
+
55
+
56
+ def _header(custom: str | None) -> str:
57
+ if custom is not None:
58
+ return f'"""{custom}"""\n\n'
59
+ return (
60
+ '"""Auto-generated by semql-introspect.\n\n'
61
+ "Review every ``# TODO: review`` comment before promoting this\n"
62
+ "file to a production catalog. The heuristic-inferred measures\n"
63
+ "and dimensions are educated guesses, not authoritative — the\n"
64
+ "introspector errs on the side of surfacing structure so a human\n"
65
+ "review pass can prune.\n"
66
+ '"""\n\n'
67
+ )
68
+
69
+
70
+ def _cube_var(cube_name: str) -> str:
71
+ """Variable name for a cube in the emitted module."""
72
+ return f"{cube_name}_cube"
73
+
74
+
75
def _render_cube(
    cube: Cube,
    annotations: dict[tuple[str, str], list[str]],
) -> str:
    """Render one cube as a ``<var> = Cube(...)`` assignment.

    Sections for measures, dimensions, time dimensions and joins are
    written only when the cube has entries for them. Measures and
    dimensions that have reasons in ``annotations`` (keyed by
    ``(cube name, field name)``) get a review comment directly above
    them. Empty ``questions`` / ``keywords`` placeholders with TODO
    comments are always appended."""

    def review_notes(field_name: str) -> list[str]:
        # Zero or one rendered comment block for the given field.
        reasons = annotations.get((cube.name, field_name))
        return [_indent_comment(reasons, 8)] if reasons else []

    out: list[str] = [
        f"{_cube_var(cube.name)} = Cube(",
        f" name={cube.name!r},",
        f" backend=Backend.{cube.backend.name},",
        f" table={cube.table!r},",
        f" alias={cube.alias!r},",
    ]
    if cube.primary_key is not None:
        out.append(f" primary_key={cube.primary_key!r},")

    if cube.measures:
        out.append(" measures=[")
        for measure in cube.measures:
            out += review_notes(measure.name)
            out.append(
                f" Measure(name={measure.name!r}, sql={measure.sql!r}, agg={measure.agg!r}),"
            )
        out.append(" ],")

    if cube.dimensions:
        out.append(" dimensions=[")
        for dim in cube.dimensions:
            out += review_notes(dim.name)
            args = [f"name={dim.name!r}", f"sql={dim.sql!r}", f"type={dim.type!r}"]
            if dim.foreign_key is not None:
                args.append(f"foreign_key={dim.foreign_key!r}")
            out.append(f" Dimension({', '.join(args)}),")
        out.append(" ],")

    if cube.time_dimensions:
        out.append(" time_dimensions=[")
        out.extend(
            f" TimeDimension(name={td.name!r}, sql={td.sql!r})," for td in cube.time_dimensions
        )
        out.append(" ],")

    if cube.joins:
        out.append(" joins=[")
        out.extend(_render_join(join) for join in cube.joins)
        out.append(" ],")

    # S7 — placeholder grounding fields, each preceded by a TODO comment,
    # so the introspect → review flow is explicit. They are meant to be
    # filled in by the user (or drafted with `semql suggest`).
    out += [
        " # TODO: review — questions users might literally ask of this cube.",
        " questions=[],",
        " # TODO: review — keywords for lexical retrieval (acronyms preserved).",
        " keywords=[],",
        ")",
    ]
    return "\n".join(out)
129
+
130
+
131
+ def _render_join(j: Join) -> str:
132
+ return f" Join(to={j.to!r}, relationship={j.relationship!r}, on={j.on!r}),"
133
+
134
+
135
+ def _indent_comment(reasons: list[str], indent: int) -> str:
136
+ prefix = " " * indent + "# TODO: review — "
137
+ wrapper = textwrap.TextWrapper(
138
+ initial_indent=prefix,
139
+ subsequent_indent=" " * indent + "# ",
140
+ width=88,
141
+ )
142
+ lines: list[str] = []
143
+ for reason in reasons:
144
+ lines.append(wrapper.fill(reason))
145
+ return "\n".join(lines)
146
+
147
+
148
+ __all__ = ["emit_python"]
@@ -0,0 +1,217 @@
1
+ """Column → field-kind classification.
2
+
3
+ Pure functions over :class:`ColumnInfo`. Heuristic choices live here so
4
+ the orchestrator can stay glue and tests can exercise each rule in
5
+ isolation. Every guess that isn't a hard rule emits a
6
+ ``heuristic_reason`` the emitter can surface as a ``# TODO: review``
7
+ comment — the dev reviewing the diff sees *why* the tool picked
8
+ ``count_distinct`` over a plain dimension.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from dataclasses import dataclass
14
+ from typing import Literal
15
+
16
+ from semql.model import DimTypeLiteral
17
+
18
+ from semql_introspect._probe import ColumnInfo
19
+
20
+ FieldKind = Literal[
21
+ "measure_sum",
22
+ "measure_count_distinct",
23
+ "time_dimension",
24
+ "dimension",
25
+ ]
26
+
27
+
28
@dataclass(frozen=True)
class Classification:
    """Outcome of classifying a single column.

    Immutable record pairing the chosen field kind with the optional
    extras that only some kinds use.
    """

    # Which catalog construct the column maps to.
    kind: FieldKind
    # semql dimension type; populated only when ``kind == "dimension"``.
    dim_type: DimTypeLiteral | None = None
    # Explanation of a guessed classification; empty for hard rules. The
    # emitter shows it as a ``# TODO: review`` comment beside the field.
    heuristic_reason: str = ""
44
+
45
+
46
+ # Column-name tokens that mark a numeric column as an additive measure.
47
+ # Plural / singular variants both included — the rule fires on a token
48
+ # match, not a full-string match.
49
+ _MEASURE_NAME_TOKENS = frozenset(
50
+ {
51
+ "amount",
52
+ "amounts",
53
+ "price",
54
+ "prices",
55
+ "revenue",
56
+ "revenues",
57
+ "cost",
58
+ "costs",
59
+ "total",
60
+ "totals",
61
+ "value",
62
+ "values",
63
+ "qty",
64
+ "quantity",
65
+ "quantities",
66
+ "spend",
67
+ "fee",
68
+ "fees",
69
+ "balance",
70
+ "balances",
71
+ }
72
+ )
73
+
74
+
75
+ def _normalize_type(data_type: str) -> str:
76
+ """Strip qualifiers/parameters from a SQL type string.
77
+
78
+ ``"timestamp without time zone"`` → ``"timestamp"``;
79
+ ``"VARCHAR(255)"`` → ``"varchar"``;
80
+ ``"numeric(18,2)"`` → ``"numeric"``.
81
+ """
82
+ t = data_type.lower().strip()
83
+ if "(" in t:
84
+ t = t.split("(", 1)[0].strip()
85
+ if " " in t:
86
+ t = t.split(" ", 1)[0]
87
+ return t
88
+
89
+
90
+ _NUMERIC_TYPES = frozenset(
91
+ {
92
+ "smallint",
93
+ "integer",
94
+ "int",
95
+ "int2",
96
+ "int4",
97
+ "int8",
98
+ "bigint",
99
+ "decimal",
100
+ "numeric",
101
+ "real",
102
+ "float",
103
+ "float4",
104
+ "float8",
105
+ "double",
106
+ "money",
107
+ }
108
+ )
109
+
110
+
111
+ _DATE_TYPES = frozenset(
112
+ {
113
+ "date",
114
+ "timestamp",
115
+ "timestamptz",
116
+ "datetime",
117
+ "time",
118
+ "timetz",
119
+ }
120
+ )
121
+
122
+
123
+ _BOOL_TYPES = frozenset({"boolean", "bool"})
124
+
125
+
126
def _is_numeric(data_type: str) -> bool:
    """Return whether ``data_type`` normalises to a known numeric type name."""
    normalized = _normalize_type(data_type)
    return normalized in _NUMERIC_TYPES
128
+
129
+
130
def _is_date(data_type: str) -> bool:
    """Return whether ``data_type`` normalises to a known date/time type name."""
    normalized = _normalize_type(data_type)
    return normalized in _DATE_TYPES
132
+
133
+
134
def _is_bool(data_type: str) -> bool:
    """Return whether ``data_type`` normalises to a known boolean type name."""
    normalized = _normalize_type(data_type)
    return normalized in _BOOL_TYPES
136
+
137
+
138
def _dim_type_for(data_type: str) -> DimTypeLiteral:
    """Choose the semql dimension type for a SQL type string.

    Numeric types map to ``"number"``, date/time types to ``"time"`` and
    boolean types to ``"bool"``, checked in that order. Anything else
    becomes ``"string"``, which the catalog author can refine after
    emission; it is the safer default because a numeric ID typed as a
    number invites accidental aggregation."""
    dim_type: DimTypeLiteral
    if _is_numeric(data_type):
        dim_type = "number"
    elif _is_date(data_type):
        dim_type = "time"
    elif _is_bool(data_type):
        dim_type = "bool"
    else:
        dim_type = "string"
    return dim_type
152
+
153
+
154
def classify_column(col: ColumnInfo, *, is_fk: bool, is_pk: bool) -> Classification:
    """Decide how a column should be modelled.

    The first matching rule wins:

    1. **Date / time type** → ``time_dimension``. Hard rule.
    2. **Primary-key or foreign-key column** → ``dimension`` typed by the
       SQL type. Hard rule; the measure heuristics below never apply to
       key columns.
    3. **Numeric column with a measure-name token** in its
       ``_``-separated, lower-cased name (amount / price / revenue /
       ...) → ``measure_sum``, with a review reason.
    4. **Name ending in ``_id``** (any type) → ``measure_count_distinct``,
       with a review reason.
    5. Anything else → ``dimension`` typed by the SQL type.
    """
    lowered = col.name.lower()

    if _is_date(col.data_type):
        return Classification(kind="time_dimension")

    # Key columns are identifiers, not measurements.
    if is_pk or is_fk:
        return Classification(kind="dimension", dim_type=_dim_type_for(col.data_type))

    looks_like_measure = _is_numeric(col.data_type) and not _MEASURE_NAME_TOKENS.isdisjoint(
        lowered.split("_")
    )
    if looks_like_measure:
        reason = (
            f"numeric column named {col.name!r} matched a measure-name "
            "token (amount/price/revenue/...) — confirm this should be a "
            "summable measure rather than a dimension."
        )
        return Classification(kind="measure_sum", heuristic_reason=reason)

    if lowered.endswith("_id"):
        reason = (
            f"column ends in ``_id`` ({col.name!r}); inferred a "
            "``count_distinct`` measure. Drop the measure if this column "
            "isn't useful as a count, or move it to a foreign-key "
            "dimension if it should link to another cube."
        )
        return Classification(kind="measure_count_distinct", heuristic_reason=reason)

    return Classification(kind="dimension", dim_type=_dim_type_for(col.data_type))
215
+
216
+
217
+ __all__ = ["Classification", "FieldKind", "classify_column"]
@@ -0,0 +1,200 @@
1
+ """Orchestrator — turn a :class:`SchemaProbe` into a list of ``Cube``s.
2
+
3
+ Reads tables / FKs / PKs from the probe, classifies each column via
4
+ :mod:`._heuristics`, and assembles ``Cube`` / ``Measure`` / ``Dimension``
5
+ / ``TimeDimension`` instances. The result is a pure
6
+ :class:`semql.model.Cube` list — ready to wrap in a
7
+ :class:`semql.Catalog` or to round-trip through the emitter.
8
+
9
+ Per-column heuristic reasons (the ``# TODO: review`` hints) ride along
10
+ on a parallel :class:`HeuristicAnnotation` list, indexed by
11
+ ``(cube_name, field_name)``. The emitter renders them as inline
12
+ comments; programmatic callers can ignore them.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+
19
+ from semql.model import (
20
+ Backend,
21
+ Cube,
22
+ Dimension,
23
+ Join,
24
+ Measure,
25
+ TimeDimension,
26
+ )
27
+
28
+ from semql_introspect._heuristics import classify_column
29
+ from semql_introspect._probe import SchemaProbe
30
+
31
+
32
@dataclass(frozen=True)
class HeuristicAnnotation:
    """A single review hint attached to one field of one cube."""

    # Name of the cube the hint belongs to.
    cube: str
    # Name of the measure / dimension within that cube.
    field: str
    # Human-readable explanation of the guess.
    reason: str
39
+
40
+
41
@dataclass(frozen=True)
class IntrospectionResult:
    """Envelope returned by the introspector.

    Holds the inferred cubes and, separately, the review hints that
    explain heuristic choices. The hints are presentation metadata for
    emitters and reports; the cubes are usable without them."""

    # Inferred cubes.
    cubes: list[Cube]
    # Review hints, each naming the cube and field it refers to.
    annotations: list[HeuristicAnnotation]
50
+
51
+
52
+ def _alias_for(table: str) -> str:
53
+ """Single-or-two-letter alias derived from the table name.
54
+
55
+ semql cube aliases must match ``[a-z_][a-z0-9_]*``. The introspect
56
+ convention: take the initial letters of ``_``-separated tokens
57
+ (``user_events`` → ``ue``); fall back to the first letter for
58
+ single-token names (``orders`` → ``o``). Aliases get
59
+ deduplicated at the catalog assembly step."""
60
+ tokens = [t for t in table.lower().split("_") if t]
61
+ if not tokens:
62
+ return "t"
63
+ if len(tokens) == 1:
64
+ return tokens[0][0]
65
+ return "".join(t[0] for t in tokens)
66
+
67
+
68
+ def _dedupe_aliases(cubes: list[Cube]) -> list[Cube]:
69
+ """Ensure every cube has a unique alias.
70
+
71
+ Two ``user_events`` / ``user_eligibility`` tables both alias to
72
+ ``ue`` by default — append a suffix to all but the first
73
+ collider."""
74
+ seen: dict[str, int] = {}
75
+ out: list[Cube] = []
76
+ for cube in cubes:
77
+ alias = cube.alias
78
+ n = seen.get(alias, 0)
79
+ if n > 0:
80
+ new_alias = f"{alias}{n + 1}"
81
+ seen[alias] = n + 1
82
+ cube = cube.model_copy(update={"alias": new_alias})
83
+ else:
84
+ seen[alias] = 1
85
+ out.append(cube)
86
+ return out
87
+
88
+
89
def introspect(probe: SchemaProbe, *, backend: Backend) -> IntrospectionResult:
    """Build a catalog from a probe + a target backend.

    ``backend`` is the semql ``Backend`` enum tag stamped onto every
    emitted cube — the introspector doesn't try to detect the dialect
    itself because a single probe shape often spans multiple backends
    (ANSI ``information_schema`` works against PG, DuckDB, and
    Snowflake).

    Each table becomes one cube. Columns are classified by
    :func:`classify_column`; foreign keys whose target table is also in
    the probe's table list become ``foreign_key`` dimensions plus
    ``many_to_one`` joins. Aliases are made unique *before* any SQL
    string is built, so the ``{alias}`` placeholders in field SQL and
    join conditions always match the alias stored on the cube."""
    tables = probe.list_tables()
    fks = probe.list_foreign_keys()
    pks = probe.list_primary_keys()

    table_names = {t.name for t in tables}

    # Final alias per table. Colliders get a numeric suffix (``ue``,
    # ``ue2``, ``ue3``, ...), skipping any value already in use. Doing
    # this up front — rather than renaming finished cubes with
    # ``_dedupe_aliases`` — keeps embedded SQL and the cube alias in sync.
    alias_by_table: dict[str, str] = {}
    used_aliases: set[str] = set()
    for tbl in tables:
        base = _alias_for(tbl.name)
        candidate = base
        suffix = 2
        while candidate in used_aliases:
            candidate = f"{base}{suffix}"
            suffix += 1
        used_aliases.add(candidate)
        alias_by_table[tbl.name] = candidate

    # ``{from_table.from_column → to_table}`` so the per-column loop
    # knows which dim should carry ``foreign_key=``.
    fk_by_source: dict[tuple[str, str], str] = {
        (fk.from_table, fk.from_column): fk.to_table for fk in fks if fk.to_table in table_names
    }

    cubes: list[Cube] = []
    annotations: list[HeuristicAnnotation] = []

    for tbl in tables:
        alias = alias_by_table[tbl.name]
        pk_col = pks.get(tbl.name)
        measures: list[Measure] = []
        dimensions: list[Dimension] = []
        time_dims: list[TimeDimension] = []
        cube_pk: str | None = None

        for col in tbl.columns:
            is_fk = (tbl.name, col.name) in fk_by_source
            is_pk = pk_col == col.name
            cls = classify_column(col, is_fk=is_fk, is_pk=is_pk)
            sql = f"{{{alias}}}.{col.name}"

            if cls.kind == "time_dimension":
                time_dims.append(TimeDimension(name=col.name, sql=sql))
            elif cls.kind == "measure_sum":
                measures.append(Measure(name=col.name, sql=sql, agg="sum"))
                if cls.heuristic_reason:
                    annotations.append(
                        HeuristicAnnotation(
                            cube=tbl.name, field=col.name, reason=cls.heuristic_reason
                        )
                    )
            elif cls.kind == "measure_count_distinct":
                # Distinct-count measure named ``distinct_<col>`` so the
                # output catalog reads cleanly — ``orders.distinct_customer_id``.
                m_name = f"distinct_{col.name}"
                measures.append(Measure(name=m_name, sql=sql, agg="count_distinct"))
                if cls.heuristic_reason:
                    annotations.append(
                        HeuristicAnnotation(
                            cube=tbl.name, field=m_name, reason=cls.heuristic_reason
                        )
                    )
            else:
                # Plain dimension. FK targets get ``foreign_key=...``;
                # everything else is just a typed dimension.
                assert cls.dim_type is not None  # classifier invariant
                fk_target = fk_by_source.get((tbl.name, col.name))
                dimensions.append(
                    Dimension(
                        name=col.name,
                        sql=sql,
                        type=cls.dim_type,
                        foreign_key=fk_target,
                    )
                )
                if is_pk:
                    cube_pk = col.name

        # Auto-derived joins: for every FK source on this table whose
        # target also got introspected, emit a ``many_to_one`` Join.
        # The semql catalog will further auto-derive from
        # ``Dimension.foreign_key`` at construction time, but spelling
        # the joins out in the generated source makes them visible at
        # review.
        joins: list[Join] = []
        for fk in fks:
            if fk.from_table != tbl.name or fk.to_table not in table_names:
                continue
            # Use the target's final (deduplicated) alias.
            target_alias = alias_by_table[fk.to_table]
            joins.append(
                Join(
                    to=fk.to_table,
                    relationship="many_to_one",
                    on=f"{{{alias}}}.{fk.from_column} = {{{target_alias}}}.{fk.to_column}",
                )
            )

        cubes.append(
            Cube(
                name=tbl.name,
                backend=backend,
                table=tbl.name,
                alias=alias,
                primary_key=cube_pk,
                measures=measures,
                dimensions=dimensions,
                time_dimensions=time_dims,
                joins=joins,
            )
        )

    return IntrospectionResult(cubes=cubes, annotations=annotations)
198
+
199
+
200
+ __all__ = ["HeuristicAnnotation", "IntrospectionResult", "introspect"]
@@ -0,0 +1,207 @@
1
+ """Schema probes — read tables / columns / FKs from a live database.
2
+
3
+ The probe abstracts the dialect-specific information_schema layout:
4
+ PG / DuckDB / Snowflake mostly share the ANSI ``information_schema``
5
+ shape; ClickHouse uses ``system.columns`` + ``system.tables``; BigQuery
6
+ adds a ``project_id.dataset`` prefix. v1 ships
7
+ :class:`InformationSchemaProbe` which covers the ANSI dialects;
8
+ non-ANSI dialects can implement :class:`SchemaProbe` directly without
9
+ touching the rest of the package.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import Iterable
15
+ from dataclasses import dataclass
16
+ from typing import Any, Protocol
17
+
18
+
19
@dataclass(frozen=True)
class ColumnInfo:
    """Immutable description of a single table column reported by a probe.

    The heuristics inspect ``data_type`` to choose the field kind and
    dimension type, so it is kept as the raw SQL type string exactly as
    the database returned it (for example ``"integer"`` or
    ``"timestamp without time zone"``).
    """

    # Column name as reported by the database.
    name: str
    # Raw, un-normalised SQL type string.
    data_type: str
    # Whether the database reports the column as nullable.
    is_nullable: bool
31
+
32
+
33
@dataclass(frozen=True)
class ForeignKeyInfo:
    """Immutable foreign-key edge between two columns.

    Reads as ``from_table.from_column → to_table.to_column``: the
    ``from_*`` pair is the referencing side, the ``to_*`` pair is the
    referenced side.
    """

    # Referencing (source) side of the edge.
    from_table: str
    from_column: str
    # Referenced (target) side of the edge.
    to_table: str
    to_column: str
41
+
42
+
43
@dataclass(frozen=True)
class TableInfo:
    """Immutable description of a table the catalog will materialise as a ``Cube``."""

    # Table name as reported by the probe.
    name: str
    # The table's columns, in the order the probe returned them.
    columns: tuple[ColumnInfo, ...]
49
+
50
+
51
class SchemaProbe(Protocol):
    """Structural interface for a read-only, dialect-specific schema reader.

    An implementation is expected to scope everything it returns
    (tables, columns, foreign keys) to the schema the caller asked
    about. The orchestrator takes the result as authoritative and
    applies no further filtering of its own.
    """

    def list_tables(self) -> list[TableInfo]:
        """Return the tables, with their columns, to introspect."""
        ...

    def list_foreign_keys(self) -> list[ForeignKeyInfo]:
        """Return the foreign-key edges between tables."""
        ...

    def list_primary_keys(self) -> dict[str, str]:
        """Return a ``{table_name: primary_key_column}`` mapping.

        Some warehouses don't ship constraints, so a probe may be unable
        to recover primary keys. Such a probe should return ``{}``; the
        heuristics then fall back to "first column named ``id``"
        detection.
        """
        ...
69
+
70
+
71
class InformationSchemaProbe:
    """ANSI ``information_schema`` probe.

    Works against any database that ships the standard
    ``information_schema.columns`` / ``information_schema.table_constraints``
    layout — Postgres, DuckDB, Snowflake (mostly), and SQL Server. The
    schema argument scopes ``table_schema``; pass it explicitly so
    catalog dumps don't accidentally span system schemas.

    Args:
        connection: Any DB-API 2.0 connection; only ``cursor()`` and the
            cursor's ``execute`` / ``fetchall`` / ``close`` are used.
        schema: Schema name every query is restricted to.
        include_tables: Optional allow-list of table names. ``None`` or
            an empty iterable means "no allow-list".
        exclude_tables: Optional deny-list of table names, applied after
            the allow-list.
    """

    def __init__(
        self,
        connection: Any,  # noqa: ANN401 — any DB-API 2.0 conn
        *,
        schema: str,
        include_tables: Iterable[str] | None = None,
        exclude_tables: Iterable[str] | None = None,
    ) -> None:
        self._conn = connection
        self._schema = schema
        self._include = set(include_tables) if include_tables else None
        self._exclude = set(exclude_tables or ())

    @staticmethod
    def _literal(value: str) -> str:
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so that a schema or table
        name containing ``'`` can neither break the statement nor inject
        SQL. Driver-side parameter binding is not used because the
        DB-API ``paramstyle`` differs between drivers and cannot be
        discovered from a bare connection object.

        NOTE(review): assumes the backend treats backslashes literally
        inside ``'...'`` (standard-conforming strings) — confirm for any
        dialect that enables backslash escapes.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def _fetch_all(self, sql: str) -> list[Any]:
        """Run ``sql`` on a fresh cursor and return every row.

        The cursor is closed even when ``execute`` or ``fetchall``
        raises; the exception itself is propagated unchanged.
        """
        cur = self._conn.cursor()
        try:
            cur.execute(sql)
            return cur.fetchall()
        finally:
            cur.close()

    def list_tables(self) -> list[TableInfo]:
        """Return the schema's base tables, with columns, ordered by name.

        The include list (when set) and then the exclude list are
        applied to the names the database reports.
        """
        rows = self._fetch_all(
            "SELECT table_name FROM information_schema.tables "
            f"WHERE table_schema = {self._literal(self._schema)} "
            "AND table_type = 'BASE TABLE' "
            "ORDER BY table_name"
        )
        table_names = [row[0] for row in rows]
        if self._include is not None:
            table_names = [t for t in table_names if t in self._include]
        table_names = [t for t in table_names if t not in self._exclude]
        return [
            TableInfo(name=tbl, columns=self._columns_for(tbl))
            for tbl in table_names
        ]

    def _columns_for(self, table: str) -> tuple[ColumnInfo, ...]:
        """Return the columns of ``table`` in ordinal-position order."""
        rows = self._fetch_all(
            "SELECT column_name, data_type, is_nullable "
            "FROM information_schema.columns "
            f"WHERE table_schema = {self._literal(self._schema)} "
            f"AND table_name = {self._literal(table)} "
            "ORDER BY ordinal_position"
        )
        return tuple(
            ColumnInfo(
                name=row[0],
                data_type=str(row[1]),
                # ``is_nullable`` is compared as the string "YES",
                # case-insensitively.
                is_nullable=(str(row[2]).upper() == "YES"),
            )
            for row in rows
        )

    def list_foreign_keys(self) -> list[ForeignKeyInfo]:
        """Return every foreign-key column pair declared in the schema."""
        # ``information_schema.constraint_column_usage`` differs per
        # backend: Postgres returns the referenced (target) table for
        # FK constraints, DuckDB returns the source table. The more
        # portable shape is ``referential_constraints`` → the PK
        # constraint on the referenced side, joined via ``key_column_usage``
        # on both ends with matching ordinal positions for composite
        # keys. Works on PG / DuckDB / Snowflake unchanged.
        rows = self._fetch_all(
            "SELECT kcu_from.table_name AS from_table, "
            "       kcu_from.column_name AS from_column, "
            "       kcu_to.table_name AS to_table, "
            "       kcu_to.column_name AS to_column "
            "FROM information_schema.referential_constraints rc "
            "JOIN information_schema.key_column_usage kcu_from "
            "  ON rc.constraint_name = kcu_from.constraint_name "
            " AND rc.constraint_schema = kcu_from.constraint_schema "
            "JOIN information_schema.key_column_usage kcu_to "
            "  ON rc.unique_constraint_name = kcu_to.constraint_name "
            " AND rc.unique_constraint_schema = kcu_to.constraint_schema "
            " AND kcu_from.ordinal_position = kcu_to.ordinal_position "
            f"WHERE rc.constraint_schema = {self._literal(self._schema)} "
            "ORDER BY kcu_from.table_name, kcu_from.ordinal_position"
        )
        return [
            ForeignKeyInfo(
                from_table=row[0],
                from_column=row[1],
                to_table=row[2],
                to_column=row[3],
            )
            for row in rows
        ]

    def list_primary_keys(self) -> dict[str, str]:
        """Return ``{table_name: primary_key_column}`` for the schema."""
        # The join also matches on ``table_name``: a constraint name is
        # not guaranteed to be unique across a whole schema, so joining
        # on name + schema alone could pull in key columns that belong
        # to a same-named constraint on a different table.
        rows = self._fetch_all(
            "SELECT kcu.table_name, kcu.column_name "
            "FROM information_schema.table_constraints tc "
            "JOIN information_schema.key_column_usage kcu "
            "  ON tc.constraint_name = kcu.constraint_name "
            " AND tc.table_schema = kcu.table_schema "
            " AND tc.table_name = kcu.table_name "
            "WHERE tc.constraint_type = 'PRIMARY KEY' "
            f"  AND tc.table_schema = {self._literal(self._schema)} "
            "ORDER BY kcu.table_name, kcu.ordinal_position"
        )
        # If a PK is composite, the first column wins — the catalog
        # model only supports a single ``primary_key`` per cube. A
        # ``# TODO: review`` will surface composite-key tables anyway.
        out: dict[str, str] = {}
        for row in rows:
            out.setdefault(str(row[0]), str(row[1]))
        return out
199
+
200
+
201
+ __all__ = [
202
+ "ColumnInfo",
203
+ "ForeignKeyInfo",
204
+ "InformationSchemaProbe",
205
+ "SchemaProbe",
206
+ "TableInfo",
207
+ ]
File without changes