catalogkit-query 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ """Public package surface for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from catalogkit.core import CatalogArtifact
6
+
7
+ from ._version import __version__
8
+ from .api import build_catalog_artifact, build_query_map, render_json, render_text
9
+ from .errors import QueryMapContractError, QueryMapError, QueryMapParseError
10
+ from .models import (
11
+ OutputColumn,
12
+ OutputSourceHint,
13
+ QueryMap,
14
+ QuerySummary,
15
+ Relation,
16
+ RelationEdge,
17
+ RelationUsage,
18
+ WarningEntry,
19
+ )
20
+
21
+ __all__ = [
22
+ "__version__",
23
+ "build_catalog_artifact",
24
+ "build_query_map",
25
+ "CatalogArtifact",
26
+ "OutputColumn",
27
+ "OutputSourceHint",
28
+ "QueryMap",
29
+ "QuerySummary",
30
+ "QueryMapContractError",
31
+ "QueryMapError",
32
+ "QueryMapParseError",
33
+ "Relation",
34
+ "RelationEdge",
35
+ "RelationUsage",
36
+ "render_json",
37
+ "render_text",
38
+ "WarningEntry",
39
+ ]
@@ -0,0 +1,9 @@
1
+ """Module entrypoint for `python -m catalogkit.query`."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .cli import main
6
+
7
+
8
+ if __name__ == "__main__":
9
+ raise SystemExit(main())
@@ -0,0 +1,3 @@
1
+ """Package version."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,39 @@
1
+ """Public API for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from catalogkit.core import CatalogArtifact
6
+
7
+ from .build import build_catalog_artifact_from_parsed, build_query_map_from_parsed
8
+ from .models import QueryMap
9
+ from .parser import parse_statement
10
+ from .render.json import render_json
11
+ from .render.text import render_text
12
+
13
+
14
def build_query_map(
    sql: str,
    *,
    dialect: str,
) -> QueryMap:
    """Parse one SQL statement and return its ``QueryMap`` artifact.

    Args:
        sql: Text of the SQL statement to analyse.
        dialect: sqlglot dialect name forwarded to ``parse_statement``.

    Returns:
        The ``QueryMap`` produced by ``build_query_map_from_parsed``.
    """
    return build_query_map_from_parsed(parse_statement(sql, dialect=dialect))
22
+
23
+
24
def build_catalog_artifact(
    sql: str,
    *,
    dialect: str,
) -> CatalogArtifact:
    """Parse one SQL statement and return the shared ``CatalogArtifact``.

    Args:
        sql: Text of the SQL statement to analyse.
        dialect: sqlglot dialect name forwarded to ``parse_statement``.

    Returns:
        The artifact produced by ``build_catalog_artifact_from_parsed``.
    """
    return build_catalog_artifact_from_parsed(parse_statement(sql, dialect=dialect))
32
+
33
+
34
+ __all__ = [
35
+ "build_catalog_artifact",
36
+ "build_query_map",
37
+ "render_json",
38
+ "render_text",
39
+ ]
@@ -0,0 +1,72 @@
1
+ """Shared sqlglot AST helpers for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+
7
+ from sqlglot import exp
8
+
9
+
10
def iter_ctes(root_expression: exp.Expression) -> Iterable[exp.CTE]:
    """Iterate over each ``exp.CTE`` node that ``find_all`` reports under the root."""
    for cte in root_expression.find_all(exp.CTE):
        yield cte
13
+
14
+
15
def cte_name(cte: exp.CTE) -> str:
    """Return the alias of ``cte`` as a string.

    A ``str`` alias is returned untouched; any other value is converted with
    ``str`` and stripped, with falsy values becoming the empty string.
    """
    raw_alias = cte.alias
    return raw_alias if isinstance(raw_alias, str) else str(raw_alias or "").strip()
21
+
22
+
23
def qualified_table_name(table: exp.Table) -> str:
    """Join the non-blank catalog, db and name parts of ``table`` with dots.

    Parts are taken in catalog, db, name order; parts that are falsy or blank
    after stripping are left out.
    """
    segments: list[str] = []
    for raw_part in (table.catalog, table.db, table.name):
        if not str(raw_part or "").strip():
            continue
        segments.append(str(raw_part).strip())
    return ".".join(segments)
31
+
32
+
33
def has_join_ancestor(node: exp.Expression, *, stop_node: exp.Expression) -> bool:
    """Report whether an ``exp.Join`` sits between ``node`` and ``stop_node``.

    The parent chain of ``node`` is followed upwards until it runs out or
    reaches ``stop_node``; ``stop_node`` itself is never inspected.
    """
    ancestor = node.parent
    while not (ancestor is None or ancestor is stop_node):
        if isinstance(ancestor, exp.Join):
            return True
        ancestor = ancestor.parent
    return False
41
+
42
+
43
def iter_table_nodes(expression: exp.Expression | None) -> Iterable[exp.Table]:
    """Iterate over every ``exp.Table`` under ``expression``; nothing for ``None``."""
    if expression is not None:
        yield from expression.find_all(exp.Table)
49
+
50
+
51
def iter_table_nodes_skipping_ctes(
    expression: exp.Expression | None,
) -> Iterable[exp.Table]:
    """Iterate over table nodes under ``expression`` without entering CTE nodes.

    Delegates to ``_walk_tables`` with ``skip_ctes=True``; a ``None`` expression
    produces no items.
    """
    if expression is not None:
        yield from _walk_tables(expression, skip_ctes=True)
58
+
59
+
60
def _walk_tables(expression: exp.Expression, *, skip_ctes: bool) -> Iterable[exp.Table]:
    """Depth-first walk over ``expression.args`` yielding ``exp.Table`` nodes.

    A table node is yielded and not descended into. When ``skip_ctes`` is true,
    an ``exp.CTE`` node and everything beneath it is ignored.
    """
    if skip_ctes and isinstance(expression, exp.CTE):
        return
    if isinstance(expression, exp.Table):
        yield expression
        return
    for arg_value in expression.args.values():
        # Treat single children and lists of children uniformly.
        candidates = arg_value if isinstance(arg_value, list) else [arg_value]
        for candidate in candidates:
            if isinstance(candidate, exp.Expression):
                yield from _walk_tables(candidate, skip_ctes=skip_ctes)
@@ -0,0 +1,249 @@
1
+ """Artifact assembly for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, cast
6
+
7
+ from catalogkit.core import CatalogArtifact, Edge, Evidence, Warning, merge
8
+ from sqlglot import exp
9
+
10
+ from .ctes import extract_dependency_edges
11
+ from .errors import QueryMapContractError
12
+ from .models import (
13
+ EdgeKind as QueryMapEdgeKind,
14
+ QueryMap,
15
+ QuerySummary,
16
+ RelationEdge,
17
+ RelationUsage,
18
+ WarningCode,
19
+ WarningEntry,
20
+ )
21
+ from .parser import ParsedStatement
22
+ from .relations import RelationExtraction, extract_relations
23
+
24
# Edge kinds accepted by _relation_edge_from_catalog_edge; any other kind
# raises QueryMapContractError there.
_QUERYMAP_EDGE_KINDS = {"depends_on", "joins"}
# Warning codes accepted by _warning_entry_from_warning; any other code
# raises QueryMapContractError there.
_QUERYMAP_WARNING_CODES = {
    "parse_recovered",
    "select_star",
    "table_star",
    "ambiguous_output_source",
    "unresolved_output_source",
    "non_equi_join",
    "unsupported_construct",
}
34
+
35
+
36
def build_query_map_from_parsed(parsed: ParsedStatement) -> QueryMap:
    """Assemble the public ``QueryMap`` for an already parsed statement.

    The shared catalog artifact is built first; its edges and warnings are then
    converted to the public models and de-duplicated. ``outputs`` is always an
    empty list and ``output_count`` is always 0 here.

    Raises:
        QueryMapContractError: propagated from artifact construction or from
            converting an edge or warning the public contract does not allow.
    """
    artifact, extraction = _build_catalog_artifact_from_parsed(parsed)

    relation_edges = [_relation_edge_from_catalog_edge(item) for item in artifact.edges]
    warning_entries = [_warning_entry_from_warning(item) for item in artifact.warnings]

    cte_total = len(extraction.cte_nodes)
    summary = QuerySummary(
        dialect=parsed.dialect,
        statement_type=parsed.statement.key.lower(),
        has_ctes=cte_total > 0,
        relation_count=len(extraction.relations),
        cte_count=cte_total,
        output_count=0,
    )

    # The public usage model carries normalized_sql; it mirrors the raw SQL.
    public_usages = []
    for usage in extraction.relation_usages:
        public_usages.append(
            RelationUsage(
                relation_id=usage.relation_id,
                alias=usage.alias,
                context=usage.context,
                sql=usage.sql,
                normalized_sql=usage.sql,
            )
        )

    return QueryMap(
        version=artifact.version,
        summary=summary,
        relations=extraction.relations,
        relation_usages=public_usages,
        edges=_dedupe_query_map_edges(relation_edges),
        outputs=[],
        warnings=_dedupe_warnings(warning_entries),
    )
69
+
70
+
71
def build_catalog_artifact_from_parsed(parsed: ParsedStatement) -> CatalogArtifact:
    """Return only the ``CatalogArtifact`` built for ``parsed``.

    The relation-extraction metadata returned alongside it by
    ``_build_catalog_artifact_from_parsed`` is discarded.
    """
    return _build_catalog_artifact_from_parsed(parsed)[0]
75
+
76
+
77
def _build_catalog_artifact_from_parsed(
    parsed: ParsedStatement,
) -> tuple[CatalogArtifact, RelationExtraction]:
    """Build the shared artifact and return it with the relation extraction.

    Each relation usage is attached as high-confidence evidence to a deep copy
    of its node, and the resulting artifact is passed through ``merge``.

    Raises:
        QueryMapContractError: when relation extraction produced no nodes.
    """
    root = parsed.root_expression
    dialect = parsed.dialect

    relation_extraction = extract_relations(root, dialect=dialect)
    dependency_extraction = extract_dependency_edges(
        root,
        relation_extraction=relation_extraction,
        dialect=dialect,
    )

    if not relation_extraction.nodes:
        raise QueryMapContractError("No tables or CTE relations were found in the SQL statement.")

    # Work on copies so the extraction's own nodes are not mutated.
    nodes_by_id = {}
    for original in relation_extraction.nodes:
        nodes_by_id[original.id] = original.model_copy(deep=True)

    for usage in relation_extraction.relation_usages:
        usage_evidence = Evidence(
            location=usage.context,
            expression=usage.sql,
            confidence="high",
        )
        nodes_by_id[usage.relation_id].evidence.append(usage_evidence)

    ordered_nodes = sorted(nodes_by_id.values(), key=lambda node: node.id)
    contract_warnings = _extract_contract_warnings(root, dialect=dialect)
    artifact = CatalogArtifact(
        nodes=ordered_nodes,
        edges=dependency_extraction.edges,
        warnings=[*dependency_extraction.warnings, *contract_warnings],
    )
    return merge(artifact), relation_extraction
111
+
112
+
113
def _extract_contract_warnings(
    root_expression: Any,
    *,
    dialect: str,
) -> list[Warning]:
    """Collect warnings for constructs the builder maps only partially.

    Emitted in this order: the first UNION, INTERSECT and EXCEPT found; every
    ``*`` / ``table.*`` projection; every join lacking ON/USING; and every join
    whose ON clause is not a pure equality conjunction. Each warning's location
    is the offending node rendered as SQL in ``dialect``.
    """
    collected: list[Warning] = []

    set_operation_messages = (
        (exp.Union, "UNION queries are mapped at the relation level only in the MVP."),
        (exp.Intersect, "INTERSECT queries are mapped at the relation level only in the MVP."),
        (exp.Except, "EXCEPT queries are mapped at the relation level only in the MVP."),
    )
    for node_type, message in set_operation_messages:
        first_match = root_expression.find(node_type)
        if first_match is not None:
            collected.append(
                Warning(
                    code="unsupported_construct",
                    message=message,
                    location=first_match.sql(dialect=dialect),
                )
            )

    for select in root_expression.find_all(exp.Select):
        for selection in select.expressions:
            if isinstance(selection, exp.Star):
                code = "select_star"
                message = "SELECT * was detected; output mapping is deferred in the MVP."
            elif isinstance(selection, exp.Column) and str(selection.name or "").strip() == "*":
                code = "table_star"
                message = "table.* was detected; output mapping is deferred in the MVP."
            else:
                continue
            collected.append(
                Warning(
                    code=code,
                    message=message,
                    location=selection.sql(dialect=dialect),
                )
            )

    for join in root_expression.find_all(exp.Join):
        on_clause = join.args.get("on")
        if on_clause is None:
            # USING-only joins are accepted silently; bare joins are flagged.
            if join.args.get("using") is None:
                collected.append(
                    Warning(
                        code="unsupported_construct",
                        message="JOIN without ON/USING is not modeled beyond relation dependency mapping.",
                        location=join.sql(dialect=dialect),
                    )
                )
            continue
        if not _is_equality_join(on_clause):
            collected.append(
                Warning(
                    code="non_equi_join",
                    message="Non-equality join detected; MVP preserves relation dependencies but does not model join semantics.",
                    location=join.sql(dialect=dialect),
                )
            )

    return collected
189
+
190
+
191
def _is_equality_join(expression: Any) -> bool:
    """Return True when a join condition consists solely of equality tests.

    The condition qualifies when it is an ``exp.EQ`` or an ``exp.And`` whose
    two operands both qualify. Redundant parentheses are unwrapped first, so
    ``ON (a.id = b.id)`` and ``ON (a.x = b.x) AND (a.y = b.y)`` are treated
    the same as their unparenthesised forms.
    """
    # sqlglot wraps a parenthesised condition in exp.Paren; look through it.
    while isinstance(expression, exp.Paren):
        expression = expression.this
    if isinstance(expression, exp.EQ):
        return True
    if isinstance(expression, exp.And):
        return _is_equality_join(expression.left) and _is_equality_join(expression.right)
    return False
197
+
198
+
199
def _relation_edge_from_catalog_edge(edge: Edge) -> RelationEdge:
    """Convert a catalog ``Edge`` into the public ``RelationEdge`` model.

    The expression of the edge's first evidence entry, if any, is used for
    both ``sql`` and ``normalized_sql``.

    Raises:
        QueryMapContractError: when ``edge.kind`` is not in
            ``_QUERYMAP_EDGE_KINDS``.
    """
    if edge.kind not in _QUERYMAP_EDGE_KINDS:
        raise QueryMapContractError(
            f"catalogkit-query cannot emit unsupported edge kind {edge.kind!r} in its public contract."
        )

    evidence_sql = None
    if edge.evidence:
        evidence_sql = edge.evidence[0].expression

    return RelationEdge(
        kind=cast(QueryMapEdgeKind, edge.kind),
        source_id=edge.source_id,
        target_id=edge.target_id,
        label=edge.label,
        confidence=edge.confidence,
        sql=evidence_sql,
        normalized_sql=evidence_sql,
    )
214
+
215
+
216
def _warning_entry_from_warning(warning: Warning) -> WarningEntry:
    """Convert a catalog ``Warning`` into the public ``WarningEntry`` model.

    Raises:
        QueryMapContractError: when ``warning.code`` is not in
            ``_QUERYMAP_WARNING_CODES``.
    """
    code_is_supported = warning.code in _QUERYMAP_WARNING_CODES
    if not code_is_supported:
        raise QueryMapContractError(
            f"catalogkit-query cannot emit unsupported warning code {warning.code!r} in its public contract."
        )
    public_code = cast(WarningCode, warning.code)
    return WarningEntry(code=public_code, message=warning.message, location=warning.location)
226
+
227
+
228
+ def _dedupe_query_map_edges(edges: list[RelationEdge]) -> list[RelationEdge]:
229
+ seen: set[tuple[str, str, str | None]] = set()
230
+ deduped: list[RelationEdge] = []
231
+ for edge in edges:
232
+ key = (edge.source_id, edge.target_id, edge.sql)
233
+ if key in seen:
234
+ continue
235
+ seen.add(key)
236
+ deduped.append(edge)
237
+ return deduped
238
+
239
+
240
+ def _dedupe_warnings(warnings: list[WarningEntry]) -> list[WarningEntry]:
241
+ seen: set[tuple[str, str, str | None]] = set()
242
+ deduped: list[WarningEntry] = []
243
+ for warning in warnings:
244
+ key = (warning.code, warning.message, warning.location)
245
+ if key in seen:
246
+ continue
247
+ seen.add(key)
248
+ deduped.append(warning)
249
+ return deduped
@@ -0,0 +1,61 @@
1
+ """CLI entrypoint for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import sys
8
+ from pathlib import Path
9
+
10
+ from . import __version__
11
+ from .api import build_query_map, render_json, render_text
12
+ from .errors import QueryMapError
13
+
14
+
15
def build_parser() -> argparse.ArgumentParser:
    """Create the ``catalogkit-query`` argument parser.

    Defines the positional ``sql_file``, the required ``--dialect``, the
    ``--format`` choice (``text`` by default) and ``--version``.
    """
    cli = argparse.ArgumentParser(
        prog="catalogkit-query",
        description="Map one supported SQL statement into deterministic catalog dependencies.",
    )

    sql_file_help = "Path to a UTF-8 SQL file containing exactly one supported statement."
    cli.add_argument("sql_file", help=sql_file_help)

    dialect_help = "sqlglot dialect name, for example postgres, snowflake, tsql, or bigquery."
    cli.add_argument("--dialect", required=True, help=dialect_help)

    cli.add_argument(
        "--format",
        choices=("text", "json"),
        default="text",
        help="Renderer output format.",
    )
    cli.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    return cli
41
+
42
+
43
def main(argv: list[str] | None = None) -> int:
    """Run the command-line interface and return a process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        0 after the rendered artifact has been printed to stdout, or 1 after
        an error message has been printed to stderr.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        sql = Path(args.sql_file).read_text(encoding="utf-8")
        artifact = build_query_map(sql, dialect=args.dialect)
        if args.format == "json":
            print(json.dumps(render_json(artifact), indent=2, sort_keys=False))
        else:
            print(render_text(artifact))
        return 0
    # UnicodeDecodeError is a ValueError, not an OSError: without it a file
    # that is not valid UTF-8 would escape as a traceback instead of exit 1.
    except (OSError, UnicodeDecodeError, QueryMapError) as exc:
        print(f"catalogkit-query error: {exc}", file=sys.stderr)
        return 1
58
+
59
+
60
+ if __name__ == "__main__":
61
+ raise SystemExit(main())
@@ -0,0 +1,111 @@
1
+ """Dependency edge extraction for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from catalogkit.core import Edge, Evidence, Warning, cte_id, normalize_identifier, normalize_identifier_part
8
+ from sqlglot import exp
9
+
10
+ from .ast_utils import cte_name, iter_ctes, iter_table_nodes, iter_table_nodes_skipping_ctes, qualified_table_name
11
+ from .errors import QueryMapContractError
12
+ from .relations import RelationExtraction
13
+
14
+
15
# Result of extract_dependency_edges: the edges and warnings it collected.
@dataclass(frozen=True)
class DependencyExtraction:
    # "depends_on" edges from each CTE and from "query:root" to their relations.
    edges: list[Edge]
    # Warnings raised while extracting (self-referencing CTE references).
    warnings: list[Warning]
19
+
20
+
21
def extract_dependency_edges(
    root_expression: exp.Expression,
    *,
    relation_extraction: RelationExtraction,
    dialect: str,
) -> DependencyExtraction:
    """Collect ``depends_on`` edges for every CTE body and for the root query.

    CTE edges use the CTE id as source and ``"cte_body"`` as evidence location;
    root edges use ``"query:root"`` and ``"root_query"``. A reference that
    resolves to the CTE being scanned yields an ``unsupported_construct``
    warning instead of an edge. Tables inside CTE definitions are skipped when
    scanning the root query.
    """
    collected_edges: list[Edge] = []
    collected_warnings: list[Warning] = []

    for cte in iter_ctes(root_expression):
        raw_cte_name = cte_name(cte)
        if not raw_cte_name:
            continue
        cte_source_id = cte_id(raw_cte_name)
        for table in iter_table_nodes(cte.this):
            resolved_id = _resolve_relation_id(table, relation_extraction=relation_extraction)
            if resolved_id is None:
                continue
            if resolved_id == cte_source_id:
                collected_warnings.append(
                    Warning(
                        code="unsupported_construct",
                        message=f"Recursive CTE {normalize_identifier_part(raw_cte_name)!r} is not modeled explicitly in the MVP.",
                        location=table.sql(dialect=dialect),
                    )
                )
                continue
            collected_edges.append(
                _depends_on_edge(
                    source_id=cte_source_id,
                    target_id=resolved_id,
                    location="cte_body",
                    expression=table.sql(dialect=dialect),
                )
            )

    for table in iter_table_nodes_skipping_ctes(root_expression):
        resolved_id = _resolve_relation_id(table, relation_extraction=relation_extraction)
        if resolved_id is None:
            continue
        collected_edges.append(
            _depends_on_edge(
                source_id="query:root",
                target_id=resolved_id,
                location="root_query",
                expression=table.sql(dialect=dialect),
            )
        )

    return DependencyExtraction(edges=collected_edges, warnings=collected_warnings)


def _depends_on_edge(*, source_id: str, target_id: str, location: str, expression: str) -> Edge:
    """Build one high-confidence ``depends_on`` edge with a single evidence entry."""
    return Edge(
        kind="depends_on",
        source_id=source_id,
        target_id=target_id,
        label="depends_on",
        confidence="high",
        evidence=[
            Evidence(
                location=location,
                expression=expression,
                confidence="high",
            )
        ],
    )
88
+
89
+
90
def _resolve_relation_id(
    table: exp.Table,
    *,
    relation_extraction: RelationExtraction,
) -> str | None:
    """Map a table reference to the id of a known CTE or table node.

    An unqualified reference is matched against CTE names first and table
    names second. A reference carrying a db or catalog qualifier (for example
    ``raw.orders``) is matched against table nodes first, so it is not
    mistaken for a CTE that merely shares its leaf name -- otherwise
    ``WITH orders AS (SELECT * FROM raw.orders)`` looks self-referencing.

    Args:
        table: The table reference taken from the AST.
        relation_extraction: Lookup tables produced by relation extraction.

    Returns:
        The matching node id, or ``None`` when the reference has no name.

    Raises:
        QueryMapContractError: when no CTE or table node matches.
    """
    name = str(table.name or "").strip()
    if not name:
        return None

    is_qualified = bool(str(table.db or "").strip() or str(table.catalog or "").strip())
    cte_node = relation_extraction.cte_nodes.get(normalize_identifier_part(name))
    if cte_node is not None and not is_qualified:
        return cte_node.id

    qualified_name = qualified_table_name(table)
    normalized_qualified_name = normalize_identifier(qualified_name)
    table_node = relation_extraction.table_nodes.get(normalized_qualified_name)
    if table_node is not None:
        return table_node.id

    if cte_node is not None:
        # No table node exists for this qualified name; keep resolving to the
        # same-named CTE rather than failing the whole statement.
        return cte_node.id

    raise QueryMapContractError(
        f"Unresolved relation id for table reference {qualified_name!r}."
    )
@@ -0,0 +1,15 @@
1
+ """Package-specific errors for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class QueryMapError(Exception):
    """Base class for catalogkit-query failures.

    The CLI ``main`` catches this type, prints the message to stderr and
    returns exit code 1.
    """
8
+
9
+
10
class QueryMapParseError(QueryMapError):
    """Raised when SQL cannot be parsed into a supported AST.

    ``parse_statement`` raises it for empty input, for any exception raised by
    ``sqlglot.parse``, and when parsing yields no statements.
    """
12
+
13
+
14
class QueryMapContractError(QueryMapError):
    """Raised when parsed SQL cannot be represented by the current contract.

    Raised, for example, for input holding more than one statement, for a
    statement whose root is not a query, when no relations are found, and for
    edge kinds or warning codes outside the public contract.
    """
@@ -0,0 +1,93 @@
1
+ """Public artifact models for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field
8
+
9
# Closed string vocabularies used as field types by the models below.
# Confidence level attached to edges and output source hints.
Confidence = Literal["high", "medium", "low"]
# Kind of relation a query can reference.
RelationKind = Literal["table", "cte"]
# Where in the statement a relation reference was found.
RelationUsageContext = Literal["from", "join", "cte_body"]
# Edge kinds allowed in the public artifact.
EdgeKind = Literal["depends_on", "joins"]
# Warning codes allowed in the public artifact.
WarningCode = Literal[
    "parse_recovered",
    "select_star",
    "table_star",
    "ambiguous_output_source",
    "unresolved_output_source",
    "non_equi_join",
    "unsupported_construct",
]
22
+
23
+
24
# Headline facts about the analysed statement.
class QuerySummary(BaseModel):
    # Dialect name the statement was parsed with.
    dialect: str
    # Lower-cased key of the parsed statement node, as set by the builder.
    statement_type: str
    # True when at least one CTE was found.
    has_ctes: bool
    # Number of relations (tables and CTEs) reported.
    relation_count: int
    # Number of CTEs found.
    cte_count: int
    # Number of output columns; the builder in this package always passes 0.
    output_count: int = 0
31
+
32
+
33
# A table or CTE referenced by the statement.
class Relation(BaseModel):
    # Allow construction with either the field name or its "schema" alias.
    model_config = ConfigDict(populate_by_name=True)

    # Relation identifier referenced by usages and edges.
    id: str
    kind: RelationKind
    # Relation name as supplied by the extractor.
    name: str
    # Dotted name including qualifiers; None by default (CTE relations built
    # in this package do not set it).
    qualified_name: str | None = None
    # Stored as schema_name but accepted and serialized as "schema".
    schema_name: str | None = Field(
        default=None,
        alias="schema",
        serialization_alias="schema",
    )
45
+
46
+
47
# One place in the statement where a relation is referenced.
class RelationUsage(BaseModel):
    # Id of the referenced relation.
    relation_id: str
    # Alias given to the reference, if any.
    alias: str | None = None
    # Position of the reference: "from", "join" or "cte_body".
    context: RelationUsageContext
    # SQL text of the reference.
    sql: str
    # The builder in this package fills this with the same text as ``sql``.
    normalized_sql: str | None = None
53
+
54
+
55
# Directed relationship between two relations (or the root query and one).
class RelationEdge(BaseModel):
    kind: EdgeKind
    # Id of the dependent side of the edge.
    source_id: str
    # Id of the relation depended upon.
    target_id: str
    label: str | None = None
    confidence: Confidence = "high"
    # SQL expression backing the edge, when evidence is available.
    sql: str | None = None
    # The builder in this package fills this with the same text as ``sql``.
    normalized_sql: str | None = None
63
+
64
+
65
# Hint about where an output column comes from; every field is optional.
class OutputSourceHint(BaseModel):
    # Id of the relation presumed to supply the column -- TODO confirm usage.
    relation_id: str | None = None
    # Name of the source column, when known.
    column_name: str | None = None
    confidence: Confidence = "medium"
69
+
70
+
71
# One projected output column. The builder in this package currently emits
# an empty outputs list, so nothing visible here populates this model.
class OutputColumn(BaseModel):
    name: str
    # Position of the column in the output -- base (0 or 1) not shown here.
    ordinal: int
    # SQL text of the projected expression.
    expression_sql: str
    normalized_expression_sql: str | None = None
    inferred: bool = False
    # Source hints for this column; defaults to a fresh empty list.
    sources: list[OutputSourceHint] = Field(default_factory=list)
78
+
79
+
80
# A warning attached to the artifact.
class WarningEntry(BaseModel):
    # One of the codes allowed by WarningCode.
    code: WarningCode
    # Human-readable explanation.
    message: str
    # SQL text of the construct that triggered the warning, when available.
    location: str | None = None
84
+
85
+
86
# Top-level public artifact describing one SQL statement.
class QueryMap(BaseModel):
    # Artifact version string.
    version: str = "1"
    summary: QuerySummary
    # All list fields default to fresh empty lists.
    relations: list[Relation] = Field(default_factory=list)
    relation_usages: list[RelationUsage] = Field(default_factory=list)
    edges: list[RelationEdge] = Field(default_factory=list)
    # Always empty when produced by the builder in this package.
    outputs: list[OutputColumn] = Field(default_factory=list)
    warnings: list[WarningEntry] = Field(default_factory=list)
@@ -0,0 +1,57 @@
1
+ """sqlglot parsing helpers for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import sqlglot
8
+ from sqlglot import exp
9
+
10
+ from .errors import QueryMapContractError, QueryMapParseError
11
+
12
+
13
# Immutable result of parse_statement.
@dataclass(frozen=True)
class ParsedStatement:
    # The single statement exactly as returned by sqlglot.
    statement: exp.Expression
    # The query used for mapping: the statement itself, or the inner query of
    # a CREATE / INSERT statement.
    root_expression: exp.Expression
    # Dialect name the SQL was parsed with.
    dialect: str
18
+
19
+
20
def parse_statement(sql: str, *, dialect: str) -> ParsedStatement:
    """Parse ``sql`` with sqlglot and require exactly one query-like statement.

    Args:
        sql: SQL text; surrounding whitespace is ignored.
        dialect: sqlglot dialect name passed as ``read``.

    Raises:
        QueryMapParseError: for blank input, for any exception raised while
            parsing, or when parsing yields no statements.
        QueryMapContractError: when more than one statement is present or the
            unwrapped root is not an ``exp.Query``.
    """
    text = (sql or "").strip()
    if not text:
        raise QueryMapParseError("SQL input is empty.")

    try:
        # sqlglot may return None entries; drop them.
        parsed_items = [item for item in sqlglot.parse(text, read=dialect) if item is not None]
    except Exception as exc:
        raise QueryMapParseError(f"Failed to parse SQL with dialect {dialect!r}: {exc}") from exc

    if not parsed_items:
        raise QueryMapParseError("SQL input produced no parseable statements.")
    if len(parsed_items) != 1:
        raise QueryMapContractError("catalogkit-query accepts exactly one SQL statement per invocation.")

    (only_statement,) = parsed_items
    query_root = _unwrap_root_expression(only_statement)
    if isinstance(query_root, exp.Query):
        return ParsedStatement(statement=only_statement, root_expression=query_root, dialect=dialect)
    raise QueryMapContractError(
        "catalogkit-query supports exactly one SELECT, INSERT ... SELECT, or CREATE ... AS SELECT statement per invocation."
    )
43
+
44
+
45
def _unwrap_root_expression(statement: exp.Expression) -> exp.Expression:
    """Return the expression that structure mapping should start from.

    For ``exp.Create`` and ``exp.Insert`` statements this is their
    ``"expression"`` argument when it is an expression; in every other case the
    statement itself is returned.
    """
    if isinstance(statement, (exp.Create, exp.Insert)):
        inner = statement.args.get("expression")
        if isinstance(inner, exp.Expression):
            return inner
    return statement
@@ -0,0 +1,210 @@
1
+ """Relation discovery for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from catalogkit.core import Node, cte_id, leaf_name, normalize_identifier, normalize_identifier_part, schema_name, table_id
9
+ from sqlglot import exp
10
+
11
+ from .ast_utils import (
12
+ cte_name,
13
+ has_join_ancestor,
14
+ iter_ctes,
15
+ iter_table_nodes,
16
+ iter_table_nodes_skipping_ctes,
17
+ qualified_table_name,
18
+ )
19
+ from .models import Relation
20
+
21
+ RelationUsageContext = Literal["from", "join", "cte_body"]
22
+
23
+
24
# Internal, immutable record of one relation reference found in the AST.
@dataclass(frozen=True)
class RelationUsage:
    # Id of the node the reference resolved to.
    relation_id: str
    # Stripped alias of the reference, or None when there is none.
    alias: str | None
    # "join" when a JOIN ancestor was found, otherwise "from" or "cte_body".
    context: RelationUsageContext
    # The table node rendered as SQL in the requested dialect.
    sql: str
30
+
31
+
32
# Everything extract_relations discovered, bundled for later stages.
@dataclass(frozen=True)
class RelationExtraction:
    # Table and CTE nodes sorted by id.
    nodes: list[Node]
    # Public Relation models sorted by id.
    relations: list[Relation]
    # De-duplicated usages in discovery order.
    relation_usages: list[RelationUsage]
    # CTE nodes keyed by normalized CTE name.
    cte_nodes: dict[str, Node]
    # Table nodes keyed by normalized qualified name.
    table_nodes: dict[str, Node]
    # Public Relation models keyed by relation id.
    relation_by_id: dict[str, Relation]
40
+
41
+
42
def extract_relations(root_expression: exp.Expression, *, dialect: str) -> RelationExtraction:
    """Discover table and CTE relations and record where each is referenced.

    CTE bodies are scanned first (context ``"join"`` or ``"cte_body"``), then
    the root query with CTE definitions skipped (context ``"join"`` or
    ``"from"``). Nodes and relations are returned sorted by id; usages are
    de-duplicated in discovery order.
    """
    cte_nodes = _extract_cte_nodes(root_expression)
    cte_relations = _extract_cte_relations(root_expression)
    table_nodes: dict[str, Node] = {}
    relation_by_id: dict[str, Relation] = {}
    for cte_relation in cte_relations.values():
        relation_by_id[cte_relation.id] = cte_relation
    found_usages: list[RelationUsage] = []

    def record_usage(
        table: exp.Table,
        *,
        scope: exp.Expression,
        fallback_context: RelationUsageContext,
    ) -> None:
        # Resolve the reference (registering new table nodes) and log the usage.
        node = _resolve_node(
            table,
            cte_nodes=cte_nodes,
            table_nodes=table_nodes,
            relation_by_id=relation_by_id,
            cte_relations=cte_relations,
        )
        if node is None:
            return
        context = "join" if has_join_ancestor(table, stop_node=scope) else fallback_context
        found_usages.append(
            _build_usage(
                relation_id=node.id,
                table=table,
                context=context,
                dialect=dialect,
            )
        )

    for cte in iter_ctes(root_expression):
        for table in iter_table_nodes(cte.this):
            record_usage(table, scope=cte.this, fallback_context="cte_body")

    for table in iter_table_nodes_skipping_ctes(root_expression):
        record_usage(table, scope=root_expression, fallback_context="from")

    all_nodes = [*table_nodes.values(), *cte_nodes.values()]
    return RelationExtraction(
        nodes=sorted(all_nodes, key=lambda node: node.id),
        relations=sorted(relation_by_id.values(), key=lambda relation: relation.id),
        relation_usages=_dedupe_usages(found_usages),
        cte_nodes=cte_nodes,
        table_nodes=table_nodes,
        relation_by_id=relation_by_id,
    )
101
+
102
+
103
def _extract_cte_nodes(root_expression: exp.Expression) -> dict[str, Node]:
    """Build one CTE ``Node`` per named CTE, keyed by normalized name.

    CTEs without a name are skipped; a later CTE with the same normalized name
    replaces an earlier one.
    """
    by_name: dict[str, Node] = {}
    for raw_name in map(cte_name, iter_ctes(root_expression)):
        if raw_name:
            key = normalize_identifier_part(raw_name)
            by_name[key] = Node(id=cte_id(raw_name), kind="cte", name=key)
    return by_name
117
+
118
+
119
def _extract_cte_relations(root_expression: exp.Expression) -> dict[str, Relation]:
    """Build one public CTE ``Relation`` per named CTE, keyed by normalized name.

    The relation keeps the raw CTE name; unnamed CTEs are skipped and a later
    CTE with the same normalized name replaces an earlier one.
    """
    by_name: dict[str, Relation] = {}
    for raw_name in map(cte_name, iter_ctes(root_expression)):
        if raw_name:
            key = normalize_identifier_part(raw_name)
            by_name[key] = Relation(id=cte_id(raw_name), kind="cte", name=raw_name)
    return by_name
132
+
133
+
134
def _resolve_node(
    table: exp.Table,
    *,
    cte_nodes: dict[str, Node],
    table_nodes: dict[str, Node],
    relation_by_id: dict[str, Relation],
    cte_relations: dict[str, Relation],
) -> Node | None:
    """Resolve a table reference to a CTE node or a (possibly new) table node.

    Only references without a db or catalog qualifier are matched against CTE
    names. A qualified reference such as ``raw.orders`` is always a table, so
    it is no longer mistaken for a CTE that shares its leaf name (as in
    ``WITH orders AS (SELECT * FROM raw.orders)``).

    Side effects: a table seen for the first time is added to ``table_nodes``
    and ``relation_by_id``; a CTE match refreshes its ``relation_by_id`` entry.

    Returns:
        The resolved node, or ``None`` when the reference has no name.
    """
    table_name = str(table.name or "").strip()
    if not table_name:
        return None

    raw_schema_name = _raw_schema_name(table)
    if raw_schema_name is None:
        # Unqualified names may refer to a CTE; CTEs win over tables.
        normalized_table_name = normalize_identifier_part(table_name)
        cte_node = cte_nodes.get(normalized_table_name)
        if cte_node is not None:
            relation_by_id[cte_node.id] = cte_relations[normalized_table_name]
            return cte_node

    qualified_name = qualified_table_name(table)
    normalized_qualified_name = normalize_identifier(qualified_name)
    node = table_nodes.get(normalized_qualified_name)
    if node is None:
        node = Node(
            id=table_id(qualified_name),
            kind="table",
            name=leaf_name(qualified_name),
            qualified_name=normalized_qualified_name,
            schema=schema_name(qualified_name),
        )
        table_nodes[normalized_qualified_name] = node
        relation_by_id[node.id] = Relation(
            id=node.id,
            kind="table",
            name=table_name,
            qualified_name=qualified_name,
            schema=raw_schema_name,
        )
    return node
173
+
174
+
175
def _build_usage(
    *,
    relation_id: str,
    table: exp.Table,
    context: RelationUsageContext,
    dialect: str,
) -> RelationUsage:
    """Create the usage record for one table reference.

    The alias is stripped and stored as ``None`` when empty; ``sql`` is the
    table node rendered in ``dialect``.
    """
    alias_text = str(table.alias or "").strip()
    return RelationUsage(
        relation_id=relation_id,
        alias=alias_text if alias_text else None,
        context=context,
        sql=table.sql(dialect=dialect),
    )
188
+
189
+
190
+ def _dedupe_usages(usages: list[RelationUsage]) -> list[RelationUsage]:
191
+ seen: set[tuple[str, str | None, str, str]] = set()
192
+ deduped: list[RelationUsage] = []
193
+ for usage in usages:
194
+ key = (usage.relation_id, usage.alias, usage.context, usage.sql)
195
+ if key in seen:
196
+ continue
197
+ seen.add(key)
198
+ deduped.append(usage)
199
+ return deduped
200
+
201
+
202
+ def _raw_schema_name(table: exp.Table) -> str | None:
203
+ parts = [
204
+ str(part).strip()
205
+ for part in (table.catalog, table.db)
206
+ if str(part or "").strip()
207
+ ]
208
+ if not parts:
209
+ return None
210
+ return ".".join(parts)
@@ -0,0 +1 @@
1
+ """Renderers for catalogkit-query."""
@@ -0,0 +1,10 @@
1
+ """JSON renderer for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..models import QueryMap
6
+
7
+
8
def render_json(query_map: QueryMap) -> dict:
    """Dump ``query_map`` to a JSON-compatible dict using field aliases."""
    payload = query_map.model_dump(mode="json", by_alias=True)
    return payload
@@ -0,0 +1,42 @@
1
+ """Text renderer for catalogkit-query."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..models import QueryMap
6
+
7
+
8
def render_text(query_map: QueryMap) -> str:
    """Format ``query_map`` as a plain-text report.

    Sections appear in a fixed order: header and summary, relations, relation
    usages, dependencies, and -- only when warnings exist -- warnings. Lines
    are joined with newlines and no trailing newline is added.
    """
    summary = query_map.summary
    report: list[str] = [
        "catalogkit-query",
        f"dialect: {summary.dialect}",
        f"statement_type: {summary.statement_type}",
        f"has_ctes: {summary.has_ctes}",
        "",
        "relations:",
    ]
    # Prefer the qualified name; fall back to the bare name.
    report.extend(
        f" - [{relation.kind}] {relation.qualified_name or relation.name}"
        for relation in query_map.relations
    )

    report += ["", "relation_usages:"]
    for usage in query_map.relation_usages:
        alias_suffix = f" alias={usage.alias}" if usage.alias else ""
        report.append(f" - {usage.context}: {usage.relation_id}{alias_suffix} :: {usage.sql}")

    report += ["", "dependencies:"]
    report.extend(f" - {edge.source_id} -> {edge.target_id}" for edge in query_map.edges)

    if query_map.warnings:
        report += ["", "warnings:"]
        for warning in query_map.warnings:
            location_suffix = f" [{warning.location}]" if warning.location else ""
            report.append(f" - {warning.code}: {warning.message}{location_suffix}")

    return "\n".join(report)
@@ -0,0 +1,104 @@
1
+ Metadata-Version: 2.4
2
+ Name: catalogkit-query
3
+ Version: 0.1.0
4
+ Summary: Map one SQL statement into deterministic CatalogKit dependencies.
5
+ Author: ClearMetric Labs
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/Clearmetric-Labs/CatalogKit
8
+ Project-URL: Source, https://github.com/Clearmetric-Labs/CatalogKit
9
+ Project-URL: Issues, https://github.com/Clearmetric-Labs/CatalogKit/issues
10
+ Keywords: sql,lineage,dependencies,ctes,sqlglot
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Database
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: catalogkit-core>=0.1.0
24
+ Requires-Dist: sqlglot>=25.0.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
27
+ Provides-Extra: release
28
+ Requires-Dist: build>=1.2.2; extra == "release"
29
+ Requires-Dist: twine>=5.1.1; extra == "release"
30
+
31
+ # catalogkit-query
32
+
33
+ `catalogkit-query` maps one supported SQL statement into a deterministic `QueryMap`
34
+ artifact so you can answer "what feeds what in this query?" fast.
35
+
36
+ It is a narrow static-analysis tool:
37
+
38
+ - input: exactly one SQL statement from one SQL file
39
+ - output: canonical relations, relation usages, dependency edges, and warnings
40
+ - no warehouse credentials
41
+ - no dbt project
42
+ - no AI key
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ python -m pip install catalogkit-query
48
+ ```
49
+
50
+ ## Imports
51
+
52
+ ```python
53
+ from catalogkit.query import build_catalog_artifact, build_query_map
54
+ ```
55
+
56
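+ For local development, install the sibling `catalogkit-core` checkout first, then this package in editable mode with its extras: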
57
+
58
+ ```bash
59
+ python -m pip install -e ../catalogkit-core
60
+ python -m pip install -e ".[dev,release]"
61
+ ```
62
+
63
+ ## Quickstart
64
+
65
+ ```bash
66
+ catalogkit-query --dialect postgres ./examples/ugly_real_world.sql
67
+ catalogkit-query --dialect postgres --format json ./examples/ugly_real_world.sql
68
+ ```
69
+
70
+ ## Output Contract
71
+
72
+ The public `QueryMap` artifact keeps a stable shape with these top-level sections:
73
+
74
+ - `summary`
75
+ - `relations`
76
+ - `relation_usages`
77
+ - `edges`
78
+ - `outputs`
79
+ - `warnings`
80
+
81
+ For CatalogKit composition, the package also exposes `build_catalog_artifact`,
82
+ a shared `CatalogArtifact` builder backed by `catalogkit-core`.
83
+
84
+ The shared core artifact contains:
85
+
86
+ - `version`
87
+ - `nodes`
88
+ - `edges`
89
+ - `warnings`
90
+
91
+ ## Supported Statements
92
+
93
+ `catalogkit-query` accepts exactly one supported statement per invocation:
94
+
95
+ - `SELECT ...`
96
+ - `INSERT ... SELECT ...`
97
+ - `CREATE ... AS SELECT ...`
98
+
99
+ Any other statement shape fails loudly: it is rejected with an error rather than silently accepted.
100
+
101
+ ## Contract Docs
102
+
103
+ - [`../catalogkit-core/docs/contract.md`](../catalogkit-core/docs/contract.md)
104
+ - [`docs/limitations.md`](docs/limitations.md)
@@ -0,0 +1,20 @@
1
+ catalogkit/query/__init__.py,sha256=6qW-plZ1StWh0O2Q_FxnyuOgGroUPxixIOIGdUzQSKo,873
2
+ catalogkit/query/__main__.py,sha256=iIrnnBjgpYVZD63t9ru8KzuP4CGDw6Cz_0k4UI962PI,175
3
+ catalogkit/query/_version.py,sha256=VDsBswzfU7gNsBkIaxxV6tsi0ixlms3WjN7HYgZJZ2g,46
4
+ catalogkit/query/api.py,sha256=xklIf3019k_StOzHqZhRlRVgngEgMJ224Auw0KcRS-M,969
5
+ catalogkit/query/ast_utils.py,sha256=ZUxja752oU6gVJ29hDd9hOFU-YDvGhwYb7s5J6e80Z4,2272
6
+ catalogkit/query/build.py,sha256=2C_XlIhb1kBPcYZCmdgVIB35ROCq9eq8QHCQEw05ZKI,8733
7
+ catalogkit/query/cli.py,sha256=JchrYitnOPFR-z7iVmOwh4H4wN-RY_73G5SA_6morVM,1692
8
+ catalogkit/query/ctes.py,sha256=a7CgBz7Jfs8VOBMPsI_TIP3qSfkjjLcV_H2833KETqw,3687
9
+ catalogkit/query/errors.py,sha256=9WFMZGViim10tDJ9sYDzb-_5TSr0ZLDnaDJg803SSB0,408
10
+ catalogkit/query/models.py,sha256=2BLSWPai196SAe_g4GeAuVq8m_JvMB2NzvTkYPEytzY,2272
11
+ catalogkit/query/parser.py,sha256=NEAMWtZGPEIXY8zBb-YvufJd1ZTEjfdLyrzEWSAJouQ,2016
12
+ catalogkit/query/relations.py,sha256=1egUNMsikbRWJf5WX7SO_QLXBQze46z4iOvZsIAg40E,6359
13
+ catalogkit/query/render/__init__.py,sha256=syO-eCIlCdZ1a_xkesnnpdjUQaOt9HUsutl5IAI6tDw,38
14
+ catalogkit/query/render/json.py,sha256=V_0VJltBB88gNxdN_ykkkxMB6vHQWJrLC-eQgY-sszY,293
15
+ catalogkit/query/render/text.py,sha256=fev3tMlo_ri8e0yk-jFKnvNx90c_z-tD3IFjIIwKbFo,1426
16
+ catalogkit_query-0.1.0.dist-info/METADATA,sha256=z_H6V1vM0O1hehzYAM9hDnWaXk0SUGciKg9rfChOAdc,2847
17
+ catalogkit_query-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
18
+ catalogkit_query-0.1.0.dist-info/entry_points.txt,sha256=p9fIIzi20wQMw9wVYJpS8L2w0Pl-HT07ga8MHY5joB4,63
19
+ catalogkit_query-0.1.0.dist-info/top_level.txt,sha256=RsnUdiXrSqkJn7elbvcSi3dAxxLtc6rLuzPGHGR_44I,11
20
+ catalogkit_query-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ catalogkit-query = catalogkit.query.cli:main
@@ -0,0 +1 @@
1
+ catalogkit