catalogkit-query 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- catalogkit/query/__init__.py +39 -0
- catalogkit/query/__main__.py +9 -0
- catalogkit/query/_version.py +3 -0
- catalogkit/query/api.py +39 -0
- catalogkit/query/ast_utils.py +72 -0
- catalogkit/query/build.py +249 -0
- catalogkit/query/cli.py +61 -0
- catalogkit/query/ctes.py +111 -0
- catalogkit/query/errors.py +15 -0
- catalogkit/query/models.py +93 -0
- catalogkit/query/parser.py +57 -0
- catalogkit/query/relations.py +210 -0
- catalogkit/query/render/__init__.py +1 -0
- catalogkit/query/render/json.py +10 -0
- catalogkit/query/render/text.py +42 -0
- catalogkit_query-0.1.0.dist-info/METADATA +104 -0
- catalogkit_query-0.1.0.dist-info/RECORD +20 -0
- catalogkit_query-0.1.0.dist-info/WHEEL +5 -0
- catalogkit_query-0.1.0.dist-info/entry_points.txt +2 -0
- catalogkit_query-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Public package surface for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from catalogkit.core import CatalogArtifact
|
|
6
|
+
|
|
7
|
+
from ._version import __version__
|
|
8
|
+
from .api import build_catalog_artifact, build_query_map, render_json, render_text
|
|
9
|
+
from .errors import QueryMapContractError, QueryMapError, QueryMapParseError
|
|
10
|
+
from .models import (
|
|
11
|
+
OutputColumn,
|
|
12
|
+
OutputSourceHint,
|
|
13
|
+
QueryMap,
|
|
14
|
+
QuerySummary,
|
|
15
|
+
Relation,
|
|
16
|
+
RelationEdge,
|
|
17
|
+
RelationUsage,
|
|
18
|
+
WarningEntry,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"__version__",
|
|
23
|
+
"build_catalog_artifact",
|
|
24
|
+
"build_query_map",
|
|
25
|
+
"CatalogArtifact",
|
|
26
|
+
"OutputColumn",
|
|
27
|
+
"OutputSourceHint",
|
|
28
|
+
"QueryMap",
|
|
29
|
+
"QuerySummary",
|
|
30
|
+
"QueryMapContractError",
|
|
31
|
+
"QueryMapError",
|
|
32
|
+
"QueryMapParseError",
|
|
33
|
+
"Relation",
|
|
34
|
+
"RelationEdge",
|
|
35
|
+
"RelationUsage",
|
|
36
|
+
"render_json",
|
|
37
|
+
"render_text",
|
|
38
|
+
"WarningEntry",
|
|
39
|
+
]
|
catalogkit/query/api.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Public API for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from catalogkit.core import CatalogArtifact
|
|
6
|
+
|
|
7
|
+
from .build import build_catalog_artifact_from_parsed, build_query_map_from_parsed
|
|
8
|
+
from .models import QueryMap
|
|
9
|
+
from .parser import parse_statement
|
|
10
|
+
from .render.json import render_json
|
|
11
|
+
from .render.text import render_text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_query_map(sql: str, *, dialect: str) -> QueryMap:
    """Parse one SQL statement and return its public ``QueryMap`` artifact.

    Args:
        sql: Text of the SQL statement to map.
        dialect: Dialect name forwarded to ``parse_statement``.

    Returns:
        Whatever ``build_query_map_from_parsed`` produces for the parsed
        statement.
    """
    return build_query_map_from_parsed(parse_statement(sql, dialect=dialect))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_catalog_artifact(sql: str, *, dialect: str) -> CatalogArtifact:
    """Parse one SQL statement and return the shared ``CatalogArtifact``.

    Args:
        sql: Text of the SQL statement to map.
        dialect: Dialect name forwarded to ``parse_statement``.

    Returns:
        Whatever ``build_catalog_artifact_from_parsed`` produces for the
        parsed statement.
    """
    return build_catalog_artifact_from_parsed(parse_statement(sql, dialect=dialect))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"build_catalog_artifact",
|
|
36
|
+
"build_query_map",
|
|
37
|
+
"render_json",
|
|
38
|
+
"render_text",
|
|
39
|
+
]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Shared sqlglot AST helpers for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
|
|
7
|
+
from sqlglot import exp
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def iter_ctes(root_expression: exp.Expression) -> Iterable[exp.CTE]:
    """Lazily produce each ``exp.CTE`` node found beneath ``root_expression``."""
    for cte in root_expression.find_all(exp.CTE):
        yield cte
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cte_name(cte: exp.CTE) -> str:
    """Return the CTE's alias as text.

    A string alias is returned exactly as stored (no stripping). Any other
    value is converted with ``str`` and stripped, and a falsy alias becomes
    the empty string.
    """
    alias = cte.alias
    return alias if isinstance(alias, str) else str(alias or "").strip()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def qualified_table_name(table: exp.Table) -> str:
    """Join the table's catalog, db and name with dots, omitting blank parts.

    Each component is stripped of surrounding whitespace; a component that is
    falsy or whitespace-only is left out entirely.
    """
    segments: list[str] = []
    for component in (table.catalog, table.db, table.name):
        if not str(component or "").strip():
            continue
        segments.append(str(component).strip())
    return ".".join(segments)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def has_join_ancestor(node: exp.Expression, *, stop_node: exp.Expression) -> bool:
    """Report whether an ``exp.Join`` lies between ``node`` and ``stop_node``.

    The parent chain is followed upward from ``node``. The search ends
    without a match when the chain runs out or reaches ``stop_node``;
    ``stop_node`` itself is never inspected.
    """
    ancestor = node.parent
    while True:
        if ancestor is None or ancestor is stop_node:
            return False
        if isinstance(ancestor, exp.Join):
            return True
        ancestor = ancestor.parent
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def iter_table_nodes(expression: exp.Expression | None) -> Iterable[exp.Table]:
    """Produce every ``exp.Table`` found under ``expression``.

    Nothing is produced when ``expression`` is ``None``.
    """
    if expression is not None:
        yield from expression.find_all(exp.Table)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def iter_table_nodes_skipping_ctes(
    expression: exp.Expression | None,
) -> Iterable[exp.Table]:
    """Produce table nodes under ``expression`` without entering CTE definitions.

    Nothing is produced when ``expression`` is ``None``; otherwise the walk is
    delegated to ``_walk_tables`` with ``skip_ctes=True``.
    """
    if expression is not None:
        for table in _walk_tables(expression, skip_ctes=True):
            yield table
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _walk_tables(expression: exp.Expression, *, skip_ctes: bool) -> Iterable[exp.Table]:
    """Walk ``expression`` depth-first and yield the ``exp.Table`` nodes found.

    A table node is yielded and not descended into. When ``skip_ctes`` is
    true, an ``exp.CTE`` node is skipped together with everything beneath it.
    Children are visited in the order they appear in each node's ``args``.
    """
    pending = [expression]
    while pending:
        current = pending.pop()
        if skip_ctes and isinstance(current, exp.CTE):
            continue
        if isinstance(current, exp.Table):
            yield current
            continue
        children = []
        for value in current.args.values():
            if isinstance(value, list):
                children.extend(item for item in value if isinstance(item, exp.Expression))
            elif isinstance(value, exp.Expression):
                children.append(value)
        # Push in reverse so the first child is popped (visited) first.
        pending.extend(reversed(children))
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Artifact assembly for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, cast
|
|
6
|
+
|
|
7
|
+
from catalogkit.core import CatalogArtifact, Edge, Evidence, Warning, merge
|
|
8
|
+
from sqlglot import exp
|
|
9
|
+
|
|
10
|
+
from .ctes import extract_dependency_edges
|
|
11
|
+
from .errors import QueryMapContractError
|
|
12
|
+
from .models import (
|
|
13
|
+
EdgeKind as QueryMapEdgeKind,
|
|
14
|
+
QueryMap,
|
|
15
|
+
QuerySummary,
|
|
16
|
+
RelationEdge,
|
|
17
|
+
RelationUsage,
|
|
18
|
+
WarningCode,
|
|
19
|
+
WarningEntry,
|
|
20
|
+
)
|
|
21
|
+
from .parser import ParsedStatement
|
|
22
|
+
from .relations import RelationExtraction, extract_relations
|
|
23
|
+
|
|
24
|
+
# Edge kinds that may appear in the public QueryMap contract. Checked by
# _relation_edge_from_catalog_edge, which raises QueryMapContractError for
# any other kind.
_QUERYMAP_EDGE_KINDS = {"depends_on", "joins"}
# Warning codes that may appear in the public QueryMap contract. Checked by
# _warning_entry_from_warning, which raises QueryMapContractError for any
# other code.
_QUERYMAP_WARNING_CODES = {
    "parse_recovered",
    "select_star",
    "table_star",
    "ambiguous_output_source",
    "unresolved_output_source",
    "non_equi_join",
    "unsupported_construct",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def build_query_map_from_parsed(parsed: ParsedStatement) -> QueryMap:
    """Assemble the public ``QueryMap`` for an already-parsed statement.

    The shared catalog artifact is built first; its edges and warnings are
    then converted to the public models and de-duplicated. ``outputs`` is
    always empty and ``output_count`` is always ``0`` here, and each usage's
    ``normalized_sql`` is set to the same text as its ``sql``.
    """
    artifact, relation_extraction = _build_catalog_artifact_from_parsed(parsed)

    relation_edges = [_relation_edge_from_catalog_edge(item) for item in artifact.edges]
    warning_entries = [_warning_entry_from_warning(item) for item in artifact.warnings]

    cte_nodes = relation_extraction.cte_nodes
    summary = QuerySummary(
        dialect=parsed.dialect,
        statement_type=parsed.statement.key.lower(),
        has_ctes=bool(cte_nodes),
        relation_count=len(relation_extraction.relations),
        cte_count=len(cte_nodes),
        output_count=0,
    )

    usages = []
    for usage in relation_extraction.relation_usages:
        usages.append(
            RelationUsage(
                relation_id=usage.relation_id,
                alias=usage.alias,
                context=usage.context,
                sql=usage.sql,
                normalized_sql=usage.sql,
            )
        )

    return QueryMap(
        version=artifact.version,
        summary=summary,
        relations=relation_extraction.relations,
        relation_usages=usages,
        edges=_dedupe_query_map_edges(relation_edges),
        outputs=[],
        warnings=_dedupe_warnings(warning_entries),
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def build_catalog_artifact_from_parsed(parsed: ParsedStatement) -> CatalogArtifact:
    """Return only the catalog artifact for an already-parsed statement.

    The relation extraction that the internal builder also returns is
    discarded.
    """
    return _build_catalog_artifact_from_parsed(parsed)[0]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _build_catalog_artifact_from_parsed(
    parsed: ParsedStatement,
) -> tuple[CatalogArtifact, RelationExtraction]:
    """Build the merged catalog artifact and return it with the relation extraction.

    Raises:
        QueryMapContractError: If relation extraction found no nodes.
    """
    root = parsed.root_expression
    dialect = parsed.dialect

    relation_extraction = extract_relations(root, dialect=dialect)
    dependency_extraction = extract_dependency_edges(
        root,
        relation_extraction=relation_extraction,
        dialect=dialect,
    )

    if not relation_extraction.nodes:
        raise QueryMapContractError("No tables or CTE relations were found in the SQL statement.")

    # Work on deep copies so attaching evidence leaves the extraction's own
    # node objects unmodified.
    nodes_by_id = {}
    for original in relation_extraction.nodes:
        nodes_by_id[original.id] = original.model_copy(deep=True)

    for usage in relation_extraction.relation_usages:
        evidence = Evidence(
            location=usage.context,
            expression=usage.sql,
            confidence="high",
        )
        nodes_by_id[usage.relation_id].evidence.append(evidence)

    ordered_nodes = sorted(nodes_by_id.values(), key=lambda node: node.id)
    combined_warnings = [
        *dependency_extraction.warnings,
        *_extract_contract_warnings(root, dialect=dialect),
    ]
    artifact = CatalogArtifact(
        nodes=ordered_nodes,
        edges=dependency_extraction.edges,
        warnings=combined_warnings,
    )
    return merge(artifact), relation_extraction
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _extract_contract_warnings(
    root_expression: Any,
    *,
    dialect: str,
) -> list[Warning]:
    """Collect warnings for constructs the current contract only partly models.

    Warnings are emitted in this order: the first UNION, INTERSECT and EXCEPT
    node found (at most one each), then every ``*`` / ``table.*`` projection,
    then every JOIN that has neither ON nor USING or whose ON clause is not an
    equality join. Each warning's ``location`` is the offending node rendered
    as SQL in ``dialect``.
    """
    collected: list[Warning] = []

    def note(code: str, message: str, node: Any) -> None:
        collected.append(
            Warning(code=code, message=message, location=node.sql(dialect=dialect))
        )

    set_operation_messages = (
        (exp.Union, "UNION queries are mapped at the relation level only in the MVP."),
        (exp.Intersect, "INTERSECT queries are mapped at the relation level only in the MVP."),
        (exp.Except, "EXCEPT queries are mapped at the relation level only in the MVP."),
    )
    for node_type, message in set_operation_messages:
        found = root_expression.find(node_type)
        if found is not None:
            note("unsupported_construct", message, found)

    for select in root_expression.find_all(exp.Select):
        for projection in select.expressions:
            if isinstance(projection, exp.Star):
                note(
                    "select_star",
                    "SELECT * was detected; output mapping is deferred in the MVP.",
                    projection,
                )
                continue
            if isinstance(projection, exp.Column) and str(projection.name or "").strip() == "*":
                note(
                    "table_star",
                    "table.* was detected; output mapping is deferred in the MVP.",
                    projection,
                )

    for join in root_expression.find_all(exp.Join):
        on_clause = join.args.get("on")
        if on_clause is None:
            # A USING clause alone is accepted without any warning.
            if join.args.get("using") is None:
                note(
                    "unsupported_construct",
                    "JOIN without ON/USING is not modeled beyond relation dependency mapping.",
                    join,
                )
            continue
        if not _is_equality_join(on_clause):
            note(
                "non_equi_join",
                "Non-equality join detected; MVP preserves relation dependencies but does not model join semantics.",
                join,
            )

    return collected
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _is_equality_join(expression: Any) -> bool:
    """Report whether a join predicate consists solely of equality comparisons.

    Returns ``True`` for a single ``exp.EQ`` or for an ``exp.And`` whose two
    operands are themselves equality joins, and ``False`` for anything else.
    Surrounding parentheses are ignored, so ``ON (a.id = b.id)`` is treated
    the same as ``ON a.id = b.id``.
    """
    # Redundant grouping is parsed as exp.Paren around the real predicate;
    # unwrap it so it is not misreported as a non-equality join.
    while isinstance(expression, exp.Paren):
        expression = expression.this
    if isinstance(expression, exp.EQ):
        return True
    if isinstance(expression, exp.And):
        return _is_equality_join(expression.left) and _is_equality_join(expression.right)
    return False
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _relation_edge_from_catalog_edge(edge: Edge) -> RelationEdge:
    """Convert a catalog ``Edge`` into the public ``RelationEdge`` model.

    Both ``sql`` and ``normalized_sql`` receive the expression of the edge's
    first evidence item, or ``None`` when the edge carries no evidence.

    Raises:
        QueryMapContractError: If the edge kind is not in
            ``_QUERYMAP_EDGE_KINDS``.
    """
    kind = edge.kind
    if kind not in _QUERYMAP_EDGE_KINDS:
        raise QueryMapContractError(
            f"catalogkit-query cannot emit unsupported edge kind {edge.kind!r} in its public contract."
        )

    evidence_sql = None
    if edge.evidence:
        evidence_sql = edge.evidence[0].expression

    return RelationEdge(
        kind=cast(QueryMapEdgeKind, kind),
        source_id=edge.source_id,
        target_id=edge.target_id,
        label=edge.label,
        confidence=edge.confidence,
        sql=evidence_sql,
        normalized_sql=evidence_sql,
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _warning_entry_from_warning(warning: Warning) -> WarningEntry:
    """Convert a catalog ``Warning`` into the public ``WarningEntry`` model.

    Raises:
        QueryMapContractError: If the warning code is not in
            ``_QUERYMAP_WARNING_CODES``.
    """
    code = warning.code
    if code in _QUERYMAP_WARNING_CODES:
        return WarningEntry(
            code=cast(WarningCode, code),
            message=warning.message,
            location=warning.location,
        )
    raise QueryMapContractError(
        f"catalogkit-query cannot emit unsupported warning code {warning.code!r} in its public contract."
    )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _dedupe_query_map_edges(edges: list[RelationEdge]) -> list[RelationEdge]:
|
|
229
|
+
seen: set[tuple[str, str, str | None]] = set()
|
|
230
|
+
deduped: list[RelationEdge] = []
|
|
231
|
+
for edge in edges:
|
|
232
|
+
key = (edge.source_id, edge.target_id, edge.sql)
|
|
233
|
+
if key in seen:
|
|
234
|
+
continue
|
|
235
|
+
seen.add(key)
|
|
236
|
+
deduped.append(edge)
|
|
237
|
+
return deduped
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _dedupe_warnings(warnings: list[WarningEntry]) -> list[WarningEntry]:
|
|
241
|
+
seen: set[tuple[str, str, str | None]] = set()
|
|
242
|
+
deduped: list[WarningEntry] = []
|
|
243
|
+
for warning in warnings:
|
|
244
|
+
key = (warning.code, warning.message, warning.location)
|
|
245
|
+
if key in seen:
|
|
246
|
+
continue
|
|
247
|
+
seen.add(key)
|
|
248
|
+
deduped.append(warning)
|
|
249
|
+
return deduped
|
catalogkit/query/cli.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""CLI entrypoint for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from . import __version__
|
|
11
|
+
from .api import build_query_map, render_json, render_text
|
|
12
|
+
from .errors import QueryMapError
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the ``catalogkit-query`` command.

    The parser takes a positional ``sql_file``, a required ``--dialect``, an
    optional ``--format`` (``text`` or ``json``, default ``text``) and a
    ``--version`` flag.
    """
    cli = argparse.ArgumentParser(
        prog="catalogkit-query",
        description="Map one supported SQL statement into deterministic catalog dependencies.",
    )
    # Declared in the order they are registered, which argparse also uses for help.
    argument_specs = (
        (
            "sql_file",
            {"help": "Path to a UTF-8 SQL file containing exactly one supported statement."},
        ),
        (
            "--dialect",
            {
                "required": True,
                "help": "sqlglot dialect name, for example postgres, snowflake, tsql, or bigquery.",
            },
        ),
        (
            "--format",
            {
                "choices": ("text", "json"),
                "default": "text",
                "help": "Renderer output format.",
            },
        ),
        (
            "--version",
            {"action": "version", "version": f"%(prog)s {__version__}"},
        ),
    )
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)
    return cli
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``catalogkit-query`` command line interface.

    Reads the SQL file as UTF-8, builds the query map and prints it in the
    requested format.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` after the rendered artifact has been printed, or ``1`` after an
        error message has been written to stderr.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        sql = Path(args.sql_file).read_text(encoding="utf-8")
        artifact = build_query_map(sql, dialect=args.dialect)
        if args.format == "json":
            print(json.dumps(render_json(artifact), indent=2, sort_keys=False))
        else:
            print(render_text(artifact))
        return 0
    except (OSError, UnicodeDecodeError, QueryMapError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so a file that
        # is not valid UTF-8 must be listed explicitly to get the same clean
        # error message instead of a traceback.
        print(f"catalogkit-query error: {exc}", file=sys.stderr)
        return 1
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
if __name__ == "__main__":
|
|
61
|
+
raise SystemExit(main())
|
catalogkit/query/ctes.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Dependency edge extraction for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
from catalogkit.core import Edge, Evidence, Warning, cte_id, normalize_identifier, normalize_identifier_part
|
|
8
|
+
from sqlglot import exp
|
|
9
|
+
|
|
10
|
+
from .ast_utils import cte_name, iter_ctes, iter_table_nodes, iter_table_nodes_skipping_ctes, qualified_table_name
|
|
11
|
+
from .errors import QueryMapContractError
|
|
12
|
+
from .relations import RelationExtraction
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True)
class DependencyExtraction:
    """Result of ``extract_dependency_edges``: the edges plus any warnings."""

    # "depends_on" edges from each CTE and from "query:root" to the relations
    # they reference.
    edges: list[Edge]
    # Warnings raised during extraction (self-referencing CTEs).
    warnings: list[Warning]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def extract_dependency_edges(
    root_expression: exp.Expression,
    *,
    relation_extraction: RelationExtraction,
    dialect: str,
) -> DependencyExtraction:
    """Build ``depends_on`` edges for every CTE body and for the root query.

    CTE edges come first, in traversal order, followed by edges whose source
    is ``"query:root"``. A table reference inside a CTE that resolves to that
    same CTE yields an ``unsupported_construct`` warning instead of an edge.
    References for which ``_resolve_relation_id`` returns ``None`` are
    skipped.
    """
    dependency_edges: list[Edge] = []
    recursion_warnings: list[Warning] = []

    def depends_on(source_id: str, target_id: str, location: str, table: exp.Table) -> Edge:
        return Edge(
            kind="depends_on",
            source_id=source_id,
            target_id=target_id,
            label="depends_on",
            confidence="high",
            evidence=[
                Evidence(
                    location=location,
                    expression=table.sql(dialect=dialect),
                    confidence="high",
                )
            ],
        )

    for cte in iter_ctes(root_expression):
        raw_cte_name = cte_name(cte)
        if not raw_cte_name:
            continue
        owner_id = cte_id(raw_cte_name)
        for table in iter_table_nodes(cte.this):
            resolved_id = _resolve_relation_id(table, relation_extraction=relation_extraction)
            if resolved_id is None:
                continue
            if resolved_id != owner_id:
                dependency_edges.append(depends_on(owner_id, resolved_id, "cte_body", table))
                continue
            # The CTE references itself: warn rather than emit a self-edge.
            recursion_warnings.append(
                Warning(
                    code="unsupported_construct",
                    message=f"Recursive CTE {normalize_identifier_part(raw_cte_name)!r} is not modeled explicitly in the MVP.",
                    location=table.sql(dialect=dialect),
                )
            )

    for table in iter_table_nodes_skipping_ctes(root_expression):
        resolved_id = _resolve_relation_id(table, relation_extraction=relation_extraction)
        if resolved_id is not None:
            dependency_edges.append(depends_on("query:root", resolved_id, "root_query", table))

    return DependencyExtraction(edges=dependency_edges, warnings=recursion_warnings)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _resolve_relation_id(
    table: exp.Table,
    *,
    relation_extraction: RelationExtraction,
) -> str | None:
    """Look up the relation id for a table reference.

    The bare table name is tried against the extracted CTE nodes first; if
    there is no CTE of that name, the dotted qualified name is tried against
    the extracted table nodes.

    Returns:
        The matching node id, or ``None`` when the reference has no name.

    Raises:
        QueryMapContractError: If the reference matches neither a CTE nor a
            table node.
    """
    bare_name = str(table.name or "").strip()
    if not bare_name:
        return None

    matching_cte = relation_extraction.cte_nodes.get(normalize_identifier_part(bare_name))
    if matching_cte is not None:
        return matching_cte.id

    qualified_name = qualified_table_name(table)
    matching_table = relation_extraction.table_nodes.get(normalize_identifier(qualified_name))
    if matching_table is None:
        raise QueryMapContractError(
            f"Unresolved relation id for table reference {qualified_name!r}."
        )
    return matching_table.id
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Package-specific errors for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class QueryMapError(Exception):
    """Root of the catalogkit-query exception hierarchy."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QueryMapParseError(QueryMapError):
    """Signals that the SQL text could not be parsed into a supported AST."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class QueryMapContractError(QueryMapError):
    """Signals that parsed SQL cannot be expressed within the current contract."""
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Public artifact models for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
8
|
+
|
|
9
|
+
# Allowed values for the `confidence` fields on the models below.
Confidence = Literal["high", "medium", "low"]
# Allowed values for `Relation.kind`.
RelationKind = Literal["table", "cte"]
# Allowed values for `RelationUsage.context`.
RelationUsageContext = Literal["from", "join", "cte_body"]
# Allowed values for `RelationEdge.kind`.
EdgeKind = Literal["depends_on", "joins"]
# Allowed values for `WarningEntry.code`.
WarningCode = Literal[
    "parse_recovered",
    "select_star",
    "table_star",
    "ambiguous_output_source",
    "unresolved_output_source",
    "non_equi_join",
    "unsupported_construct",
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class QuerySummary(BaseModel):
    # Headline facts about one mapped statement.
    dialect: str
    statement_type: str
    has_ctes: bool
    relation_count: int
    cte_count: int
    # Optional; defaults to 0 when not supplied.
    output_count: int = 0
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Relation(BaseModel):
    # One relation in the query map; `kind` is "table" or "cte".
    # populate_by_name lets callers pass `schema_name=` as well as the
    # `schema` alias declared below.
    model_config = ConfigDict(populate_by_name=True)

    id: str
    kind: RelationKind
    name: str
    qualified_name: str | None = None
    # Accepted and serialized under the key "schema" via the aliases.
    schema_name: str | None = Field(
        default=None,
        alias="schema",
        serialization_alias="schema",
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class RelationUsage(BaseModel):
    # One reference to a relation; `context` is "from", "join" or "cte_body".
    relation_id: str
    alias: str | None = None
    context: RelationUsageContext
    sql: str
    normalized_sql: str | None = None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class RelationEdge(BaseModel):
    # A directed edge between two relation ids; `kind` is "depends_on" or "joins".
    kind: EdgeKind
    source_id: str
    target_id: str
    label: str | None = None
    # Defaults to "high" when not supplied.
    confidence: Confidence = "high"
    sql: str | None = None
    normalized_sql: str | None = None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class OutputSourceHint(BaseModel):
    # A candidate source for an output column; every field is optional.
    relation_id: str | None = None
    column_name: str | None = None
    # Defaults to "medium" when not supplied.
    confidence: Confidence = "medium"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class OutputColumn(BaseModel):
    # One output column of the mapped statement.
    name: str
    ordinal: int
    expression_sql: str
    normalized_expression_sql: str | None = None
    inferred: bool = False
    # default_factory gives each instance its own empty list.
    sources: list[OutputSourceHint] = Field(default_factory=list)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class WarningEntry(BaseModel):
    # One warning in the public artifact; `code` is restricted to WarningCode.
    code: WarningCode
    message: str
    location: str | None = None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class QueryMap(BaseModel):
    # Top-level public artifact. Only `summary` is required; every list field
    # defaults to a fresh empty list.
    version: str = "1"
    summary: QuerySummary
    relations: list[Relation] = Field(default_factory=list)
    relation_usages: list[RelationUsage] = Field(default_factory=list)
    edges: list[RelationEdge] = Field(default_factory=list)
    outputs: list[OutputColumn] = Field(default_factory=list)
    warnings: list[WarningEntry] = Field(default_factory=list)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""sqlglot parsing helpers for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import sqlglot
|
|
8
|
+
from sqlglot import exp
|
|
9
|
+
|
|
10
|
+
from .errors import QueryMapContractError, QueryMapParseError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True)
class ParsedStatement:
    """Immutable result of ``parse_statement``."""

    # The single statement exactly as sqlglot parsed it.
    statement: exp.Expression
    # The query expression used for mapping: the inner ``expression`` of a
    # CREATE or INSERT when one is present, otherwise ``statement`` itself.
    root_expression: exp.Expression
    # The dialect name the SQL was parsed with.
    dialect: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse_statement(sql: str, *, dialect: str) -> ParsedStatement:
    """Parse SQL text that must contain exactly one supported statement.

    Args:
        sql: SQL text; surrounding whitespace is ignored.
        dialect: Dialect name passed to ``sqlglot.parse`` as ``read``.

    Returns:
        The parsed statement together with its query root and the dialect.

    Raises:
        QueryMapParseError: If the input is empty, sqlglot raises while
            parsing, or no statement is produced.
        QueryMapContractError: If more than one statement is present, or the
            unwrapped root is not an ``exp.Query``.
    """
    text = (sql or "").strip()
    if not text:
        raise QueryMapParseError("SQL input is empty.")

    try:
        statements = []
        for candidate in sqlglot.parse(text, read=dialect):
            if candidate is not None:
                statements.append(candidate)
    except Exception as exc:
        raise QueryMapParseError(f"Failed to parse SQL with dialect {dialect!r}: {exc}") from exc

    if not statements:
        raise QueryMapParseError("SQL input produced no parseable statements.")
    if len(statements) > 1:
        raise QueryMapContractError("catalogkit-query accepts exactly one SQL statement per invocation.")

    (statement,) = statements
    root_expression = _unwrap_root_expression(statement)
    if isinstance(root_expression, exp.Query):
        return ParsedStatement(statement=statement, root_expression=root_expression, dialect=dialect)
    raise QueryMapContractError(
        "catalogkit-query supports exactly one SELECT, INSERT ... SELECT, or CREATE ... AS SELECT statement per invocation."
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _unwrap_root_expression(statement: exp.Expression) -> exp.Expression:
    """Pick the expression that structure mapping should start from.

    For a CREATE or INSERT statement whose ``expression`` argument is itself
    an expression, that inner expression is returned. In every other case the
    statement is returned unchanged.
    """
    if isinstance(statement, (exp.Create, exp.Insert)):
        inner = statement.args.get("expression")
        if isinstance(inner, exp.Expression):
            return inner
    return statement
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""Relation discovery for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from catalogkit.core import Node, cte_id, leaf_name, normalize_identifier, normalize_identifier_part, schema_name, table_id
|
|
9
|
+
from sqlglot import exp
|
|
10
|
+
|
|
11
|
+
from .ast_utils import (
|
|
12
|
+
cte_name,
|
|
13
|
+
has_join_ancestor,
|
|
14
|
+
iter_ctes,
|
|
15
|
+
iter_table_nodes,
|
|
16
|
+
iter_table_nodes_skipping_ctes,
|
|
17
|
+
qualified_table_name,
|
|
18
|
+
)
|
|
19
|
+
from .models import Relation
|
|
20
|
+
|
|
21
|
+
RelationUsageContext = Literal["from", "join", "cte_body"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class RelationUsage:
    """Immutable record of one place where a relation is referenced."""

    # Id of the node the reference resolved to.
    relation_id: str
    alias: str | None
    # One of "from", "join" or "cte_body".
    context: RelationUsageContext
    sql: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
class RelationExtraction:
    """Everything ``extract_relations`` discovered about one query."""

    # Table and CTE nodes combined, sorted by id.
    nodes: list[Node]
    # Public relation models, sorted by id.
    relations: list[Relation]
    # Usage records after de-duplication.
    relation_usages: list[RelationUsage]
    # CTE nodes keyed by normalized CTE name.
    cte_nodes: dict[str, Node]
    # Table nodes; looked up elsewhere by normalized qualified name.
    table_nodes: dict[str, Node]
    # Public relation models keyed by relation id.
    relation_by_id: dict[str, Relation]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def extract_relations(root_expression: exp.Expression, *, dialect: str) -> RelationExtraction:
    """Discover the table and CTE relations of a query and where they are used.

    Table references inside each CTE body are processed first (context
    ``"cte_body"``), then references reachable from the root without entering
    CTE definitions (context ``"from"``). In both passes a reference with a
    JOIN ancestor inside its scope gets context ``"join"`` instead.
    References for which ``_resolve_node`` returns ``None`` are skipped.
    """
    cte_nodes = _extract_cte_nodes(root_expression)
    cte_relations = _extract_cte_relations(root_expression)
    table_nodes: dict[str, Node] = {}
    relation_by_id: dict[str, Relation] = {}
    for cte_relation in cte_relations.values():
        relation_by_id[cte_relation.id] = cte_relation
    collected: list[RelationUsage] = []

    def record(table: exp.Table, *, scope: exp.Expression, default_context: RelationUsageContext) -> None:
        # table_nodes and relation_by_id are handed to _resolve_node so that
        # it can update them as references are resolved.
        node = _resolve_node(
            table,
            cte_nodes=cte_nodes,
            table_nodes=table_nodes,
            relation_by_id=relation_by_id,
            cte_relations=cte_relations,
        )
        if node is None:
            return
        context = "join" if has_join_ancestor(table, stop_node=scope) else default_context
        collected.append(
            _build_usage(
                relation_id=node.id,
                table=table,
                context=context,
                dialect=dialect,
            )
        )

    for cte in iter_ctes(root_expression):
        for table in iter_table_nodes(cte.this):
            record(table, scope=cte.this, default_context="cte_body")

    for table in iter_table_nodes_skipping_ctes(root_expression):
        record(table, scope=root_expression, default_context="from")

    return RelationExtraction(
        nodes=sorted([*table_nodes.values(), *cte_nodes.values()], key=lambda node: node.id),
        relations=sorted(relation_by_id.values(), key=lambda relation: relation.id),
        relation_usages=_dedupe_usages(collected),
        cte_nodes=cte_nodes,
        table_nodes=table_nodes,
        relation_by_id=relation_by_id,
    )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _extract_cte_nodes(root_expression: exp.Expression) -> dict[str, Node]:
    """Collect one ``cte`` node per named CTE found under ``root_expression``.

    The returned mapping is keyed by the normalized CTE name, and each node's
    ``name`` holds that same normalized value. CTEs for which ``cte_name``
    yields an empty value are skipped. When two CTEs normalize to the same
    key, the one visited last wins.
    """
    cte_nodes_by_name: dict[str, Node] = {}
    for cte_expression in iter_ctes(root_expression):
        declared_name = cte_name(cte_expression)
        if declared_name:
            lookup_key = normalize_identifier_part(declared_name)
            cte_nodes_by_name[lookup_key] = Node(
                id=cte_id(declared_name),
                kind="cte",
                name=lookup_key,
            )
    return cte_nodes_by_name
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _extract_cte_relations(root_expression: exp.Expression) -> dict[str, Relation]:
    """Collect one ``cte`` relation per named CTE found under ``root_expression``.

    The returned mapping is keyed by the normalized CTE name, while each
    relation keeps the name exactly as ``cte_name`` reported it. CTEs for
    which ``cte_name`` yields an empty value are skipped. When two CTEs
    normalize to the same key, the one visited last wins.
    """
    cte_relations_by_name: dict[str, Relation] = {}
    for cte_expression in iter_ctes(root_expression):
        declared_name = cte_name(cte_expression)
        if declared_name:
            lookup_key = normalize_identifier_part(declared_name)
            cte_relations_by_name[lookup_key] = Relation(
                id=cte_id(declared_name),
                kind="cte",
                name=declared_name,
            )
    return cte_relations_by_name
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _resolve_node(
    table: exp.Table,
    *,
    cte_nodes: dict[str, Node],
    table_nodes: dict[str, Node],
    relation_by_id: dict[str, Relation],
    cte_relations: dict[str, Relation],
) -> Node | None:
    """Resolve a table reference to the node it points at.

    Args:
        table: The table reference taken from the parsed statement.
        cte_nodes: CTE nodes keyed by normalized CTE name.
        table_nodes: Table nodes discovered so far, keyed by normalized
            qualified name. A newly seen table is added to this mapping.
        relation_by_id: Relations keyed by id. Updated in place with the
            relation for whichever node is returned.
        cte_relations: CTE relations keyed by normalized CTE name.

    Returns:
        The CTE node when an unqualified reference matches a known CTE,
        otherwise the (possibly newly created) table node. ``None`` when the
        reference has no usable name.
    """
    table_name = str(table.name or "").strip()
    if not table_name:
        return None

    raw_schema_name = _raw_schema_name(table)
    normalized_table_name = normalize_identifier_part(table_name)

    # A CTE can only be referenced by its bare name. A schema- or
    # catalog-qualified reference such as ``raw.orders`` inside
    # ``WITH orders AS (...)`` is the physical table, so it must not be
    # collapsed onto the CTE that happens to share its leaf name.
    if raw_schema_name is None:
        cte_node = cte_nodes.get(normalized_table_name)
        if cte_node is not None:
            relation_by_id[cte_node.id] = cte_relations[normalized_table_name]
            return cte_node

    qualified_name = qualified_table_name(table)
    normalized_qualified_name = normalize_identifier(qualified_name)
    node = table_nodes.get(normalized_qualified_name)
    if node is None:
        node = Node(
            id=table_id(qualified_name),
            kind="table",
            name=leaf_name(qualified_name),
            qualified_name=normalized_qualified_name,
            schema=schema_name(qualified_name),
        )
        table_nodes[normalized_qualified_name] = node
        # The relation keeps the names as written in the SQL, while the node
        # above carries the normalized forms.
        relation_by_id[node.id] = Relation(
            id=node.id,
            kind="table",
            name=table_name,
            qualified_name=qualified_name,
            schema=raw_schema_name,
        )
    return node
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _build_usage(
    *,
    relation_id: str,
    table: exp.Table,
    context: RelationUsageContext,
    dialect: str,
) -> RelationUsage:
    """Describe one occurrence of a relation inside the statement.

    The alias is the table's alias with surrounding whitespace removed, or
    ``None`` when nothing is left. ``sql`` is the table reference rendered
    back to SQL text in the requested dialect.
    """
    stripped_alias = str(table.alias or "").strip()
    alias = stripped_alias if stripped_alias else None
    rendered_sql = table.sql(dialect=dialect)
    return RelationUsage(
        relation_id=relation_id,
        alias=alias,
        context=context,
        sql=rendered_sql,
    )
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _dedupe_usages(usages: list[RelationUsage]) -> list[RelationUsage]:
    """Drop repeated usages while keeping first-seen order.

    Two usages count as the same when their relation id, alias, context and
    SQL text all match. The first occurrence of each combination is kept.
    """
    # Dicts preserve insertion order, and ``setdefault`` never replaces an
    # existing entry, so the first usage for each key is the one retained.
    first_by_key: dict[tuple[str, str | None, str, str], RelationUsage] = {}
    for candidate in usages:
        identity = (
            candidate.relation_id,
            candidate.alias,
            candidate.context,
            candidate.sql,
        )
        first_by_key.setdefault(identity, candidate)
    return list(first_by_key.values())
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _raw_schema_name(table: exp.Table) -> str | None:
    """Return the table's qualifier as written, or ``None`` if it has none.

    The catalog and db parts are stripped of surrounding whitespace, blank
    parts are dropped, and whatever remains is joined with ``"."`` in
    catalog-then-db order.
    """
    stripped_parts = (
        str(qualifier or "").strip() for qualifier in (table.catalog, table.db)
    )
    joined = ".".join(piece for piece in stripped_parts if piece)
    # Every kept piece is non-empty, so an empty join means no qualifier.
    return joined or None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Renderers for catalogkit-query."""
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""JSON renderer for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..models import QueryMap
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def render_json(query_map: QueryMap) -> dict:
    """Return the canonical JSON-serializable catalogkit-query artifact.

    The query map is dumped in JSON mode with field aliases as keys.
    """
    dump_options = {"mode": "json", "by_alias": True}
    return query_map.model_dump(**dump_options)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Text renderer for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..models import QueryMap
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def render_text(query_map: QueryMap) -> str:
    """Render the public catalogkit-query artifact for human reading.

    The output is a newline-joined report: a header with the summary fields,
    then the ``relations``, ``relation_usages`` and ``dependencies`` sections
    separated by blank lines. A ``warnings`` section is appended only when
    the query map carries warnings. There is no trailing newline.
    """
    header = query_map.summary
    report: list[str] = [
        "catalogkit-query",
        f"dialect: {header.dialect}",
        f"statement_type: {header.statement_type}",
        f"has_ctes: {header.has_ctes}",
        "",
        "relations:",
    ]
    for item in query_map.relations:
        # Prefer the qualified name; fall back to the bare name when absent.
        label = item.qualified_name or item.name
        report.append(f" - [{item.kind}] {label}")

    report += ["", "relation_usages:"]
    for use in query_map.relation_usages:
        alias_suffix = "" if not use.alias else f" alias={use.alias}"
        report.append(f" - {use.context}: {use.relation_id}{alias_suffix} :: {use.sql}")

    report += ["", "dependencies:"]
    report.extend(
        f" - {link.source_id} -> {link.target_id}" for link in query_map.edges
    )

    if query_map.warnings:
        report += ["", "warnings:"]
        for entry in query_map.warnings:
            where = "" if not entry.location else f" [{entry.location}]"
            report.append(f" - {entry.code}: {entry.message}{where}")

    return "\n".join(report)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: catalogkit-query
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Map one SQL statement into deterministic CatalogKit dependencies.
|
|
5
|
+
Author: ClearMetric Labs
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/Clearmetric-Labs/CatalogKit
|
|
8
|
+
Project-URL: Source, https://github.com/Clearmetric-Labs/CatalogKit
|
|
9
|
+
Project-URL: Issues, https://github.com/Clearmetric-Labs/CatalogKit/issues
|
|
10
|
+
Keywords: sql,lineage,dependencies,ctes,sqlglot
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: catalogkit-core>=0.1.0
|
|
24
|
+
Requires-Dist: sqlglot>=25.0.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
27
|
+
Provides-Extra: release
|
|
28
|
+
Requires-Dist: build>=1.2.2; extra == "release"
|
|
29
|
+
Requires-Dist: twine>=5.1.1; extra == "release"
|
|
30
|
+
|
|
31
|
+
# catalogkit-query
|
|
32
|
+
|
|
33
|
+
`catalogkit-query` maps one supported SQL statement into a deterministic `QueryMap`
|
|
34
|
+
artifact so you can answer "what feeds what in this query?" fast.
|
|
35
|
+
|
|
36
|
+
It is a narrow static-analysis tool:
|
|
37
|
+
|
|
38
|
+
- input: exactly one SQL statement from one SQL file
|
|
39
|
+
- output: canonical relations, relation usages, dependency edges, and warnings
|
|
40
|
+
- no warehouse credentials
|
|
41
|
+
- no dbt project
|
|
42
|
+
- no AI key
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
python -m pip install catalogkit-query
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Imports
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from catalogkit.query import build_catalog_artifact, build_query_map
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
For local development:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
python -m pip install -e ../catalogkit-core
|
|
60
|
+
python -m pip install -e ".[dev,release]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quickstart
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
catalogkit-query --dialect postgres ./examples/ugly_real_world.sql
|
|
67
|
+
catalogkit-query --dialect postgres --format json ./examples/ugly_real_world.sql
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Output Contract
|
|
71
|
+
|
|
72
|
+
`catalogkit-query` preserves its public `QueryMap` shape:
|
|
73
|
+
|
|
74
|
+
- `summary`
|
|
75
|
+
- `relations`
|
|
76
|
+
- `relation_usages`
|
|
77
|
+
- `edges`
|
|
78
|
+
- `outputs`
|
|
79
|
+
- `warnings`
|
|
80
|
+
|
|
81
|
+
For CatalogKit composition, the package also exposes a shared
|
|
82
|
+
`CatalogArtifact` builder backed by `catalogkit-core`.
|
|
83
|
+
|
|
84
|
+
The shared core artifact contains:
|
|
85
|
+
|
|
86
|
+
- `version`
|
|
87
|
+
- `nodes`
|
|
88
|
+
- `edges`
|
|
89
|
+
- `warnings`
|
|
90
|
+
|
|
91
|
+
## Supported Statements
|
|
92
|
+
|
|
93
|
+
`catalogkit-query` accepts exactly one supported statement per invocation:
|
|
94
|
+
|
|
95
|
+
- `SELECT ...`
|
|
96
|
+
- `INSERT ... SELECT ...`
|
|
97
|
+
- `CREATE ... AS SELECT ...`
|
|
98
|
+
|
|
99
|
+
Unsupported statement shapes fail loudly.
|
|
100
|
+
|
|
101
|
+
## Contract Docs
|
|
102
|
+
|
|
103
|
+
- [`../catalogkit-core/docs/contract.md`](../catalogkit-core/docs/contract.md)
|
|
104
|
+
- [`docs/limitations.md`](docs/limitations.md)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
catalogkit/query/__init__.py,sha256=6qW-plZ1StWh0O2Q_FxnyuOgGroUPxixIOIGdUzQSKo,873
|
|
2
|
+
catalogkit/query/__main__.py,sha256=iIrnnBjgpYVZD63t9ru8KzuP4CGDw6Cz_0k4UI962PI,175
|
|
3
|
+
catalogkit/query/_version.py,sha256=VDsBswzfU7gNsBkIaxxV6tsi0ixlms3WjN7HYgZJZ2g,46
|
|
4
|
+
catalogkit/query/api.py,sha256=xklIf3019k_StOzHqZhRlRVgngEgMJ224Auw0KcRS-M,969
|
|
5
|
+
catalogkit/query/ast_utils.py,sha256=ZUxja752oU6gVJ29hDd9hOFU-YDvGhwYb7s5J6e80Z4,2272
|
|
6
|
+
catalogkit/query/build.py,sha256=2C_XlIhb1kBPcYZCmdgVIB35ROCq9eq8QHCQEw05ZKI,8733
|
|
7
|
+
catalogkit/query/cli.py,sha256=JchrYitnOPFR-z7iVmOwh4H4wN-RY_73G5SA_6morVM,1692
|
|
8
|
+
catalogkit/query/ctes.py,sha256=a7CgBz7Jfs8VOBMPsI_TIP3qSfkjjLcV_H2833KETqw,3687
|
|
9
|
+
catalogkit/query/errors.py,sha256=9WFMZGViim10tDJ9sYDzb-_5TSr0ZLDnaDJg803SSB0,408
|
|
10
|
+
catalogkit/query/models.py,sha256=2BLSWPai196SAe_g4GeAuVq8m_JvMB2NzvTkYPEytzY,2272
|
|
11
|
+
catalogkit/query/parser.py,sha256=NEAMWtZGPEIXY8zBb-YvufJd1ZTEjfdLyrzEWSAJouQ,2016
|
|
12
|
+
catalogkit/query/relations.py,sha256=1egUNMsikbRWJf5WX7SO_QLXBQze46z4iOvZsIAg40E,6359
|
|
13
|
+
catalogkit/query/render/__init__.py,sha256=syO-eCIlCdZ1a_xkesnnpdjUQaOt9HUsutl5IAI6tDw,38
|
|
14
|
+
catalogkit/query/render/json.py,sha256=V_0VJltBB88gNxdN_ykkkxMB6vHQWJrLC-eQgY-sszY,293
|
|
15
|
+
catalogkit/query/render/text.py,sha256=fev3tMlo_ri8e0yk-jFKnvNx90c_z-tD3IFjIIwKbFo,1426
|
|
16
|
+
catalogkit_query-0.1.0.dist-info/METADATA,sha256=z_H6V1vM0O1hehzYAM9hDnWaXk0SUGciKg9rfChOAdc,2847
|
|
17
|
+
catalogkit_query-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
18
|
+
catalogkit_query-0.1.0.dist-info/entry_points.txt,sha256=p9fIIzi20wQMw9wVYJpS8L2w0Pl-HT07ga8MHY5joB4,63
|
|
19
|
+
catalogkit_query-0.1.0.dist-info/top_level.txt,sha256=RsnUdiXrSqkJn7elbvcSi3dAxxLtc6rLuzPGHGR_44I,11
|
|
20
|
+
catalogkit_query-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
catalogkit
|