catalogkit-query 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- catalogkit_query-0.1.0/PKG-INFO +104 -0
- catalogkit_query-0.1.0/README.md +74 -0
- catalogkit_query-0.1.0/pyproject.toml +65 -0
- catalogkit_query-0.1.0/setup.cfg +4 -0
- catalogkit_query-0.1.0/src/catalogkit/query/__init__.py +39 -0
- catalogkit_query-0.1.0/src/catalogkit/query/__main__.py +9 -0
- catalogkit_query-0.1.0/src/catalogkit/query/_version.py +3 -0
- catalogkit_query-0.1.0/src/catalogkit/query/api.py +39 -0
- catalogkit_query-0.1.0/src/catalogkit/query/ast_utils.py +72 -0
- catalogkit_query-0.1.0/src/catalogkit/query/build.py +249 -0
- catalogkit_query-0.1.0/src/catalogkit/query/cli.py +61 -0
- catalogkit_query-0.1.0/src/catalogkit/query/ctes.py +111 -0
- catalogkit_query-0.1.0/src/catalogkit/query/errors.py +15 -0
- catalogkit_query-0.1.0/src/catalogkit/query/models.py +93 -0
- catalogkit_query-0.1.0/src/catalogkit/query/parser.py +57 -0
- catalogkit_query-0.1.0/src/catalogkit/query/relations.py +210 -0
- catalogkit_query-0.1.0/src/catalogkit/query/render/__init__.py +1 -0
- catalogkit_query-0.1.0/src/catalogkit/query/render/json.py +10 -0
- catalogkit_query-0.1.0/src/catalogkit/query/render/text.py +42 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/PKG-INFO +104 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/SOURCES.txt +27 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/dependency_links.txt +1 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/entry_points.txt +2 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/requires.txt +9 -0
- catalogkit_query-0.1.0/src/catalogkit_query.egg-info/top_level.txt +1 -0
- catalogkit_query-0.1.0/tests/test_build.py +59 -0
- catalogkit_query-0.1.0/tests/test_cli.py +68 -0
- catalogkit_query-0.1.0/tests/test_errors.py +16 -0
- catalogkit_query-0.1.0/tests/test_statement_support.py +47 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: catalogkit-query
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Map one SQL statement into deterministic CatalogKit dependencies.
|
|
5
|
+
Author: ClearMetric Labs
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/Clearmetric-Labs/CatalogKit
|
|
8
|
+
Project-URL: Source, https://github.com/Clearmetric-Labs/CatalogKit
|
|
9
|
+
Project-URL: Issues, https://github.com/Clearmetric-Labs/CatalogKit/issues
|
|
10
|
+
Keywords: sql,lineage,dependencies,ctes,sqlglot
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Database
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: catalogkit-core>=0.1.0
|
|
24
|
+
Requires-Dist: sqlglot>=25.0.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
27
|
+
Provides-Extra: release
|
|
28
|
+
Requires-Dist: build>=1.2.2; extra == "release"
|
|
29
|
+
Requires-Dist: twine>=5.1.1; extra == "release"
|
|
30
|
+
|
|
31
|
+
# catalogkit-query
|
|
32
|
+
|
|
33
|
+
`catalogkit-query` maps one supported SQL statement into a deterministic `QueryMap`
|
|
34
|
+
artifact so you can answer "what feeds what in this query?" fast.
|
|
35
|
+
|
|
36
|
+
It is a narrow static-analysis tool:
|
|
37
|
+
|
|
38
|
+
- input: exactly one SQL statement from one SQL file
|
|
39
|
+
- output: canonical relations, relation usages, dependency edges, and warnings
|
|
40
|
+
- no warehouse credentials
|
|
41
|
+
- no dbt project
|
|
42
|
+
- no AI key
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
python -m pip install catalogkit-query
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Imports
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from catalogkit.query import build_catalog_artifact, build_query_map
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
For local development:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
python -m pip install -e ../catalogkit-core
|
|
60
|
+
python -m pip install -e ".[dev,release]"
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quickstart
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
catalogkit-query --dialect postgres ./examples/ugly_real_world.sql
|
|
67
|
+
catalogkit-query --dialect postgres --format json ./examples/ugly_real_world.sql
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Output Contract
|
|
71
|
+
|
|
72
|
+
`catalogkit-query` preserves its public `QueryMap` shape:
|
|
73
|
+
|
|
74
|
+
- `summary`
|
|
75
|
+
- `relations`
|
|
76
|
+
- `relation_usages`
|
|
77
|
+
- `edges`
|
|
78
|
+
- `outputs`
|
|
79
|
+
- `warnings`
|
|
80
|
+
|
|
81
|
+
For CatalogKit composition, the package also exposes a shared
|
|
82
|
+
`CatalogArtifact` builder backed by `catalogkit-core`.
|
|
83
|
+
|
|
84
|
+
The shared core artifact contains:
|
|
85
|
+
|
|
86
|
+
- `version`
|
|
87
|
+
- `nodes`
|
|
88
|
+
- `edges`
|
|
89
|
+
- `warnings`
|
|
90
|
+
|
|
91
|
+
## Supported Statements
|
|
92
|
+
|
|
93
|
+
`catalogkit-query` accepts exactly one supported statement per invocation:
|
|
94
|
+
|
|
95
|
+
- `SELECT ...`
|
|
96
|
+
- `INSERT ... SELECT ...`
|
|
97
|
+
- `CREATE ... AS SELECT ...`
|
|
98
|
+
|
|
99
|
+
Unsupported statement shapes fail loudly.
|
|
100
|
+
|
|
101
|
+
## Contract Docs
|
|
102
|
+
|
|
103
|
+
- [`../catalogkit-core/docs/contract.md`](../catalogkit-core/docs/contract.md)
|
|
104
|
+
- [`docs/limitations.md`](docs/limitations.md)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# catalogkit-query
|
|
2
|
+
|
|
3
|
+
`catalogkit-query` maps one supported SQL statement into a deterministic `QueryMap`
|
|
4
|
+
artifact so you can answer "what feeds what in this query?" fast.
|
|
5
|
+
|
|
6
|
+
It is a narrow static-analysis tool:
|
|
7
|
+
|
|
8
|
+
- input: exactly one SQL statement from one SQL file
|
|
9
|
+
- output: canonical relations, relation usages, dependency edges, and warnings
|
|
10
|
+
- no warehouse credentials
|
|
11
|
+
- no dbt project
|
|
12
|
+
- no AI key
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
python -m pip install catalogkit-query
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Imports
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from catalogkit.query import build_catalog_artifact, build_query_map
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
For local development:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python -m pip install -e ../catalogkit-core
|
|
30
|
+
python -m pip install -e ".[dev,release]"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quickstart
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
catalogkit-query --dialect postgres ./examples/ugly_real_world.sql
|
|
37
|
+
catalogkit-query --dialect postgres --format json ./examples/ugly_real_world.sql
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Output Contract
|
|
41
|
+
|
|
42
|
+
`catalogkit-query` preserves its public `QueryMap` shape:
|
|
43
|
+
|
|
44
|
+
- `summary`
|
|
45
|
+
- `relations`
|
|
46
|
+
- `relation_usages`
|
|
47
|
+
- `edges`
|
|
48
|
+
- `outputs`
|
|
49
|
+
- `warnings`
|
|
50
|
+
|
|
51
|
+
For CatalogKit composition, the package also exposes a shared
|
|
52
|
+
`CatalogArtifact` builder backed by `catalogkit-core`.
|
|
53
|
+
|
|
54
|
+
The shared core artifact contains:
|
|
55
|
+
|
|
56
|
+
- `version`
|
|
57
|
+
- `nodes`
|
|
58
|
+
- `edges`
|
|
59
|
+
- `warnings`
|
|
60
|
+
|
|
61
|
+
## Supported Statements
|
|
62
|
+
|
|
63
|
+
`catalogkit-query` accepts exactly one supported statement per invocation:
|
|
64
|
+
|
|
65
|
+
- `SELECT ...`
|
|
66
|
+
- `INSERT ... SELECT ...`
|
|
67
|
+
- `CREATE ... AS SELECT ...`
|
|
68
|
+
|
|
69
|
+
Unsupported statement shapes fail loudly.
|
|
70
|
+
|
|
71
|
+
## Contract Docs
|
|
72
|
+
|
|
73
|
+
- [`../catalogkit-core/docs/contract.md`](../catalogkit-core/docs/contract.md)
|
|
74
|
+
- [`docs/limitations.md`](docs/limitations.md)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[build-system]
# The project table declares `license = "Apache-2.0"` as a plain SPDX string
# (PEP 639). setuptools only accepts that form from the 77 series onward, so
# the build requirement must not allow older releases.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "catalogkit-query"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Map one SQL statement into deterministic CatalogKit dependencies."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "Apache-2.0"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "ClearMetric Labs"},
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"sql",
|
|
17
|
+
"lineage",
|
|
18
|
+
"dependencies",
|
|
19
|
+
"ctes",
|
|
20
|
+
"sqlglot",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
"Programming Language :: Python :: 3.13",
|
|
31
|
+
"Topic :: Database",
|
|
32
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
33
|
+
]
|
|
34
|
+
dependencies = [
|
|
35
|
+
"catalogkit-core>=0.1.0",
|
|
36
|
+
"sqlglot>=25.0.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=7.0.0",
|
|
42
|
+
]
|
|
43
|
+
release = [
|
|
44
|
+
"build>=1.2.2",
|
|
45
|
+
"twine>=5.1.1",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[project.scripts]
|
|
49
|
+
catalogkit-query = "catalogkit.query.cli:main"
|
|
50
|
+
|
|
51
|
+
[project.urls]
|
|
52
|
+
Homepage = "https://github.com/Clearmetric-Labs/CatalogKit"
|
|
53
|
+
Source = "https://github.com/Clearmetric-Labs/CatalogKit"
|
|
54
|
+
Issues = "https://github.com/Clearmetric-Labs/CatalogKit/issues"
|
|
55
|
+
|
|
56
|
+
[tool.setuptools.dynamic]
|
|
57
|
+
version = {attr = "catalogkit.query._version.__version__"}
|
|
58
|
+
|
|
59
|
+
[tool.setuptools.package-dir]
|
|
60
|
+
"" = "src"
|
|
61
|
+
|
|
62
|
+
[tool.setuptools.packages.find]
|
|
63
|
+
where = ["src"]
|
|
64
|
+
include = ["catalogkit.query*"]
|
|
65
|
+
namespaces = true
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Public package surface for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from catalogkit.core import CatalogArtifact
|
|
6
|
+
|
|
7
|
+
from ._version import __version__
|
|
8
|
+
from .api import build_catalog_artifact, build_query_map, render_json, render_text
|
|
9
|
+
from .errors import QueryMapContractError, QueryMapError, QueryMapParseError
|
|
10
|
+
from .models import (
|
|
11
|
+
OutputColumn,
|
|
12
|
+
OutputSourceHint,
|
|
13
|
+
QueryMap,
|
|
14
|
+
QuerySummary,
|
|
15
|
+
Relation,
|
|
16
|
+
RelationEdge,
|
|
17
|
+
RelationUsage,
|
|
18
|
+
WarningEntry,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Names exported by `from catalogkit.query import *`: the version string, the
# builder and renderer functions, the error classes, the model classes, and
# `CatalogArtifact` re-exported from `catalogkit.core`.
__all__ = [
    "__version__",
    "build_catalog_artifact",
    "build_query_map",
    "CatalogArtifact",
    "OutputColumn",
    "OutputSourceHint",
    "QueryMap",
    "QuerySummary",
    "QueryMapContractError",
    "QueryMapError",
    "QueryMapParseError",
    "Relation",
    "RelationEdge",
    "RelationUsage",
    "render_json",
    "render_text",
    "WarningEntry",
]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Public API for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from catalogkit.core import CatalogArtifact
|
|
6
|
+
|
|
7
|
+
from .build import build_catalog_artifact_from_parsed, build_query_map_from_parsed
|
|
8
|
+
from .models import QueryMap
|
|
9
|
+
from .parser import parse_statement
|
|
10
|
+
from .render.json import render_json
|
|
11
|
+
from .render.text import render_text
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_query_map(
    sql: str,
    *,
    dialect: str,
) -> QueryMap:
    """Parse one SQL statement and return its ``QueryMap`` artifact.

    ``sql`` and the keyword-only ``dialect`` are forwarded to
    ``parse_statement``; the parsed statement is then handed to
    ``build_query_map_from_parsed`` and that result is returned as is.
    Exceptions raised by either step propagate to the caller.
    """
    return build_query_map_from_parsed(parse_statement(sql, dialect=dialect))
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def build_catalog_artifact(
    sql: str,
    *,
    dialect: str,
) -> CatalogArtifact:
    """Parse one SQL statement and return the shared ``CatalogArtifact``.

    ``sql`` and the keyword-only ``dialect`` are forwarded to
    ``parse_statement``; the parsed statement is then handed to
    ``build_catalog_artifact_from_parsed`` and that result is returned as is.
    Exceptions raised by either step propagate to the caller.
    """
    return build_catalog_artifact_from_parsed(parse_statement(sql, dialect=dialect))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Public names of this module: the two builders defined here plus the two
# renderers imported from the `render` subpackage.
__all__ = [
    "build_catalog_artifact",
    "build_query_map",
    "render_json",
    "render_text",
]
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Shared sqlglot AST helpers for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable
|
|
6
|
+
|
|
7
|
+
from sqlglot import exp
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def iter_ctes(root_expression: exp.Expression) -> Iterable[exp.CTE]:
    """Lazily yield each ``exp.CTE`` node found under ``root_expression``.

    Nodes come out in whatever order ``root_expression.find_all`` produces.
    """
    for cte in root_expression.find_all(exp.CTE):
        yield cte
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cte_name(cte: exp.CTE) -> str:
    """Return the name a CTE is declared under, taken from ``cte.alias``.

    A ``str`` alias is returned untouched. Any other value is converted with
    ``str`` and stripped of surrounding whitespace; a falsy alias yields ``""``.
    """
    declared = cte.alias
    return declared if isinstance(declared, str) else str(declared or "").strip()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def qualified_table_name(table: exp.Table) -> str:
    """Join the table's catalog, db, and name into one dotted identifier.

    The three attributes are visited in that order. A component is dropped
    when it is falsy or blank after stripping; the remaining components are
    stripped and joined with ``"."``.
    """
    segments = []
    for component in (table.catalog, table.db, table.name):
        # Falsy values and whitespace-only text contribute no segment.
        if str(component or "").strip():
            segments.append(str(component).strip())
    return ".".join(segments)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def has_join_ancestor(node: exp.Expression, *, stop_node: exp.Expression) -> bool:
    """Report whether an ``exp.Join`` sits between ``node`` and ``stop_node``.

    The walk starts at ``node.parent`` and follows ``parent`` links upward.
    It ends, returning ``False``, when the chain runs out or when
    ``stop_node`` is reached; ``stop_node`` itself is never inspected.
    """
    ancestor = node.parent
    while not (ancestor is None or ancestor is stop_node):
        if isinstance(ancestor, exp.Join):
            return True
        ancestor = ancestor.parent
    return False
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def iter_table_nodes(expression: exp.Expression | None) -> Iterable[exp.Table]:
    """Lazily yield every ``exp.Table`` found under ``expression``.

    ``None`` yields nothing. Otherwise the nodes come out in the order
    ``expression.find_all`` produces them.
    """
    if expression is not None:
        yield from expression.find_all(exp.Table)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def iter_table_nodes_skipping_ctes(
    expression: exp.Expression | None,
) -> Iterable[exp.Table]:
    """Lazily yield table nodes under ``expression``, ignoring CTE subtrees.

    ``None`` yields nothing. Otherwise the traversal is delegated to
    ``_walk_tables`` with ``skip_ctes=True``.
    """
    if expression is not None:
        for table in _walk_tables(expression, skip_ctes=True):
            yield table
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _walk_tables(expression: exp.Expression, *, skip_ctes: bool) -> Iterable[exp.Table]:
    """Yield ``exp.Table`` nodes in depth-first, argument order.

    A table node is yielded and not descended into. When ``skip_ctes`` is
    true, an ``exp.CTE`` node and everything beneath it is ignored. Children
    are taken from ``args`` values: expressions directly, or the expression
    items of a list value; anything else is skipped.
    """
    pending = [expression]
    while pending:
        current = pending.pop()
        if skip_ctes and isinstance(current, exp.CTE):
            continue
        if isinstance(current, exp.Table):
            yield current
            continue
        children = []
        for value in current.args.values():
            candidates = value if isinstance(value, list) else [value]
            children.extend(
                candidate for candidate in candidates if isinstance(candidate, exp.Expression)
            )
        # Push in reverse so the first child is popped (visited) first.
        pending.extend(reversed(children))
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Artifact assembly for catalogkit-query."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, cast
|
|
6
|
+
|
|
7
|
+
from catalogkit.core import CatalogArtifact, Edge, Evidence, Warning, merge
|
|
8
|
+
from sqlglot import exp
|
|
9
|
+
|
|
10
|
+
from .ctes import extract_dependency_edges
|
|
11
|
+
from .errors import QueryMapContractError
|
|
12
|
+
from .models import (
|
|
13
|
+
EdgeKind as QueryMapEdgeKind,
|
|
14
|
+
QueryMap,
|
|
15
|
+
QuerySummary,
|
|
16
|
+
RelationEdge,
|
|
17
|
+
RelationUsage,
|
|
18
|
+
WarningCode,
|
|
19
|
+
WarningEntry,
|
|
20
|
+
)
|
|
21
|
+
from .parser import ParsedStatement
|
|
22
|
+
from .relations import RelationExtraction, extract_relations
|
|
23
|
+
|
|
24
|
+
# Edge kinds the public QueryMap may carry. `_relation_edge_from_catalog_edge`
# raises QueryMapContractError for any catalog edge whose kind is not listed.
_QUERYMAP_EDGE_KINDS = {"depends_on", "joins"}
# Warning codes the public QueryMap may carry. `_warning_entry_from_warning`
# raises QueryMapContractError for any warning whose code is not listed.
_QUERYMAP_WARNING_CODES = {
    "parse_recovered",
    "select_star",
    "table_star",
    "ambiguous_output_source",
    "unresolved_output_source",
    "non_equi_join",
    "unsupported_construct",
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def build_query_map_from_parsed(parsed: ParsedStatement) -> QueryMap:
    """Assemble the public ``QueryMap`` for an already parsed statement.

    The shared catalog artifact is built first; its edges and warnings are
    converted to the public model types and de-duplicated. Relations and
    relation usages come from the relation extraction. ``outputs`` is always
    empty and ``output_count`` is always 0 here, and each usage's
    ``normalized_sql`` is set to the same text as its ``sql``.

    Raises:
        QueryMapContractError: propagated from the artifact build or from the
            edge/warning conversion helpers.
    """
    artifact, extraction = _build_catalog_artifact_from_parsed(parsed)

    public_edges = [_relation_edge_from_catalog_edge(item) for item in artifact.edges]
    public_warnings = [_warning_entry_from_warning(item) for item in artifact.warnings]

    query_summary = QuerySummary(
        dialect=parsed.dialect,
        statement_type=parsed.statement.key.lower(),
        has_ctes=bool(extraction.cte_nodes),
        relation_count=len(extraction.relations),
        cte_count=len(extraction.cte_nodes),
        output_count=0,
    )

    usages = []
    for usage in extraction.relation_usages:
        usages.append(
            RelationUsage(
                relation_id=usage.relation_id,
                alias=usage.alias,
                context=usage.context,
                sql=usage.sql,
                normalized_sql=usage.sql,
            )
        )

    return QueryMap(
        version=artifact.version,
        summary=query_summary,
        relations=extraction.relations,
        relation_usages=usages,
        edges=_dedupe_query_map_edges(public_edges),
        outputs=[],
        warnings=_dedupe_warnings(public_warnings),
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def build_catalog_artifact_from_parsed(parsed: ParsedStatement) -> CatalogArtifact:
    """Return only the shared ``CatalogArtifact`` for a parsed statement.

    Delegates to ``_build_catalog_artifact_from_parsed`` and discards the
    relation-extraction metadata that helper also returns.
    """
    return _build_catalog_artifact_from_parsed(parsed)[0]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _build_catalog_artifact_from_parsed(
    parsed: ParsedStatement,
) -> tuple[CatalogArtifact, RelationExtraction]:
    """Build the merged catalog artifact and return it with its extraction.

    Relations and dependency edges are extracted from the parsed root
    expression. Every extracted node is deep-copied, and each relation usage
    appends one high-confidence ``Evidence`` entry to the copy of the node it
    refers to. The artifact holds the copies sorted by id, the dependency
    edges, and the dependency warnings followed by the contract warnings; it
    is passed through ``merge`` before being returned.

    Raises:
        QueryMapContractError: if relation extraction produced no nodes.
    """
    extraction = extract_relations(parsed.root_expression, dialect=parsed.dialect)
    dependencies = extract_dependency_edges(
        parsed.root_expression,
        relation_extraction=extraction,
        dialect=parsed.dialect,
    )

    if not extraction.nodes:
        raise QueryMapContractError("No tables or CTE relations were found in the SQL statement.")

    # Work on copies so the evidence appended below never touches the
    # extraction's own node objects.
    copies = {}
    for extracted in extraction.nodes:
        copies[extracted.id] = extracted.model_copy(deep=True)

    for usage in extraction.relation_usages:
        copies[usage.relation_id].evidence.append(
            Evidence(
                location=usage.context,
                expression=usage.sql,
                confidence="high",
            )
        )

    ordered_nodes = sorted(copies.values(), key=lambda node: node.id)
    combined_warnings = list(dependencies.warnings)
    combined_warnings.extend(
        _extract_contract_warnings(parsed.root_expression, dialect=parsed.dialect)
    )

    artifact = CatalogArtifact(
        nodes=ordered_nodes,
        edges=dependencies.edges,
        warnings=combined_warnings,
    )
    return merge(artifact), extraction
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _extract_contract_warnings(
    root_expression: Any,
    *,
    dialect: str,
) -> list[Warning]:
    """Collect warnings for constructs the MVP maps only partially.

    Warnings are emitted in this order:

    1. at most one each for the first UNION, INTERSECT, and EXCEPT found;
    2. one per ``*`` / ``table.*`` projection in every SELECT;
    3. one per JOIN that has neither ON nor USING, or whose ON clause is not
       an equality join according to ``_is_equality_join``.

    Each warning's ``location`` is the offending node rendered as SQL in
    ``dialect``.
    """
    collected: list[Warning] = []

    def _note(code: str, message: str, node: Any) -> None:
        collected.append(
            Warning(
                code=code,
                message=message,
                location=node.sql(dialect=dialect),
            )
        )

    set_operations = (
        (exp.Union, "UNION queries are mapped at the relation level only in the MVP."),
        (exp.Intersect, "INTERSECT queries are mapped at the relation level only in the MVP."),
        (exp.Except, "EXCEPT queries are mapped at the relation level only in the MVP."),
    )
    for operation_type, message in set_operations:
        first_match = root_expression.find(operation_type)
        if first_match is not None:
            _note("unsupported_construct", message, first_match)

    for select in root_expression.find_all(exp.Select):
        for projection in select.expressions:
            if isinstance(projection, exp.Star):
                _note(
                    "select_star",
                    "SELECT * was detected; output mapping is deferred in the MVP.",
                    projection,
                )
                continue
            if isinstance(projection, exp.Column) and str(projection.name or "").strip() == "*":
                _note(
                    "table_star",
                    "table.* was detected; output mapping is deferred in the MVP.",
                    projection,
                )

    for join in root_expression.find_all(exp.Join):
        on_clause = join.args.get("on")
        if on_clause is None:
            # A USING join carries no ON clause and is accepted silently.
            if join.args.get("using") is None:
                _note(
                    "unsupported_construct",
                    "JOIN without ON/USING is not modeled beyond relation dependency mapping.",
                    join,
                )
            continue
        if not _is_equality_join(on_clause):
            _note(
                "non_equi_join",
                "Non-equality join detected; MVP preserves relation dependencies but does not model join semantics.",
                join,
            )

    return collected
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def _is_equality_join(expression: Any) -> bool:
    """Return True when a join condition consists only of equality tests.

    A condition qualifies when it is a single ``=`` comparison or an ``AND``
    whose two sides both qualify. Parentheses are transparent, so
    ``ON (a.id = b.id)`` and ``ON (a.x = b.x) AND (a.y = b.y)`` are treated
    the same as their unparenthesised forms. Everything else (``OR``,
    inequalities, function calls, ...) returns False.
    """
    # Parentheses only group; unwrap them so a parenthesised equality is not
    # misreported as a non-equi join.
    while isinstance(expression, exp.Paren):
        expression = expression.this
    if isinstance(expression, exp.EQ):
        return True
    if isinstance(expression, exp.And):
        return _is_equality_join(expression.left) and _is_equality_join(expression.right)
    return False
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _relation_edge_from_catalog_edge(edge: Edge) -> RelationEdge:
    """Convert a shared catalog ``Edge`` into the public ``RelationEdge``.

    The edge's SQL is the expression of its first evidence entry, or ``None``
    when it has no evidence; the same value is used for ``normalized_sql``.

    Raises:
        QueryMapContractError: if ``edge.kind`` is not in
            ``_QUERYMAP_EDGE_KINDS``.
    """
    if edge.kind in _QUERYMAP_EDGE_KINDS:
        evidence_sql = edge.evidence[0].expression if edge.evidence else None
        return RelationEdge(
            kind=cast(QueryMapEdgeKind, edge.kind),
            source_id=edge.source_id,
            target_id=edge.target_id,
            label=edge.label,
            confidence=edge.confidence,
            sql=evidence_sql,
            normalized_sql=evidence_sql,
        )
    raise QueryMapContractError(
        f"catalogkit-query cannot emit unsupported edge kind {edge.kind!r} in its public contract."
    )
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _warning_entry_from_warning(warning: Warning) -> WarningEntry:
    """Convert a shared catalog ``Warning`` into the public ``WarningEntry``.

    Code, message, and location are copied across unchanged.

    Raises:
        QueryMapContractError: if ``warning.code`` is not in
            ``_QUERYMAP_WARNING_CODES``.
    """
    if warning.code in _QUERYMAP_WARNING_CODES:
        return WarningEntry(
            code=cast(WarningCode, warning.code),
            message=warning.message,
            location=warning.location,
        )
    raise QueryMapContractError(
        f"catalogkit-query cannot emit unsupported warning code {warning.code!r} in its public contract."
    )
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _dedupe_query_map_edges(edges: list[RelationEdge]) -> list[RelationEdge]:
    """Drop repeated edges, keeping the first occurrence and input order.

    Two edges count as duplicates only when their kind, source id, target id,
    and SQL text all match. The kind is part of the identity so that a
    ``depends_on`` edge and a ``joins`` edge between the same two relations
    are both preserved instead of one silently shadowing the other.

    Args:
        edges: Public relation edges, possibly containing repeats.

    Returns:
        A new list with duplicates removed; the input list is not modified.
    """
    seen: set[tuple[str, str, str, str | None]] = set()
    deduped: list[RelationEdge] = []
    for edge in edges:
        key = (edge.kind, edge.source_id, edge.target_id, edge.sql)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(edge)
    return deduped
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _dedupe_warnings(warnings: list[WarningEntry]) -> list[WarningEntry]:
    """Drop repeated warnings, keeping the first occurrence and input order.

    Two warnings are duplicates when code, message, and location all match.
    A new list is returned; the input list is not modified.
    """
    first_by_key: dict[tuple[str, str, str | None], WarningEntry] = {}
    for entry in warnings:
        # setdefault keeps the earliest entry for a key; dicts keep insertion order.
        first_by_key.setdefault((entry.code, entry.message, entry.location), entry)
    return list(first_by_key.values())
|