sql-glider 0.1.8__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/METADATA +1 -1
- {sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/RECORD +12 -10
- sqlglider/_version.py +2 -2
- sqlglider/cli.py +101 -2
- sqlglider/graph/builder.py +206 -20
- sqlglider/graph/formatters.py +98 -0
- sqlglider/lineage/analyzer.py +245 -4
- sqlglider/utils/config.py +5 -0
- sqlglider/utils/schema.py +62 -0
- {sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/WHEEL +0 -0
- {sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/entry_points.txt +0 -0
- {sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/licenses/LICENSE +0 -0
{sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.8
+Version: 0.1.12
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
{sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
-sqlglider/_version.py,sha256=
-sqlglider/cli.py,sha256=
+sqlglider/_version.py,sha256=cEPXLUpTV7EzqolnyXW8nf8Hr6IVyBji9CzB6Cq_Ar0,706
+sqlglider/cli.py,sha256=qEDLZ1a6yr-BzrtkBsJEHPByMmRERsGKZsYFTn9kaMY,55624
 sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
 sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
 sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,13 +11,14 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
 sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
 sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
 sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
-sqlglider/graph/builder.py,sha256=
+sqlglider/graph/builder.py,sha256=o0SnH5eWUUPpzRSdsdCXEva3QTlhLDagJulJ2hRFQqA,19895
+sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
 sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
 sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
 sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
 sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9ZlgTQ,2781
 sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
-sqlglider/lineage/analyzer.py,sha256=
+sqlglider/lineage/analyzer.py,sha256=-LUeVNEsjfEWoKAJ2qVIiJO1noqwae4jQkwkkkVbAT8,75950
 sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
 sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
 sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
@@ -25,10 +26,11 @@ sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g
 sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
 sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
 sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
-sqlglider/utils/config.py,sha256=
+sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
 sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
-
-sql_glider-0.1.
-sql_glider-0.1.
-sql_glider-0.1.
-sql_glider-0.1.
+sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
+sql_glider-0.1.12.dist-info/METADATA,sha256=73yuoWaAE5DKE9wobDXxbERSP2Pq-WpdqCnaswAa9fQ,28446
+sql_glider-0.1.12.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sql_glider-0.1.12.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+sql_glider-0.1.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sql_glider-0.1.12.dist-info/RECORD,,
sqlglider/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.8'
-__version_tuple__ = version_tuple = (0, 1, 8)
+__version__ = version = '0.1.12'
+__version_tuple__ = version_tuple = (0, 1, 12)
 
 __commit_id__ = commit_id = None
sqlglider/cli.py
CHANGED
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
 from typing_extensions import Annotated
 
 from sqlglider.global_models import AnalysisLevel, NodeFormat
-from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
 from sqlglider.lineage.formatters import (
     CsvFormatter,
     JsonFormatter,
@@ -166,6 +166,11 @@ def lineage(
         exists=True,
         help="Path to variables file (JSON or YAML)",
     ),
+    no_star: bool = typer.Option(
+        False,
+        "--no-star",
+        help="Fail if SELECT * cannot be resolved to actual columns",
+    ),
 ) -> None:
     """
     Analyze column or table lineage for a SQL file.
@@ -207,6 +212,7 @@ def lineage(
     level_str = level or config.level or "column"
     output_format = output_format or config.output_format or "text"
     templater = templater or config.templater  # None means no templating
+    no_star = no_star or config.no_star or False
     # Validate and convert level to enum
     try:
         analysis_level = AnalysisLevel(level_str)
@@ -261,7 +267,7 @@
     )
 
     # Create analyzer
-    analyzer = LineageAnalyzer(sql, dialect=dialect)
+    analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)
 
     # Unified lineage analysis (handles both single and multi-query files)
     results = analyzer.analyze_queries(
@@ -990,6 +996,40 @@ def graph_build(
         exists=True,
         help="Path to variables file (JSON or YAML)",
     ),
+    no_star: bool = typer.Option(
+        False,
+        "--no-star",
+        help="Fail if SELECT * cannot be resolved to actual columns",
+    ),
+    resolve_schema: bool = typer.Option(
+        False,
+        "--resolve-schema",
+        help="Extract schema from all files before lineage analysis, "
+        "enabling cross-file star resolution",
+    ),
+    catalog_type: Optional[str] = typer.Option(
+        None,
+        "--catalog-type",
+        "-c",
+        help="Catalog provider for pulling DDL of tables not found in files "
+        "(requires --resolve-schema). E.g. 'databricks'",
+    ),
+    dump_schema: Optional[Path] = typer.Option(
+        None,
+        "--dump-schema",
+        help="Dump resolved schema to file (requires --resolve-schema)",
+    ),
+    dump_schema_format: Optional[str] = typer.Option(
+        None,
+        "--dump-schema-format",
+        help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
+    ),
+    strict_schema: bool = typer.Option(
+        False,
+        "--strict-schema",
+        help="Fail if any column's table cannot be identified during schema extraction "
+        "(requires --resolve-schema)",
+    ),
 ) -> None:
     """
     Build a lineage graph from SQL files.
@@ -1024,6 +1064,38 @@ def graph_build(
     config = load_config()
     dialect = dialect or config.dialect or "spark"
     templater = templater or config.templater  # None means no templating
+    no_star = no_star or config.no_star or False
+    resolve_schema = resolve_schema or config.resolve_schema or False
+    strict_schema = strict_schema or config.strict_schema or False
+
+    if strict_schema and not resolve_schema:
+        err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
+        raise typer.Exit(1)
+
+    if catalog_type and not resolve_schema:
+        err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
+        raise typer.Exit(1)
+
+    # Resolve dump_schema options from config
+    dump_schema = dump_schema or (
+        Path(config.dump_schema) if config.dump_schema else None
+    )
+    dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
+
+    if dump_schema and not resolve_schema:
+        err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
+        raise typer.Exit(1)
+
+    if dump_schema_format not in ("text", "json", "csv"):
+        err_console.print(
+            f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
+            "Use 'text', 'json', or 'csv'."
+        )
+        raise typer.Exit(1)
+
+    # Only inherit catalog_type from config when resolve_schema is active
+    if resolve_schema and not catalog_type:
+        catalog_type = config.catalog_type
 
     # Validate and convert node format to enum
     try:
@@ -1076,10 +1148,22 @@ def graph_build(
     sql_preprocessor = _preprocess
 
     try:
+        # Build catalog config from config file if available
+        catalog_config_dict = None
+        if catalog_type and config.catalog:
+            provider_config = getattr(config.catalog, catalog_type, None)
+            if provider_config:
+                catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
         builder = GraphBuilder(
             node_format=node_format_enum,
             dialect=dialect,
             sql_preprocessor=sql_preprocessor,
+            no_star=no_star,
+            resolve_schema=resolve_schema,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config_dict,
+            strict_schema=strict_schema,
         )
 
         # Process manifest if provided
@@ -1102,6 +1186,17 @@ def graph_build(
             raise typer.Exit(1)
         builder.add_files(all_files, dialect=dialect)
 
+        # Dump resolved schema if requested
+        if dump_schema:
+            from sqlglider.graph.formatters import format_schema
+
+            schema_content = format_schema(builder.resolved_schema, dump_schema_format)
+            dump_schema.write_text(schema_content, encoding="utf-8")
+            console.print(
+                f"[green]Schema dumped to {dump_schema} "
+                f"({len(builder.resolved_schema)} table(s))[/green]"
+            )
+
         # Build and save graph
         graph = builder.build()
         save_graph(graph, output)
@@ -1111,6 +1206,10 @@ def graph_build(
             f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
         )
 
+    except SchemaResolutionError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
     except FileNotFoundError as e:
         err_console.print(f"[red]Error:[/red] {e}")
         raise typer.Exit(1)
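Taken together, the graph-building command (graph_build) gains six flags: --no-star (also added to the lineage command), --resolve-schema, --catalog-type/-c, --dump-schema, --dump-schema-format, and --strict-schema. The command rejects --strict-schema, --catalog-type, and --dump-schema unless --resolve-schema is also given, and --dump-schema-format only accepts 'text', 'json', or 'csv'. Each flag can also come from the matching config setting; a combined invocation would look roughly like "... graph build --resolve-schema --catalog-type databricks --dump-schema schema.csv --dump-schema-format csv" (the exact executable and subcommand names are assumptions inferred from the function names in this diff, not confirmed by it).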
sqlglider/graph/builder.py
CHANGED
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
     LineageGraph,
     Manifest,
 )
-from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
 from sqlglider.utils.file_utils import read_sql_file
+from sqlglider.utils.schema import parse_ddl_to_schema
 
 console = Console(stderr=True)
 
@@ -33,6 +34,11 @@ class GraphBuilder:
         node_format: NodeFormat = NodeFormat.QUALIFIED,
         dialect: str = "spark",
         sql_preprocessor: Optional[SqlPreprocessor] = None,
+        no_star: bool = False,
+        resolve_schema: bool = False,
+        catalog_type: Optional[str] = None,
+        catalog_config: Optional[Dict[str, object]] = None,
+        strict_schema: bool = False,
     ):
         """
         Initialize the graph builder.
@@ -43,15 +49,32 @@ class GraphBuilder:
             sql_preprocessor: Optional function to preprocess SQL before analysis.
                 Takes (sql: str, file_path: Path) and returns processed SQL.
                 Useful for templating (e.g., Jinja2 rendering).
+            no_star: If True, fail when SELECT * cannot be resolved to columns
+            resolve_schema: If True, run a schema extraction pass across all
+                files before lineage analysis so that schema from any file is
+                available when analyzing every other file.
+            catalog_type: Optional catalog provider name (e.g. "databricks").
+                When set together with resolve_schema, DDL is pulled from the
+                catalog for tables whose schema could not be inferred from files.
+            catalog_config: Optional provider-specific configuration dict
+                passed to the catalog's configure() method.
+            strict_schema: If True, fail during schema extraction when an
+                unqualified column cannot be attributed to a table.
         """
         self.node_format = node_format
         self.dialect = dialect
         self.sql_preprocessor = sql_preprocessor
+        self.no_star = no_star
+        self.resolve_schema = resolve_schema
+        self.catalog_type = catalog_type
+        self.catalog_config = catalog_config
+        self.strict_schema = strict_schema
         self.graph: rx.PyDiGraph = rx.PyDiGraph()
         self._node_index_map: Dict[str, int] = {}  # identifier -> rustworkx node index
         self._source_files: Set[str] = set()
         self._edge_set: Set[tuple] = set()  # (source, target) for dedup
         self._skipped_files: List[tuple[str, str]] = []  # (file_path, reason)
+        self._resolved_schema: Dict[str, Dict[str, str]] = {}  # accumulated schema
 
     def add_file(
         self,
@@ -82,7 +105,12 @@ class GraphBuilder:
         if self.sql_preprocessor:
             sql_content = self.sql_preprocessor(sql_content, file_path)
 
-        analyzer = LineageAnalyzer(
+        analyzer = LineageAnalyzer(
+            sql_content,
+            dialect=file_dialect,
+            no_star=self.no_star,
+            schema=self._resolved_schema if self._resolved_schema else None,
+        )
         results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
 
         # Print warnings for any skipped queries within the file
@@ -204,23 +232,37 @@ class GraphBuilder:
             entry_dialect = entry.dialect or dialect or self.dialect
             files_with_dialects.append((file_path, entry_dialect))
 
-  [17 removed lines whose content is not rendered in the source diff]
+        if not files_with_dialects:
+            return self
+
+        # Two-pass schema resolution
+        if self.resolve_schema:
+            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+            file_paths_only = [fp for fp, _ in files_with_dialects]
+            self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
+            if self.catalog_type:
+                self._resolved_schema = self._fill_schema_from_catalog(
+                    self._resolved_schema, file_paths_only, dialect
+                )
+            console.print(
+                f"[blue]Schema resolved for "
+                f"{len(self._resolved_schema)} table(s)[/blue]"
+            )
+
+        total = len(files_with_dialects)
+        description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+            transient=False,
+        ) as progress:
+            task = progress.add_task(description, total=total)
+            for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
+                console.print(f"Parsing file {i}/{total}: {file_path.name}")
+                self.add_file(file_path, file_dialect)
+                progress.advance(task)
 
         return self
 
@@ -244,8 +286,24 @@ class GraphBuilder:
         if not file_paths:
             return self
 
+        # Two-pass schema resolution: extract schema from all files first
+        if self.resolve_schema:
+            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+            self._resolved_schema = self._extract_schemas(file_paths, dialect)
+            if self.catalog_type:
+                self._resolved_schema = self._fill_schema_from_catalog(
+                    self._resolved_schema, file_paths, dialect
+                )
+            console.print(
+                f"[blue]Schema resolved for "
+                f"{len(self._resolved_schema)} table(s)[/blue]"
+            )
+
         if show_progress:
             total = len(file_paths)
+            description = (
+                "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+            )
             with Progress(
                 TextColumn("[progress.description]{task.description}"),
                 BarColumn(),
@@ -253,7 +311,7 @@ class GraphBuilder:
                 console=console,
                 transient=False,
             ) as progress:
-                task = progress.add_task(
+                task = progress.add_task(description, total=total)
                 for i, file_path in enumerate(file_paths, start=1):
                     console.print(f"Parsing file {i}/{total}: {file_path.name}")
                     self.add_file(file_path, dialect)
@@ -263,6 +321,129 @@ class GraphBuilder:
                 self.add_file(file_path, dialect)
         return self
 
+    def _extract_schemas(
+        self,
+        file_paths: List[Path],
+        dialect: Optional[str] = None,
+    ) -> Dict[str, Dict[str, str]]:
+        """Run schema extraction pass across all files.
+
+        Parses each file and extracts schema from CREATE TABLE/VIEW
+        statements without performing lineage analysis.
+
+        Args:
+            file_paths: SQL files to extract schema from
+            dialect: SQL dialect override
+
+        Returns:
+            Accumulated schema dict from all files
+        """
+        schema: Dict[str, Dict[str, str]] = {}
+        total = len(file_paths)
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+            transient=False,
+        ) as progress:
+            task = progress.add_task("Pass 1: Extracting schema", total=total)
+            for i, file_path in enumerate(file_paths, start=1):
+                console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+                file_dialect = dialect or self.dialect
+                try:
+                    sql_content = read_sql_file(file_path)
+                    if self.sql_preprocessor:
+                        sql_content = self.sql_preprocessor(sql_content, file_path)
+                    analyzer = LineageAnalyzer(
+                        sql_content,
+                        dialect=file_dialect,
+                        schema=schema,
+                        strict_schema=self.strict_schema,
+                    )
+                    file_schema = analyzer.extract_schema_only()
+                    schema.update(file_schema)
+                except SchemaResolutionError:
+                    raise
+                except Exception:
+                    # Schema extraction failures are non-fatal; the file
+                    # will be reported during the lineage pass if it also fails.
+                    pass
+                progress.advance(task)
+        return schema
+
+    def _fill_schema_from_catalog(
+        self,
+        schema: Dict[str, Dict[str, str]],
+        file_paths: List[Path],
+        dialect: Optional[str] = None,
+    ) -> Dict[str, Dict[str, str]]:
+        """Pull DDL from catalog for tables not yet in schema.
+
+        Extracts all table names referenced across the files, identifies
+        those missing from the schema, and fetches their DDL from the
+        configured catalog provider.
+
+        Args:
+            schema: Schema dict already populated from file extraction
+            file_paths: SQL files to scan for table references
+            dialect: SQL dialect override
+
+        Returns:
+            Updated schema dict with catalog-sourced entries added
+        """
+        from sqlglider.catalog import get_catalog
+
+        catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
+        if self.catalog_config:
+            catalog.configure(self.catalog_config)
+
+        # Collect all referenced table names across files
+        all_tables: Set[str] = set()
+        for file_path in file_paths:
+            file_dialect = dialect or self.dialect
+            try:
+                sql_content = read_sql_file(file_path)
+                if self.sql_preprocessor:
+                    sql_content = self.sql_preprocessor(sql_content, file_path)
+                analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
+                tables_results = analyzer.analyze_tables()
+                for result in tables_results:
+                    for table_info in result.tables:
+                        # Skip CTEs — they don't exist in catalogs
+                        from sqlglider.lineage.analyzer import ObjectType
+
+                        if table_info.object_type != ObjectType.CTE:
+                            all_tables.add(table_info.name)
+            except Exception:
+                pass
+
+        # Find tables missing from schema
+        missing = [t for t in all_tables if t not in schema]
+        if not missing:
+            return schema
+
+        console.print(
+            f"[blue]Pulling DDL from {self.catalog_type} "
+            f"for {len(missing)} table(s)...[/blue]"
+        )
+
+        ddl_results = catalog.get_ddl_batch(missing)
+        file_dialect = dialect or self.dialect
+        for table_name, ddl in ddl_results.items():
+            if ddl.startswith("ERROR:"):
+                console.print(
+                    f"[yellow]Warning:[/yellow] Could not pull DDL "
+                    f"for {table_name}: {ddl}"
+                )
+                continue
+            parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
+            for name, cols in parsed_schema.items():
+                if name not in schema:
+                    schema[name] = cols
+
+        return schema
+
     def _ensure_node(
         self,
         identifier: str,
@@ -343,6 +524,11 @@ class GraphBuilder:
         """Get mapping from node identifiers to rustworkx indices."""
        return self._node_index_map.copy()
 
+    @property
+    def resolved_schema(self) -> Dict[str, Dict[str, str]]:
+        """Get the resolved schema dictionary from schema extraction pass."""
+        return self._resolved_schema.copy()
+
     @property
     def skipped_files(self) -> List[tuple[str, str]]:
         """Get list of files that were skipped during graph building."""
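The two-pass behavior can also be driven directly from Python rather than through the CLI. A minimal sketch, using the constructor arguments and properties introduced in this diff; the file paths are hypothetical and no catalog is configured:

from pathlib import Path

from sqlglider.global_models import NodeFormat
from sqlglider.graph.builder import GraphBuilder

# Hypothetical input files; any directory of SQL scripts would do.
sql_files = sorted(Path("queries").glob("*.sql"))

builder = GraphBuilder(
    node_format=NodeFormat.QUALIFIED,
    dialect="spark",
    no_star=True,         # fail instead of emitting unresolved "*" columns
    resolve_schema=True,  # pass 1 collects schema, pass 2 runs lineage
    strict_schema=False,
)
builder.add_files(sql_files)

# Schema accumulated during pass 1, keyed by table name.
print(builder.resolved_schema)

graph = builder.build()

With no_star=True, files containing a SELECT * that still cannot be expanded after the schema pass would surface an error rather than a placeholder "*" column, which is the trade-off the new flag exists to make explicit.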
sqlglider/graph/formatters.py
ADDED

@@ -0,0 +1,98 @@
+"""Output formatters for resolved schema data."""
+
+import csv
+import json
+from io import StringIO
+from typing import Dict
+
+SchemaDict = Dict[str, Dict[str, str]]
+
+
+def format_schema_text(schema: SchemaDict) -> str:
+    """Format resolved schema as human-readable text.
+
+    Output format:
+        customers
+          id
+          name
+
+        schema.orders
+          order_id
+          customer_id
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        Text-formatted string.
+    """
+    lines: list[str] = []
+    for table_name in sorted(schema):
+        if lines:
+            lines.append("")
+        lines.append(table_name)
+        for column_name in sorted(schema[table_name]):
+            lines.append(f"  {column_name}")
+    return "\n".join(lines) + "\n" if lines else ""
+
+
+def format_schema_json(schema: SchemaDict) -> str:
+    """Format resolved schema as JSON.
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        JSON-formatted string.
+    """
+    sorted_schema = {k: schema[k] for k in sorted(schema)}
+    return json.dumps(sorted_schema, indent=2)
+
+
+def format_schema_csv(schema: SchemaDict) -> str:
+    """Format resolved schema as CSV.
+
+    Output format:
+        table,column,type
+        customers,id,UNKNOWN
+        customers,name,UNKNOWN
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        CSV-formatted string.
+    """
+    output = StringIO()
+    writer = csv.writer(output)
+    writer.writerow(["table", "column", "type"])
+    for table_name in sorted(schema):
+        for column_name in sorted(schema[table_name]):
+            writer.writerow([table_name, column_name, schema[table_name][column_name]])
+    return output.getvalue()
+
+
+def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
+    """Format resolved schema in the specified format.
+
+    Args:
+        schema: Resolved schema dictionary.
+        output_format: One of "text", "json", or "csv".
+
+    Returns:
+        Formatted string.
+
+    Raises:
+        ValueError: If output_format is not recognized.
+    """
+    formatters = {
+        "text": format_schema_text,
+        "json": format_schema_json,
+        "csv": format_schema_csv,
+    }
+    formatter = formatters.get(output_format)
+    if formatter is None:
+        raise ValueError(
+            f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
+        )
+    return formatter(schema)
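A quick illustration of the new formatter module, using an invented schema dictionary of the shape the builder produces:

from sqlglider.graph.formatters import format_schema

schema = {
    "sales.orders": {"order_id": "UNKNOWN", "customer_id": "UNKNOWN"},
    "customers": {"id": "UNKNOWN", "name": "UNKNOWN"},
}

print(format_schema(schema, "text"))  # indented table/column listing
print(format_schema(schema, "json"))  # tables sorted alphabetically
print(format_schema(schema, "csv"))   # header row: table,column,type

These are the same three formats accepted by the --dump-schema-format flag above, so the dumped file mirrors whatever format_schema returns.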
sqlglider/lineage/analyzer.py
CHANGED
@@ -11,6 +11,14 @@ from sqlglot.lineage import Node, lineage
 from sqlglider.global_models import AnalysisLevel
 
 
+class StarResolutionError(Exception):
+    """Raised when SELECT * cannot be resolved and no_star mode is enabled."""
+
+
+class SchemaResolutionError(Exception):
+    """Raised when a column's table cannot be identified and strict_schema is enabled."""
+
+
 class TableUsage(str, Enum):
     """How a table is used in a query."""
 
@@ -85,23 +93,40 @@ WarningCallback = Callable[[str], None]
 class LineageAnalyzer:
     """Analyze column and table lineage for SQL queries."""
 
-    def __init__(
+    def __init__(
+        self,
+        sql: str,
+        dialect: str = "spark",
+        no_star: bool = False,
+        schema: Optional[Dict[str, Dict[str, str]]] = None,
+        strict_schema: bool = False,
+    ):
         """
         Initialize the lineage analyzer.
 
         Args:
             sql: SQL query string to analyze (can contain multiple statements)
             dialect: SQL dialect (default: spark)
+            no_star: If True, fail when SELECT * cannot be resolved to columns
+            schema: Optional external schema mapping table names to column
+                definitions (e.g. {"table": {"col": "UNKNOWN"}}). File-derived
+                schema from CREATE statements will merge on top.
+            strict_schema: If True, fail during schema extraction when an
+                unqualified column cannot be attributed to a table (e.g.
+                in a multi-table SELECT without table qualifiers).
 
         Raises:
             ParseError: If the SQL cannot be parsed
         """
         self.sql = sql
         self.dialect = dialect
+        self._no_star = no_star
+        self._strict_schema = strict_schema
         self._skipped_queries: List[SkippedQuery] = []
         # File-scoped schema context for cross-statement lineage
         # Maps table/view names to their column definitions
-        self.
+        self._initial_schema: Dict[str, Dict[str, str]] = dict(schema) if schema else {}
+        self._file_schema: Dict[str, Dict[str, str]] = dict(self._initial_schema)
 
         try:
             # Parse all statements in the SQL string
@@ -126,6 +151,27 @@
         """Get list of queries that were skipped during analysis."""
         return self._skipped_queries.copy()
 
+    def get_extracted_schema(self) -> Dict[str, Dict[str, str]]:
+        """Return the accumulated file schema after analysis."""
+        return dict(self._file_schema)
+
+    def extract_schema_only(self) -> Dict[str, Dict[str, str]]:
+        """Parse all statements and extract schema without running lineage.
+
+        Iterates through all expressions, extracting schema from:
+        1. CREATE TABLE/VIEW AS SELECT statements (existing behavior)
+        2. DQL statements by inferring table columns from qualified column
+           references (e.g., ``SELECT t.id FROM table t`` infers
+           ``table: {id: UNKNOWN}``)
+
+        Returns the accumulated schema dict.
+        """
+        self._file_schema = dict(self._initial_schema)
+        for expr in self.expressions:
+            self._extract_schema_from_statement(expr)
+            self._extract_schema_from_dql(expr)
+        return dict(self._file_schema)
+
     def get_output_columns(self) -> List[str]:
         """
         Extract all output column names from the query with full qualification.
@@ -171,6 +217,12 @@
                     columns.append(qualified_name)
                     self._column_mapping[qualified_name] = star_col
                 if not columns:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            f"SELECT * could not be resolved to columns "
+                            f"for target table '{target_table}'. "
+                            f"Provide schema context or avoid using SELECT *."
+                        )
                     # Fallback: can't resolve *, use * as column name
                     qualified_name = f"{target_table}.*"
                     columns.append(qualified_name)
@@ -200,6 +252,12 @@
                     columns.append(qualified_name)
                     self._column_mapping[qualified_name] = col
                 if not qualified_star_cols:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            f"SELECT {source_table}.* could not be resolved "
+                            f"to columns for target table '{target_table}'. "
+                            f"Provide schema context or avoid using SELECT *."
+                        )
                     # Fallback: can't resolve t.*, use * as column name
                     qualified_name = f"{target_table}.*"
                     columns.append(qualified_name)
@@ -226,6 +284,23 @@
         # Get the first SELECT for table resolution (handles UNION case)
         first_select = self._get_first_select(select_node)
         for projection in projections:
+            # Handle SELECT * in DQL context
+            if isinstance(projection, exp.Star):
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    for star_col in star_columns:
+                        columns.append(star_col)
+                        self._column_mapping[star_col] = star_col
+                if not columns:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            "SELECT * could not be resolved to columns. "
+                            "Provide schema context or avoid using SELECT *."
+                        )
+                    columns.append("*")
+                    self._column_mapping["*"] = "*"
+                continue
+
             # Get the underlying expression (unwrap alias if present)
             if isinstance(projection, exp.Alias):
                 source_expr = projection.this
@@ -236,6 +311,30 @@
             column_name = None
             lineage_name = None
 
+            # Handle table-qualified star in DQL context (e.g., t.*)
+            if isinstance(source_expr, exp.Column) and isinstance(
+                source_expr.this, exp.Star
+            ):
+                source_table = source_expr.table
+                dql_star_cols: List[str] = []
+                if source_table and first_select:
+                    dql_star_cols = self._resolve_qualified_star(
+                        source_table, first_select
+                    )
+                for col in dql_star_cols:
+                    columns.append(col)
+                    self._column_mapping[col] = col
+                if not dql_star_cols:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            f"SELECT {source_table}.* could not be resolved "
+                            f"to columns. "
+                            f"Provide schema context or avoid using SELECT *."
+                        )
+                    columns.append("*")
+                    self._column_mapping["*"] = "*"
+                continue
+
             # Try to extract fully qualified name
             if isinstance(source_expr, exp.Column):
                 # Get table and column parts
@@ -367,7 +466,7 @@
         """
         results = []
         self._skipped_queries = []  # Reset skipped queries for this analysis
-        self._file_schema =
+        self._file_schema = dict(self._initial_schema)  # Reset to external schema
 
         for query_index, expr, preview in self._iterate_queries(table_filter):
             # Temporarily swap self.expr to analyze this query
@@ -407,6 +506,8 @@
                         level=level,
                     )
                 )
+            except StarResolutionError:
+                raise
             except ValueError as e:
                 # Unsupported statement type - track it and continue
                 stmt_type = self._get_statement_type(expr)
@@ -615,6 +716,12 @@
         if isinstance(target, exp.Table):
             return (self._get_qualified_table_name(target), ObjectType.UNKNOWN)
 
+        # CACHE TABLE
+        elif isinstance(self.expr, exp.Cache):
+            target = self.expr.this
+            if isinstance(target, exp.Table):
+                return (self._get_qualified_table_name(target), ObjectType.TABLE)
+
         # DELETE FROM table
         elif isinstance(self.expr, exp.Delete):
             target = self.expr.this
@@ -706,6 +813,10 @@
         elif isinstance(self.expr, exp.Drop):
             return table_node is self.expr.this
 
+        # For CACHE TABLE, the target is self.expr.this
+        elif isinstance(self.expr, exp.Cache):
+            return table_node is self.expr.this
+
         return False
 
     def _analyze_column_lineage_internal(
@@ -741,7 +852,12 @@
 
         lineage_items = []
         # Get SQL for current expression only (not full multi-query SQL)
-
+        # For CACHE TABLE, pass just the SELECT since sqlglot.lineage doesn't
+        # natively understand CACHE statements
+        if isinstance(self.expr, exp.Cache) and self.expr.expression:
+            current_query_sql = self.expr.expression.sql(dialect=self.dialect)
+        else:
+            current_query_sql = self.expr.sql(dialect=self.dialect)
 
         for col in columns_to_analyze:
             try:
@@ -889,6 +1005,7 @@
             "Drop": f"DROP {getattr(target_expr, 'kind', '')}".strip(),
             "Alter": "ALTER",
             "Truncate": "TRUNCATE",
+            "Cache": "CACHE TABLE",
             "Command": "COMMAND",
         }
 
@@ -943,6 +1060,17 @@
         ):
             return (target_name, select_node)
 
+        # Check for CACHE TABLE AS SELECT
+        elif isinstance(self.expr, exp.Cache):
+            target = self.expr.this
+            if isinstance(target, exp.Table):
+                target_name = self._get_qualified_table_name(target)
+                select_node = self.expr.expression
+                if isinstance(
+                    select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+                ):
+                    return (target_name, select_node)
+
         # Check for MERGE statement
         elif isinstance(self.expr, exp.Merge):
             target = self.expr.this
@@ -1339,6 +1467,119 @@
         # Store with UNKNOWN type - SQLGlot only needs column names for expansion
         self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}
 
+    def _extract_schema_from_dql(self, expr: exp.Expression) -> None:
+        """Infer table schemas from column references in DQL.
+
+        Walks SELECT statements and extracts table-column mappings from:
+        1. Qualified column references (e.g., ``c.id``) — always resolved.
+        2. Unqualified column references (e.g., ``id``) — only when the
+           SELECT has exactly one real table source (no joins), making
+           attribution unambiguous.
+
+        Aliases are resolved back to actual table names. CTEs and subquery
+        aliases are skipped since they don't represent external tables.
+
+        Args:
+            expr: The SQL expression to extract schema from.
+        """
+        # Find all SELECT nodes in the expression tree
+        selects = list(expr.find_all(exp.Select))
+        if not selects:
+            return
+
+        for select_node in selects:
+            # Build alias-to-table mapping for this SELECT scope
+            alias_map: Dict[str, str] = {}
+            cte_names: Set[str] = set()
+
+            # Collect CTE names so we can skip them
+            parent = select_node
+            while parent:
+                with_clause = parent.args.get("with")
+                if with_clause:
+                    for cte in with_clause.expressions:
+                        if isinstance(cte, exp.CTE) and cte.alias:
+                            cte_names.add(cte.alias.lower())
+                parent = parent.parent if hasattr(parent, "parent") else None
+
+            # Collect subquery aliases so we can skip them too
+            subquery_aliases: Set[str] = set()
+            from_clause = select_node.args.get("from")
+            if from_clause and isinstance(from_clause, exp.From):
+                source = from_clause.this
+                if isinstance(source, exp.Subquery) and source.alias:
+                    subquery_aliases.add(source.alias.lower())
+            for join in select_node.find_all(exp.Join):
+                if isinstance(join.this, exp.Subquery) and join.this.alias:
+                    subquery_aliases.add(join.this.alias.lower())
+
+            # Build alias map from FROM/JOIN table references
+            real_tables: list[str] = []  # track non-CTE, non-subquery tables
+            for table_ref in select_node.find_all(exp.Table):
+                # Skip tables inside nested selects — they belong to inner scope
+                if table_ref.find_ancestor(exp.Select) is not select_node:
+                    continue
+                qualified = self._get_qualified_table_name(table_ref)
+                if table_ref.alias:
+                    alias_map[table_ref.alias.lower()] = qualified
+                else:
+                    alias_map[table_ref.name.lower()] = qualified
+                # Track real tables (not CTEs or subqueries)
+                if (
+                    qualified.lower() not in cte_names
+                    and qualified.lower() not in subquery_aliases
+                ):
+                    real_tables.append(qualified)
+
+            # Determine single-table target for unqualified columns
+            # Only set when exactly one real table source exists (unambiguous)
+            single_table: Optional[str] = (
+                real_tables[0] if len(real_tables) == 1 else None
+            )
+
+            # Walk all column references in this SELECT
+            for column in select_node.find_all(exp.Column):
+                if isinstance(column.this, exp.Star):
+                    continue
+
+                table_ref_name = column.table
+                col_name = column.name
+
+                if table_ref_name:
+                    # Qualified column — resolve alias to actual table
+                    ref_lower = table_ref_name.lower()
+
+                    # Skip CTE and subquery references
+                    if ref_lower in cte_names or ref_lower in subquery_aliases:
+                        continue
+
+                    actual_table = alias_map.get(ref_lower)
+                    if not actual_table:
+                        continue
+
+                    # Skip if it resolved to a CTE or subquery
+                    if (
+                        actual_table.lower() in cte_names
+                        or actual_table.lower() in subquery_aliases
+                    ):
+                        continue
+                else:
+                    # Unqualified column — attribute to single table if unambiguous
+                    if not single_table:
+                        if self._strict_schema:
+                            preview = select_node.sql(dialect=self.dialect)[:80]
+                            raise SchemaResolutionError(
+                                f"Cannot resolve table for unqualified column "
+                                f"'{col_name}' in multi-table query: {preview}"
+                            )
+                        continue
+                    actual_table = single_table
+
+                if actual_table not in self._file_schema:
+                    self._file_schema[actual_table] = {}
+                if col_name not in self._file_schema[actual_table]:
+                    self._file_schema[actual_table][col_name] = "UNKNOWN"
+
     def _extract_columns_from_select(
         self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
     ) -> List[str]:
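A minimal sketch of the new analyzer options, using an invented query and schema; only the constructor arguments and methods shown in this diff are used:

from sqlglider.global_models import AnalysisLevel
from sqlglider.lineage.analyzer import LineageAnalyzer, StarResolutionError

sql = "CREATE TABLE reporting.out AS SELECT * FROM raw.events"
external_schema = {"raw.events": {"event_id": "UNKNOWN", "ts": "UNKNOWN"}}

# With schema context supplied, the analyzer should be able to expand the
# star into the known columns instead of a "*" placeholder.
analyzer = LineageAnalyzer(sql, dialect="spark", no_star=True, schema=external_schema)
results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

# The same query with no schema and no_star=True raises instead of
# silently falling back to a "*" column.
try:
    LineageAnalyzer(sql, dialect="spark", no_star=True).analyze_queries(
        level=AnalysisLevel.COLUMN
    )
except StarResolutionError as err:
    print(f"unresolved star: {err}")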
sqlglider/utils/config.py
CHANGED
@@ -60,6 +60,11 @@ class ConfigSettings(BaseModel):
     catalog_type: Optional[str] = None
     ddl_folder: Optional[str] = None
     catalog: Optional[CatalogConfig] = None
+    no_star: Optional[bool] = None
+    resolve_schema: Optional[bool] = None
+    dump_schema: Optional[str] = None
+    dump_schema_format: Optional[str] = None
+    strict_schema: Optional[bool] = None
 
 
 def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:
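These settings mirror the new CLI flags one-to-one. A sketch of populating them on the pydantic model directly, assuming the remaining ConfigSettings fields keep their defaults (that assumption is not confirmed by this diff):

from sqlglider.utils.config import ConfigSettings

settings = ConfigSettings(
    no_star=True,
    resolve_schema=True,
    dump_schema="resolved_schema.json",  # hypothetical output path
    dump_schema_format="json",
    strict_schema=False,
)
# Pydantic v2 model, per the model_dump() usage elsewhere in this release.
print(settings.model_dump(exclude_none=True))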
sqlglider/utils/schema.py
ADDED

@@ -0,0 +1,62 @@
+"""Schema utilities for parsing DDL into schema dictionaries."""
+
+from typing import Dict
+
+from sqlglot import exp, parse
+
+
+def parse_ddl_to_schema(ddl: str, dialect: str = "spark") -> Dict[str, Dict[str, str]]:
+    """Extract table schemas from DDL statements.
+
+    Parses CREATE TABLE/VIEW statements and extracts column names.
+    Only column names are needed — types are stored as "UNKNOWN" since
+    SQLGlot's lineage only uses names for star expansion.
+
+    Args:
+        ddl: SQL string containing one or more CREATE TABLE/VIEW statements
+        dialect: SQL dialect for parsing
+
+    Returns:
+        Schema dict mapping table names to column definitions,
+        e.g. {"my_table": {"id": "UNKNOWN", "name": "UNKNOWN"}}
+    """
+    schema: Dict[str, Dict[str, str]] = {}
+    expressions = parse(ddl, dialect=dialect)
+
+    for expr in expressions:
+        if expr is None:
+            continue
+        if not isinstance(expr, (exp.Create,)):
+            continue
+
+        # Get target table name
+        target = expr.this
+        if isinstance(target, exp.Schema):
+            # Schema node wraps the table and column definitions
+            columns = [
+                col.name for col in target.expressions if isinstance(col, exp.ColumnDef)
+            ]
+            target = target.this
+        else:
+            columns = []
+
+        if not isinstance(target, exp.Table):
+            continue
+
+        table_name = _get_qualified_name(target)
+
+        if columns:
+            schema[table_name] = {col: "UNKNOWN" for col in columns}
+
+    return schema
+
+
+def _get_qualified_name(table: exp.Table) -> str:
+    """Build a qualified table name from a SQLGlot Table expression."""
+    parts = []
+    if table.catalog:
+        parts.append(table.catalog)
+    if table.db:
+        parts.append(table.db)
+    parts.append(table.name)
+    return ".".join(parts)

{sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/WHEEL
File without changes

{sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/entry_points.txt
File without changes

{sql_glider-0.1.8.dist-info → sql_glider-0.1.12.dist-info}/licenses/LICENSE
File without changes
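As a closing illustration of the new sqlglider/utils/schema.py helper, a sketch with invented DDL; this is the same parsing path the builder uses for catalog-sourced CREATE statements:

from sqlglider.utils.schema import parse_ddl_to_schema

ddl = """
CREATE TABLE analytics.daily_orders (
    order_id BIGINT,
    order_date DATE,
    amount DECIMAL(10, 2)
)
"""

schema = parse_ddl_to_schema(ddl, dialect="spark")
# Expected shape: column names keyed by the qualified table name, with
# every type recorded as "UNKNOWN", e.g.
# {"analytics.daily_orders": {"order_id": "UNKNOWN",
#  "order_date": "UNKNOWN", "amount": "UNKNOWN"}}
print(schema)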