sql-glider 0.1.8__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff compares publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.8
+Version: 0.1.12
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -1,6 +1,6 @@
 sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
-sqlglider/_version.py,sha256=Zaz3s9gl_rzsS46-ymJOALojMxviW77EJq_agE8knLk,704
-sqlglider/cli.py,sha256=9sweHRVLk2iBSzCzT2Gcj8y1g1XKzq26iApQsMaFbx4,51786
+sqlglider/_version.py,sha256=cEPXLUpTV7EzqolnyXW8nf8Hr6IVyBji9CzB6Cq_Ar0,706
+sqlglider/cli.py,sha256=qEDLZ1a6yr-BzrtkBsJEHPByMmRERsGKZsYFTn9kaMY,55624
 sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
 sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
 sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,13 +11,14 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
 sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
 sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
 sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
-sqlglider/graph/builder.py,sha256=0ASA749b_FkcKMEVWiijAJP1QKt54ICY7VXxUuo3-y0,11953
+sqlglider/graph/builder.py,sha256=o0SnH5eWUUPpzRSdsdCXEva3QTlhLDagJulJ2hRFQqA,19895
+sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
 sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
 sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
 sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
 sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9ZlgTQ,2781
 sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
-sqlglider/lineage/analyzer.py,sha256=Vfh0g9xVEEUkQ87KZlCcZVPltDJ6Uos67PBtDyQ_i8U,64679
+sqlglider/lineage/analyzer.py,sha256=-LUeVNEsjfEWoKAJ2qVIiJO1noqwae4jQkwkkkVbAT8,75950
 sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
 sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
 sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
@@ -25,10 +26,11 @@ sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g
 sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
 sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
 sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
-sqlglider/utils/config.py,sha256=iNJgSXFw3pmL2MCdvW3SJp4X2T3AQP2QyQuXIXT-6H0,4761
+sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
 sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
-sql_glider-0.1.8.dist-info/METADATA,sha256=ZXlzMjglSWRsOtaW8GIxJa62UKreUHR270WCRpGih-Q,28445
-sql_glider-0.1.8.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sql_glider-0.1.8.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
-sql_glider-0.1.8.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sql_glider-0.1.8.dist-info/RECORD,,
+sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
+sql_glider-0.1.12.dist-info/METADATA,sha256=73yuoWaAE5DKE9wobDXxbERSP2Pq-WpdqCnaswAa9fQ,28446
+sql_glider-0.1.12.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sql_glider-0.1.12.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+sql_glider-0.1.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sql_glider-0.1.12.dist-info/RECORD,,
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.1.8'
-__version_tuple__ = version_tuple = (0, 1, 8)
+__version__ = version = '0.1.12'
+__version_tuple__ = version_tuple = (0, 1, 12)

 __commit_id__ = commit_id = None
sqlglider/cli.py CHANGED
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
 from typing_extensions import Annotated

 from sqlglider.global_models import AnalysisLevel, NodeFormat
-from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
 from sqlglider.lineage.formatters import (
     CsvFormatter,
     JsonFormatter,
@@ -166,6 +166,11 @@ def lineage(
         exists=True,
         help="Path to variables file (JSON or YAML)",
     ),
+    no_star: bool = typer.Option(
+        False,
+        "--no-star",
+        help="Fail if SELECT * cannot be resolved to actual columns",
+    ),
 ) -> None:
     """
     Analyze column or table lineage for a SQL file.
@@ -207,6 +212,7 @@ def lineage(
    level_str = level or config.level or "column"
    output_format = output_format or config.output_format or "text"
    templater = templater or config.templater  # None means no templating
+    no_star = no_star or config.no_star or False
    # Validate and convert level to enum
    try:
        analysis_level = AnalysisLevel(level_str)
@@ -261,7 +267,7 @@
    )

    # Create analyzer
-    analyzer = LineageAnalyzer(sql, dialect=dialect)
+    analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)

    # Unified lineage analysis (handles both single and multi-query files)
    results = analyzer.analyze_queries(
@@ -990,6 +996,40 @@ def graph_build(
        exists=True,
        help="Path to variables file (JSON or YAML)",
    ),
+    no_star: bool = typer.Option(
+        False,
+        "--no-star",
+        help="Fail if SELECT * cannot be resolved to actual columns",
+    ),
+    resolve_schema: bool = typer.Option(
+        False,
+        "--resolve-schema",
+        help="Extract schema from all files before lineage analysis, "
+        "enabling cross-file star resolution",
+    ),
+    catalog_type: Optional[str] = typer.Option(
+        None,
+        "--catalog-type",
+        "-c",
+        help="Catalog provider for pulling DDL of tables not found in files "
+        "(requires --resolve-schema). E.g. 'databricks'",
+    ),
+    dump_schema: Optional[Path] = typer.Option(
+        None,
+        "--dump-schema",
+        help="Dump resolved schema to file (requires --resolve-schema)",
+    ),
+    dump_schema_format: Optional[str] = typer.Option(
+        None,
+        "--dump-schema-format",
+        help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
+    ),
+    strict_schema: bool = typer.Option(
+        False,
+        "--strict-schema",
+        help="Fail if any column's table cannot be identified during schema extraction "
+        "(requires --resolve-schema)",
+    ),
 ) -> None:
    """
    Build a lineage graph from SQL files.
@@ -1024,6 +1064,38 @@
    config = load_config()
    dialect = dialect or config.dialect or "spark"
    templater = templater or config.templater  # None means no templating
+    no_star = no_star or config.no_star or False
+    resolve_schema = resolve_schema or config.resolve_schema or False
+    strict_schema = strict_schema or config.strict_schema or False
+
+    if strict_schema and not resolve_schema:
+        err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
+        raise typer.Exit(1)
+
+    if catalog_type and not resolve_schema:
+        err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
+        raise typer.Exit(1)
+
+    # Resolve dump_schema options from config
+    dump_schema = dump_schema or (
+        Path(config.dump_schema) if config.dump_schema else None
+    )
+    dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
+
+    if dump_schema and not resolve_schema:
+        err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
+        raise typer.Exit(1)
+
+    if dump_schema_format not in ("text", "json", "csv"):
+        err_console.print(
+            f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
+            "Use 'text', 'json', or 'csv'."
+        )
+        raise typer.Exit(1)
+
+    # Only inherit catalog_type from config when resolve_schema is active
+    if resolve_schema and not catalog_type:
+        catalog_type = config.catalog_type

    # Validate and convert node format to enum
    try:
@@ -1076,10 +1148,22 @@
        sql_preprocessor = _preprocess

    try:
+        # Build catalog config from config file if available
+        catalog_config_dict = None
+        if catalog_type and config.catalog:
+            provider_config = getattr(config.catalog, catalog_type, None)
+            if provider_config:
+                catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
        builder = GraphBuilder(
            node_format=node_format_enum,
            dialect=dialect,
            sql_preprocessor=sql_preprocessor,
+            no_star=no_star,
+            resolve_schema=resolve_schema,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config_dict,
+            strict_schema=strict_schema,
        )

        # Process manifest if provided
@@ -1102,6 +1186,17 @@
            raise typer.Exit(1)
        builder.add_files(all_files, dialect=dialect)

+        # Dump resolved schema if requested
+        if dump_schema:
+            from sqlglider.graph.formatters import format_schema
+
+            schema_content = format_schema(builder.resolved_schema, dump_schema_format)
+            dump_schema.write_text(schema_content, encoding="utf-8")
+            console.print(
+                f"[green]Schema dumped to {dump_schema} "
+                f"({len(builder.resolved_schema)} table(s))[/green]"
+            )
+
        # Build and save graph
        graph = builder.build()
        save_graph(graph, output)
@@ -1111,6 +1206,10 @@
            f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
        )

+    except SchemaResolutionError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
    except FileNotFoundError as e:
        err_console.print(f"[red]Error:[/red] {e}")
        raise typer.Exit(1)
sqlglider/graph/builder.py CHANGED
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
     LineageGraph,
     Manifest,
 )
-from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
 from sqlglider.utils.file_utils import read_sql_file
+from sqlglider.utils.schema import parse_ddl_to_schema

 console = Console(stderr=True)

@@ -33,6 +34,11 @@ class GraphBuilder:
        node_format: NodeFormat = NodeFormat.QUALIFIED,
        dialect: str = "spark",
        sql_preprocessor: Optional[SqlPreprocessor] = None,
+        no_star: bool = False,
+        resolve_schema: bool = False,
+        catalog_type: Optional[str] = None,
+        catalog_config: Optional[Dict[str, object]] = None,
+        strict_schema: bool = False,
    ):
        """
        Initialize the graph builder.
@@ -43,15 +49,32 @@
            sql_preprocessor: Optional function to preprocess SQL before analysis.
                Takes (sql: str, file_path: Path) and returns processed SQL.
                Useful for templating (e.g., Jinja2 rendering).
+            no_star: If True, fail when SELECT * cannot be resolved to columns
+            resolve_schema: If True, run a schema extraction pass across all
+                files before lineage analysis so that schema from any file is
+                available when analyzing every other file.
+            catalog_type: Optional catalog provider name (e.g. "databricks").
+                When set together with resolve_schema, DDL is pulled from the
+                catalog for tables whose schema could not be inferred from files.
+            catalog_config: Optional provider-specific configuration dict
+                passed to the catalog's configure() method.
+            strict_schema: If True, fail during schema extraction when an
+                unqualified column cannot be attributed to a table.
        """
        self.node_format = node_format
        self.dialect = dialect
        self.sql_preprocessor = sql_preprocessor
+        self.no_star = no_star
+        self.resolve_schema = resolve_schema
+        self.catalog_type = catalog_type
+        self.catalog_config = catalog_config
+        self.strict_schema = strict_schema
        self.graph: rx.PyDiGraph = rx.PyDiGraph()
        self._node_index_map: Dict[str, int] = {}  # identifier -> rustworkx node index
        self._source_files: Set[str] = set()
        self._edge_set: Set[tuple] = set()  # (source, target) for dedup
        self._skipped_files: List[tuple[str, str]] = []  # (file_path, reason)
+        self._resolved_schema: Dict[str, Dict[str, str]] = {}  # accumulated schema

    def add_file(
        self,
@@ -82,7 +105,12 @@
        if self.sql_preprocessor:
            sql_content = self.sql_preprocessor(sql_content, file_path)

-        analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
+        analyzer = LineageAnalyzer(
+            sql_content,
+            dialect=file_dialect,
+            no_star=self.no_star,
+            schema=self._resolved_schema if self._resolved_schema else None,
+        )
        results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

        # Print warnings for any skipped queries within the file
@@ -204,23 +232,37 @@
            entry_dialect = entry.dialect or dialect or self.dialect
            files_with_dialects.append((file_path, entry_dialect))

-        # Process with progress
-        if files_with_dialects:
-            total = len(files_with_dialects)
-            with Progress(
-                TextColumn("[progress.description]{task.description}"),
-                BarColumn(),
-                TaskProgressColumn(),
-                console=console,
-                transient=False,
-            ) as progress:
-                task = progress.add_task("Parsing", total=total)
-                for i, (file_path, file_dialect) in enumerate(
-                    files_with_dialects, start=1
-                ):
-                    console.print(f"Parsing file {i}/{total}: {file_path.name}")
-                    self.add_file(file_path, file_dialect)
-                    progress.advance(task)
+        if not files_with_dialects:
+            return self
+
+        # Two-pass schema resolution
+        if self.resolve_schema:
+            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+            file_paths_only = [fp for fp, _ in files_with_dialects]
+            self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
+            if self.catalog_type:
+                self._resolved_schema = self._fill_schema_from_catalog(
+                    self._resolved_schema, file_paths_only, dialect
+                )
+            console.print(
+                f"[blue]Schema resolved for "
+                f"{len(self._resolved_schema)} table(s)[/blue]"
+            )
+
+        total = len(files_with_dialects)
+        description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+            transient=False,
+        ) as progress:
+            task = progress.add_task(description, total=total)
+            for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
+                console.print(f"Parsing file {i}/{total}: {file_path.name}")
+                self.add_file(file_path, file_dialect)
+                progress.advance(task)

        return self

@@ -244,8 +286,24 @@
        if not file_paths:
            return self

+        # Two-pass schema resolution: extract schema from all files first
+        if self.resolve_schema:
+            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+            self._resolved_schema = self._extract_schemas(file_paths, dialect)
+            if self.catalog_type:
+                self._resolved_schema = self._fill_schema_from_catalog(
+                    self._resolved_schema, file_paths, dialect
+                )
+            console.print(
+                f"[blue]Schema resolved for "
+                f"{len(self._resolved_schema)} table(s)[/blue]"
+            )
+
        if show_progress:
            total = len(file_paths)
+            description = (
+                "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+            )
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
@@ -253,7 +311,7 @@
                console=console,
                transient=False,
            ) as progress:
-                task = progress.add_task("Parsing", total=total)
+                task = progress.add_task(description, total=total)
                for i, file_path in enumerate(file_paths, start=1):
                    console.print(f"Parsing file {i}/{total}: {file_path.name}")
                    self.add_file(file_path, dialect)
@@ -263,6 +321,129 @@
                self.add_file(file_path, dialect)
        return self

+    def _extract_schemas(
+        self,
+        file_paths: List[Path],
+        dialect: Optional[str] = None,
+    ) -> Dict[str, Dict[str, str]]:
+        """Run schema extraction pass across all files.
+
+        Parses each file and extracts schema from CREATE TABLE/VIEW
+        statements without performing lineage analysis.
+
+        Args:
+            file_paths: SQL files to extract schema from
+            dialect: SQL dialect override
+
+        Returns:
+            Accumulated schema dict from all files
+        """
+        schema: Dict[str, Dict[str, str]] = {}
+        total = len(file_paths)
+        with Progress(
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+            transient=False,
+        ) as progress:
+            task = progress.add_task("Pass 1: Extracting schema", total=total)
+            for i, file_path in enumerate(file_paths, start=1):
+                console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+                file_dialect = dialect or self.dialect
+                try:
+                    sql_content = read_sql_file(file_path)
+                    if self.sql_preprocessor:
+                        sql_content = self.sql_preprocessor(sql_content, file_path)
+                    analyzer = LineageAnalyzer(
+                        sql_content,
+                        dialect=file_dialect,
+                        schema=schema,
+                        strict_schema=self.strict_schema,
+                    )
+                    file_schema = analyzer.extract_schema_only()
+                    schema.update(file_schema)
+                except SchemaResolutionError:
+                    raise
+                except Exception:
+                    # Schema extraction failures are non-fatal; the file
+                    # will be reported during the lineage pass if it also fails.
+                    pass
+                progress.advance(task)
+        return schema
+
+    def _fill_schema_from_catalog(
+        self,
+        schema: Dict[str, Dict[str, str]],
+        file_paths: List[Path],
+        dialect: Optional[str] = None,
+    ) -> Dict[str, Dict[str, str]]:
+        """Pull DDL from catalog for tables not yet in schema.
+
+        Extracts all table names referenced across the files, identifies
+        those missing from the schema, and fetches their DDL from the
+        configured catalog provider.
+
+        Args:
+            schema: Schema dict already populated from file extraction
+            file_paths: SQL files to scan for table references
+            dialect: SQL dialect override
+
+        Returns:
+            Updated schema dict with catalog-sourced entries added
+        """
+        from sqlglider.catalog import get_catalog
+
+        catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
+        if self.catalog_config:
+            catalog.configure(self.catalog_config)
+
+        # Collect all referenced table names across files
+        all_tables: Set[str] = set()
+        for file_path in file_paths:
+            file_dialect = dialect or self.dialect
+            try:
+                sql_content = read_sql_file(file_path)
+                if self.sql_preprocessor:
+                    sql_content = self.sql_preprocessor(sql_content, file_path)
+                analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
+                tables_results = analyzer.analyze_tables()
+                for result in tables_results:
+                    for table_info in result.tables:
+                        # Skip CTEs — they don't exist in catalogs
+                        from sqlglider.lineage.analyzer import ObjectType
+
+                        if table_info.object_type != ObjectType.CTE:
+                            all_tables.add(table_info.name)
+            except Exception:
+                pass
+
+        # Find tables missing from schema
+        missing = [t for t in all_tables if t not in schema]
+        if not missing:
+            return schema
+
+        console.print(
+            f"[blue]Pulling DDL from {self.catalog_type} "
+            f"for {len(missing)} table(s)...[/blue]"
+        )
+
+        ddl_results = catalog.get_ddl_batch(missing)
+        file_dialect = dialect or self.dialect
+        for table_name, ddl in ddl_results.items():
+            if ddl.startswith("ERROR:"):
+                console.print(
+                    f"[yellow]Warning:[/yellow] Could not pull DDL "
+                    f"for {table_name}: {ddl}"
+                )
+                continue
+            parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
+            for name, cols in parsed_schema.items():
+                if name not in schema:
+                    schema[name] = cols
+
+        return schema
+
    def _ensure_node(
        self,
        identifier: str,
@@ -343,6 +524,11 @@
        """Get mapping from node identifiers to rustworkx indices."""
        return self._node_index_map.copy()

+    @property
+    def resolved_schema(self) -> Dict[str, Dict[str, str]]:
+        """Get the resolved schema dictionary from schema extraction pass."""
+        return self._resolved_schema.copy()
+
    @property
    def skipped_files(self) -> List[tuple[str, str]]:
        """Get list of files that were skipped during graph building."""
sqlglider/graph/formatters.py ADDED
@@ -0,0 +1,98 @@
+"""Output formatters for resolved schema data."""
+
+import csv
+import json
+from io import StringIO
+from typing import Dict
+
+SchemaDict = Dict[str, Dict[str, str]]
+
+
+def format_schema_text(schema: SchemaDict) -> str:
+    """Format resolved schema as human-readable text.
+
+    Output format:
+        customers
+          id
+          name
+
+        schema.orders
+          order_id
+          customer_id
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        Text-formatted string.
+    """
+    lines: list[str] = []
+    for table_name in sorted(schema):
+        if lines:
+            lines.append("")
+        lines.append(table_name)
+        for column_name in sorted(schema[table_name]):
+            lines.append(f"  {column_name}")
+    return "\n".join(lines) + "\n" if lines else ""
+
+
+def format_schema_json(schema: SchemaDict) -> str:
+    """Format resolved schema as JSON.
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        JSON-formatted string.
+    """
+    sorted_schema = {k: schema[k] for k in sorted(schema)}
+    return json.dumps(sorted_schema, indent=2)
+
+
+def format_schema_csv(schema: SchemaDict) -> str:
+    """Format resolved schema as CSV.
+
+    Output format:
+        table,column,type
+        customers,id,UNKNOWN
+        customers,name,UNKNOWN
+
+    Args:
+        schema: Resolved schema dictionary mapping table names to column dicts.
+
+    Returns:
+        CSV-formatted string.
+    """
+    output = StringIO()
+    writer = csv.writer(output)
+    writer.writerow(["table", "column", "type"])
+    for table_name in sorted(schema):
+        for column_name in sorted(schema[table_name]):
+            writer.writerow([table_name, column_name, schema[table_name][column_name]])
+    return output.getvalue()
+
+
+def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
+    """Format resolved schema in the specified format.
+
+    Args:
+        schema: Resolved schema dictionary.
+        output_format: One of "text", "json", or "csv".
+
+    Returns:
+        Formatted string.
+
+    Raises:
+        ValueError: If output_format is not recognized.
+    """
+    formatters = {
+        "text": format_schema_text,
+        "json": format_schema_json,
+        "csv": format_schema_csv,
+    }
+    formatter = formatters.get(output_format)
+    if formatter is None:
+        raise ValueError(
+            f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
+        )
+    return formatter(schema)
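For reference, a quick illustration of the three output formats on a hand-built schema dict; the table and column names are invented for the example:

from sqlglider.graph.formatters import format_schema

# Invented schema: table name -> {column name: type}.
schema = {
    "customers": {"id": "UNKNOWN", "name": "UNKNOWN"},
    "analytics.orders": {"order_id": "UNKNOWN", "customer_id": "UNKNOWN"},
}

print(format_schema(schema, "text"))  # indented table/column listing
print(format_schema(schema, "json"))  # sorted JSON object
print(format_schema(schema, "csv"))   # table,column,type rows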
sqlglider/lineage/analyzer.py CHANGED
@@ -11,6 +11,14 @@ from sqlglot.lineage import Node, lineage
 from sqlglider.global_models import AnalysisLevel


+class StarResolutionError(Exception):
+    """Raised when SELECT * cannot be resolved and no_star mode is enabled."""
+
+
+class SchemaResolutionError(Exception):
+    """Raised when a column's table cannot be identified and strict_schema is enabled."""
+
+
 class TableUsage(str, Enum):
     """How a table is used in a query."""

@@ -85,23 +93,40 @@ WarningCallback = Callable[[str], None]
 class LineageAnalyzer:
     """Analyze column and table lineage for SQL queries."""

-    def __init__(self, sql: str, dialect: str = "spark"):
+    def __init__(
+        self,
+        sql: str,
+        dialect: str = "spark",
+        no_star: bool = False,
+        schema: Optional[Dict[str, Dict[str, str]]] = None,
+        strict_schema: bool = False,
+    ):
        """
        Initialize the lineage analyzer.

        Args:
            sql: SQL query string to analyze (can contain multiple statements)
            dialect: SQL dialect (default: spark)
+            no_star: If True, fail when SELECT * cannot be resolved to columns
+            schema: Optional external schema mapping table names to column
+                definitions (e.g. {"table": {"col": "UNKNOWN"}}). File-derived
+                schema from CREATE statements will merge on top.
+            strict_schema: If True, fail during schema extraction when an
+                unqualified column cannot be attributed to a table (e.g.
+                in a multi-table SELECT without table qualifiers).

        Raises:
            ParseError: If the SQL cannot be parsed
        """
        self.sql = sql
        self.dialect = dialect
+        self._no_star = no_star
+        self._strict_schema = strict_schema
        self._skipped_queries: List[SkippedQuery] = []
        # File-scoped schema context for cross-statement lineage
        # Maps table/view names to their column definitions
-        self._file_schema: Dict[str, Dict[str, str]] = {}
+        self._initial_schema: Dict[str, Dict[str, str]] = dict(schema) if schema else {}
+        self._file_schema: Dict[str, Dict[str, str]] = dict(self._initial_schema)

        try:
            # Parse all statements in the SQL string
@@ -126,6 +151,27 @@
        """Get list of queries that were skipped during analysis."""
        return self._skipped_queries.copy()

+    def get_extracted_schema(self) -> Dict[str, Dict[str, str]]:
+        """Return the accumulated file schema after analysis."""
+        return dict(self._file_schema)
+
+    def extract_schema_only(self) -> Dict[str, Dict[str, str]]:
+        """Parse all statements and extract schema without running lineage.
+
+        Iterates through all expressions, extracting schema from:
+        1. CREATE TABLE/VIEW AS SELECT statements (existing behavior)
+        2. DQL statements by inferring table columns from qualified column
+           references (e.g., ``SELECT t.id FROM table t`` infers
+           ``table: {id: UNKNOWN}``)
+
+        Returns the accumulated schema dict.
+        """
+        self._file_schema = dict(self._initial_schema)
+        for expr in self.expressions:
+            self._extract_schema_from_statement(expr)
+            self._extract_schema_from_dql(expr)
+        return dict(self._file_schema)
+
    def get_output_columns(self) -> List[str]:
        """
        Extract all output column names from the query with full qualification.
@@ -171,6 +217,12 @@
                columns.append(qualified_name)
                self._column_mapping[qualified_name] = star_col
            if not columns:
+                if self._no_star:
+                    raise StarResolutionError(
+                        f"SELECT * could not be resolved to columns "
+                        f"for target table '{target_table}'. "
+                        f"Provide schema context or avoid using SELECT *."
+                    )
                # Fallback: can't resolve *, use * as column name
                qualified_name = f"{target_table}.*"
                columns.append(qualified_name)
@@ -200,6 +252,12 @@
                columns.append(qualified_name)
                self._column_mapping[qualified_name] = col
            if not qualified_star_cols:
+                if self._no_star:
+                    raise StarResolutionError(
+                        f"SELECT {source_table}.* could not be resolved "
+                        f"to columns for target table '{target_table}'. "
+                        f"Provide schema context or avoid using SELECT *."
+                    )
                # Fallback: can't resolve t.*, use * as column name
                qualified_name = f"{target_table}.*"
                columns.append(qualified_name)
@@ -226,6 +284,23 @@
        # Get the first SELECT for table resolution (handles UNION case)
        first_select = self._get_first_select(select_node)
        for projection in projections:
+            # Handle SELECT * in DQL context
+            if isinstance(projection, exp.Star):
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    for star_col in star_columns:
+                        columns.append(star_col)
+                        self._column_mapping[star_col] = star_col
+                if not columns:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            "SELECT * could not be resolved to columns. "
+                            "Provide schema context or avoid using SELECT *."
+                        )
+                    columns.append("*")
+                    self._column_mapping["*"] = "*"
+                continue
+
            # Get the underlying expression (unwrap alias if present)
            if isinstance(projection, exp.Alias):
                source_expr = projection.this
@@ -236,6 +311,30 @@
            column_name = None
            lineage_name = None

+            # Handle table-qualified star in DQL context (e.g., t.*)
+            if isinstance(source_expr, exp.Column) and isinstance(
+                source_expr.this, exp.Star
+            ):
+                source_table = source_expr.table
+                dql_star_cols: List[str] = []
+                if source_table and first_select:
+                    dql_star_cols = self._resolve_qualified_star(
+                        source_table, first_select
+                    )
+                for col in dql_star_cols:
+                    columns.append(col)
+                    self._column_mapping[col] = col
+                if not dql_star_cols:
+                    if self._no_star:
+                        raise StarResolutionError(
+                            f"SELECT {source_table}.* could not be resolved "
+                            f"to columns. "
+                            f"Provide schema context or avoid using SELECT *."
+                        )
+                    columns.append("*")
+                    self._column_mapping["*"] = "*"
+                continue
+
            # Try to extract fully qualified name
            if isinstance(source_expr, exp.Column):
                # Get table and column parts
@@ -367,7 +466,7 @@
        """
        results = []
        self._skipped_queries = []  # Reset skipped queries for this analysis
-        self._file_schema = {}  # Reset file schema for this analysis run
+        self._file_schema = dict(self._initial_schema)  # Reset to external schema

        for query_index, expr, preview in self._iterate_queries(table_filter):
            # Temporarily swap self.expr to analyze this query
@@ -407,6 +506,8 @@
                        level=level,
                    )
                )
+            except StarResolutionError:
+                raise
            except ValueError as e:
                # Unsupported statement type - track it and continue
                stmt_type = self._get_statement_type(expr)
@@ -615,6 +716,12 @@
            if isinstance(target, exp.Table):
                return (self._get_qualified_table_name(target), ObjectType.UNKNOWN)

+        # CACHE TABLE
+        elif isinstance(self.expr, exp.Cache):
+            target = self.expr.this
+            if isinstance(target, exp.Table):
+                return (self._get_qualified_table_name(target), ObjectType.TABLE)
+
        # DELETE FROM table
        elif isinstance(self.expr, exp.Delete):
            target = self.expr.this
@@ -706,6 +813,10 @@
        elif isinstance(self.expr, exp.Drop):
            return table_node is self.expr.this

+        # For CACHE TABLE, the target is self.expr.this
+        elif isinstance(self.expr, exp.Cache):
+            return table_node is self.expr.this
+
        return False

    def _analyze_column_lineage_internal(
@@ -741,7 +852,12 @@

        lineage_items = []
        # Get SQL for current expression only (not full multi-query SQL)
-        current_query_sql = self.expr.sql(dialect=self.dialect)
+        # For CACHE TABLE, pass just the SELECT since sqlglot.lineage doesn't
+        # natively understand CACHE statements
+        if isinstance(self.expr, exp.Cache) and self.expr.expression:
+            current_query_sql = self.expr.expression.sql(dialect=self.dialect)
+        else:
+            current_query_sql = self.expr.sql(dialect=self.dialect)

        for col in columns_to_analyze:
            try:
@@ -889,6 +1005,7 @@
            "Drop": f"DROP {getattr(target_expr, 'kind', '')}".strip(),
            "Alter": "ALTER",
            "Truncate": "TRUNCATE",
+            "Cache": "CACHE TABLE",
            "Command": "COMMAND",
        }
@@ -943,6 +1060,17 @@
        ):
            return (target_name, select_node)

+        # Check for CACHE TABLE AS SELECT
+        elif isinstance(self.expr, exp.Cache):
+            target = self.expr.this
+            if isinstance(target, exp.Table):
+                target_name = self._get_qualified_table_name(target)
+                select_node = self.expr.expression
+                if isinstance(
+                    select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+                ):
+                    return (target_name, select_node)
+
        # Check for MERGE statement
        elif isinstance(self.expr, exp.Merge):
            target = self.expr.this
@@ -1339,6 +1467,119 @@
        # Store with UNKNOWN type - SQLGlot only needs column names for expansion
        self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}

+    def _extract_schema_from_dql(self, expr: exp.Expression) -> None:
+        """Infer table schemas from column references in DQL.
+
+        Walks SELECT statements and extracts table-column mappings from:
+        1. Qualified column references (e.g., ``c.id``) — always resolved.
+        2. Unqualified column references (e.g., ``id``) — only when the
+           SELECT has exactly one real table source (no joins), making
+           attribution unambiguous.
+
+        Aliases are resolved back to actual table names. CTEs and subquery
+        aliases are skipped since they don't represent external tables.
+
+        Args:
+            expr: The SQL expression to extract schema from.
+        """
+        # Find all SELECT nodes in the expression tree
+        selects = list(expr.find_all(exp.Select))
+        if not selects:
+            return
+
+        for select_node in selects:
+            # Build alias-to-table mapping for this SELECT scope
+            alias_map: Dict[str, str] = {}
+            cte_names: Set[str] = set()
+
+            # Collect CTE names so we can skip them
+            parent = select_node
+            while parent:
+                with_clause = parent.args.get("with")
+                if with_clause:
+                    for cte in with_clause.expressions:
+                        if isinstance(cte, exp.CTE) and cte.alias:
+                            cte_names.add(cte.alias.lower())
+                parent = parent.parent if hasattr(parent, "parent") else None
+
+            # Collect subquery aliases so we can skip them too
+            subquery_aliases: Set[str] = set()
+            from_clause = select_node.args.get("from")
+            if from_clause and isinstance(from_clause, exp.From):
+                source = from_clause.this
+                if isinstance(source, exp.Subquery) and source.alias:
+                    subquery_aliases.add(source.alias.lower())
+            for join in select_node.find_all(exp.Join):
+                if isinstance(join.this, exp.Subquery) and join.this.alias:
+                    subquery_aliases.add(join.this.alias.lower())
+
+            # Build alias map from FROM/JOIN table references
+            real_tables: list[str] = []  # track non-CTE, non-subquery tables
+            for table_ref in select_node.find_all(exp.Table):
+                # Skip tables inside nested selects — they belong to inner scope
+                if table_ref.find_ancestor(exp.Select) is not select_node:
+                    continue
+                qualified = self._get_qualified_table_name(table_ref)
+                if table_ref.alias:
+                    alias_map[table_ref.alias.lower()] = qualified
+                else:
+                    alias_map[table_ref.name.lower()] = qualified
+                # Track real tables (not CTEs or subqueries)
+                if (
+                    qualified.lower() not in cte_names
+                    and qualified.lower() not in subquery_aliases
+                ):
+                    real_tables.append(qualified)
+
+            # Determine single-table target for unqualified columns
+            # Only set when exactly one real table source exists (unambiguous)
+            single_table: Optional[str] = (
+                real_tables[0] if len(real_tables) == 1 else None
+            )
+
+            # Walk all column references in this SELECT
+            for column in select_node.find_all(exp.Column):
+                if isinstance(column.this, exp.Star):
+                    continue
+
+                table_ref_name = column.table
+                col_name = column.name
+
+                if table_ref_name:
+                    # Qualified column — resolve alias to actual table
+                    ref_lower = table_ref_name.lower()
+
+                    # Skip CTE and subquery references
+                    if ref_lower in cte_names or ref_lower in subquery_aliases:
+                        continue
+
+                    actual_table = alias_map.get(ref_lower)
+                    if not actual_table:
+                        continue
+
+                    # Skip if it resolved to a CTE or subquery
+                    if (
+                        actual_table.lower() in cte_names
+                        or actual_table.lower() in subquery_aliases
+                    ):
+                        continue
+                else:
+                    # Unqualified column — attribute to single table if unambiguous
+                    if not single_table:
+                        if self._strict_schema:
+                            preview = select_node.sql(dialect=self.dialect)[:80]
+                            raise SchemaResolutionError(
+                                f"Cannot resolve table for unqualified column "
+                                f"'{col_name}' in multi-table query: {preview}"
+                            )
+                        continue
+                    actual_table = single_table
+
+                if actual_table not in self._file_schema:
+                    self._file_schema[actual_table] = {}
+                if col_name not in self._file_schema[actual_table]:
+                    self._file_schema[actual_table][col_name] = "UNKNOWN"
+
    def _extract_columns_from_select(
        self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
    ) -> List[str]:
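The analyzer can also be exercised directly. A small sketch, using placeholder SQL and table names: seeding an external schema lets the star expand, while no_star=True turns an unresolvable star into a StarResolutionError rather than a silent '*' column.

from sqlglider.global_models import AnalysisLevel
from sqlglider.lineage.analyzer import LineageAnalyzer, StarResolutionError

# Placeholder SQL and table names for illustration.
sql = "CREATE TABLE reporting.daily AS SELECT * FROM raw.events"
schema = {"raw.events": {"event_id": "UNKNOWN", "ts": "UNKNOWN"}}

analyzer = LineageAnalyzer(sql, dialect="spark", no_star=True, schema=schema)
try:
    results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)
except StarResolutionError as exc:
    # Raised only when the star cannot be expanded (e.g. no schema provided).
    print(f"Unresolved star: {exc}")

# Schema-only pass, as used by GraphBuilder's first pass.
print(analyzer.extract_schema_only())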
sqlglider/utils/config.py CHANGED
@@ -60,6 +60,11 @@ class ConfigSettings(BaseModel):
     catalog_type: Optional[str] = None
     ddl_folder: Optional[str] = None
     catalog: Optional[CatalogConfig] = None
+    no_star: Optional[bool] = None
+    resolve_schema: Optional[bool] = None
+    dump_schema: Optional[str] = None
+    dump_schema_format: Optional[str] = None
+    strict_schema: Optional[bool] = None


 def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:
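These settings mirror the new CLI flags so they can live in the project config. A minimal sketch of the accepted values, constructing ConfigSettings directly for illustration and assuming the remaining settings keep their defaults:

from sqlglider.utils.config import ConfigSettings

# New schema-resolution settings added in this release; all default to None.
settings = ConfigSettings(
    no_star=True,              # fail on unresolvable SELECT *
    resolve_schema=True,       # run the schema-extraction pass
    dump_schema="schema.csv",  # write the resolved schema to this file
    dump_schema_format="csv",  # 'text' (default), 'json', or 'csv'
    strict_schema=False,       # don't fail on ambiguous unqualified columns
)
print(settings.model_dump(exclude_none=True))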
sqlglider/utils/schema.py ADDED
@@ -0,0 +1,62 @@
+"""Schema utilities for parsing DDL into schema dictionaries."""
+
+from typing import Dict
+
+from sqlglot import exp, parse
+
+
+def parse_ddl_to_schema(ddl: str, dialect: str = "spark") -> Dict[str, Dict[str, str]]:
+    """Extract table schemas from DDL statements.
+
+    Parses CREATE TABLE/VIEW statements and extracts column names.
+    Only column names are needed — types are stored as "UNKNOWN" since
+    SQLGlot's lineage only uses names for star expansion.
+
+    Args:
+        ddl: SQL string containing one or more CREATE TABLE/VIEW statements
+        dialect: SQL dialect for parsing
+
+    Returns:
+        Schema dict mapping table names to column definitions,
+        e.g. {"my_table": {"id": "UNKNOWN", "name": "UNKNOWN"}}
+    """
+    schema: Dict[str, Dict[str, str]] = {}
+    expressions = parse(ddl, dialect=dialect)
+
+    for expr in expressions:
+        if expr is None:
+            continue
+        if not isinstance(expr, (exp.Create,)):
+            continue
+
+        # Get target table name
+        target = expr.this
+        if isinstance(target, exp.Schema):
+            # Schema node wraps the table and column definitions
+            columns = [
+                col.name for col in target.expressions if isinstance(col, exp.ColumnDef)
+            ]
+            target = target.this
+        else:
+            columns = []
+
+        if not isinstance(target, exp.Table):
+            continue
+
+        table_name = _get_qualified_name(target)
+
+        if columns:
+            schema[table_name] = {col: "UNKNOWN" for col in columns}
+
+    return schema
+
+
+def _get_qualified_name(table: exp.Table) -> str:
+    """Build a qualified table name from a SQLGlot Table expression."""
+    parts = []
+    if table.catalog:
+        parts.append(table.catalog)
+    if table.db:
+        parts.append(table.db)
+    parts.append(table.name)
+    return ".".join(parts)