sql_glider-0.1.11-py3-none-any.whl → sql_glider-0.1.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sql-glider
- Version: 0.1.11
+ Version: 0.1.13
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -1,6 +1,6 @@
  sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
- sqlglider/_version.py,sha256=0-Ruc52ECccw_8Ef0d7jMkzrb8fkobUkZLqGGvcm1ik,706
- sqlglider/cli.py,sha256=DMCMw5dxDHB2MuxBXuJMNeDSlIGAfKDz1Renp0YwGGM,52224
+ sqlglider/_version.py,sha256=Xz5RLbyPcCHHXte393JYfUy4Dt7uaeWyrGVw9SmJ0eg,706
+ sqlglider/cli.py,sha256=FDTjRmor_cQlcwfiD_uHTrQao2sMf3ev21IUyUSt7Qs,56401
  sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
  sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
  sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,13 +11,14 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
  sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
  sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
  sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
- sqlglider/graph/builder.py,sha256=HdkMcuZkxdEFO0CXMAaqGQSyhvzuaIQTaFscQdO2GSI,12146
+ sqlglider/graph/builder.py,sha256=suxc_hymHvHnkgltgXqwwIoxlay7zhy1Enbs6HNC3m8,20107
+ sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
  sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
  sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
  sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
  sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9ZlgTQ,2781
  sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
- sqlglider/lineage/analyzer.py,sha256=gjJtJU-sxFokoSVxcHpcIdbP3H8GD_KQaubbbcG0UCM,68982
+ sqlglider/lineage/analyzer.py,sha256=08pFR5aGFFPhSbRW6EqiX2d3mp91v-orcs6dm_T1FJg,76484
  sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
  sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
  sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
@@ -25,10 +26,11 @@ sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g
  sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
  sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
  sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
- sqlglider/utils/config.py,sha256=rbbiDCWA_h29vgWJZ1z3zQmGcei0KcxhTPcymSCYeFo,4796
+ sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
  sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
- sql_glider-0.1.11.dist-info/METADATA,sha256=JxQakiYUUzvldsEzjdXQLV63ud07Gw_bcZ2BIi29nuQ,28446
- sql_glider-0.1.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- sql_glider-0.1.11.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
- sql_glider-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sql_glider-0.1.11.dist-info/RECORD,,
+ sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
+ sql_glider-0.1.13.dist-info/METADATA,sha256=z-utivkULH1BBhygNpLcWN9UdU1DbwfF3EzUhGtWXes,28446
+ sql_glider-0.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ sql_glider-0.1.13.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+ sql_glider-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sql_glider-0.1.13.dist-info/RECORD,,
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.11'
- __version_tuple__ = version_tuple = (0, 1, 11)
+ __version__ = version = '0.1.13'
+ __version_tuple__ = version_tuple = (0, 1, 13)

  __commit_id__ = commit_id = None
sqlglider/cli.py CHANGED
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
  from typing_extensions import Annotated

  from sqlglider.global_models import AnalysisLevel, NodeFormat
- from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
  from sqlglider.lineage.formatters import (
  CsvFormatter,
  JsonFormatter,
@@ -1001,6 +1001,35 @@ def graph_build(
  "--no-star",
  help="Fail if SELECT * cannot be resolved to actual columns",
  ),
+ resolve_schema: bool = typer.Option(
+ False,
+ "--resolve-schema",
+ help="Extract schema from all files before lineage analysis, "
+ "enabling cross-file star resolution",
+ ),
+ catalog_type: Optional[str] = typer.Option(
+ None,
+ "--catalog-type",
+ "-c",
+ help="Catalog provider for pulling DDL of tables not found in files "
+ "(requires --resolve-schema). E.g. 'databricks'",
+ ),
+ dump_schema: Optional[Path] = typer.Option(
+ None,
+ "--dump-schema",
+ help="Dump resolved schema to file (requires --resolve-schema)",
+ ),
+ dump_schema_format: Optional[str] = typer.Option(
+ None,
+ "--dump-schema-format",
+ help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
+ ),
+ strict_schema: bool = typer.Option(
+ False,
+ "--strict-schema",
+ help="Fail if any column's table cannot be identified during schema extraction "
+ "(requires --resolve-schema)",
+ ),
  ) -> None:
  """
  Build a lineage graph from SQL files.
@@ -1036,6 +1065,37 @@ def graph_build(
  dialect = dialect or config.dialect or "spark"
  templater = templater or config.templater # None means no templating
  no_star = no_star or config.no_star or False
+ resolve_schema = resolve_schema or config.resolve_schema or False
+ strict_schema = strict_schema or config.strict_schema or False
+
+ if strict_schema and not resolve_schema:
+ err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
+ raise typer.Exit(1)
+
+ if catalog_type and not resolve_schema:
+ err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
+ raise typer.Exit(1)
+
+ # Resolve dump_schema options from config
+ dump_schema = dump_schema or (
+ Path(config.dump_schema) if config.dump_schema else None
+ )
+ dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
+
+ if dump_schema and not resolve_schema:
+ err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
+ raise typer.Exit(1)
+
+ if dump_schema_format not in ("text", "json", "csv"):
+ err_console.print(
+ f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
+ "Use 'text', 'json', or 'csv'."
+ )
+ raise typer.Exit(1)
+
+ # Only inherit catalog_type from config when resolve_schema is active
+ if resolve_schema and not catalog_type:
+ catalog_type = config.catalog_type

  # Validate and convert node format to enum
  try:
@@ -1088,32 +1148,75 @@ def graph_build(
  sql_preprocessor = _preprocess

  try:
+ # Build catalog config from config file if available
+ catalog_config_dict = None
+ if catalog_type and config.catalog:
+ provider_config = getattr(config.catalog, catalog_type, None)
+ if provider_config:
+ catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
  builder = GraphBuilder(
  node_format=node_format_enum,
  dialect=dialect,
  sql_preprocessor=sql_preprocessor,
  no_star=no_star,
+ resolve_schema=resolve_schema,
+ catalog_type=catalog_type,
+ catalog_config=catalog_config_dict,
+ strict_schema=strict_schema,
  )

- # Process manifest if provided
- if manifest:
- builder.add_manifest(manifest, dialect=dialect)
-
- # Process paths - collect all files first for progress tracking
+ # Collect file paths for schema extraction
+ path_files: list[Path] = []
  if paths:
- all_files: list[Path] = []
  for path in paths:
  if path.is_dir():
  pattern = f"**/{glob_pattern}" if recursive else glob_pattern
- all_files.extend(
+ path_files.extend(
  f for f in sorted(path.glob(pattern)) if f.is_file()
  )
  elif path.is_file():
- all_files.append(path)
+ path_files.append(path)
  else:
  err_console.print(f"[red]Error:[/red] Path not found: {path}")
  raise typer.Exit(1)
- builder.add_files(all_files, dialect=dialect)
+
+ manifest_files: list[Path] = []
+ if manifest:
+ from sqlglider.graph.models import Manifest
+
+ manifest_data = Manifest.from_csv(manifest)
+ base_dir = manifest.parent
+ for entry in manifest_data.entries:
+ file_path = Path(entry.file_path)
+ if not file_path.is_absolute():
+ file_path = (base_dir / entry.file_path).resolve()
+ manifest_files.append(file_path)
+
+ # Extract schema upfront if requested, then dump before graph building
+ all_files = manifest_files + path_files
+ if resolve_schema and all_files:
+ builder.extract_schemas(all_files, dialect=dialect)
+
+ if dump_schema:
+ from sqlglider.graph.formatters import format_schema
+
+ schema_content = format_schema(
+ builder.resolved_schema, dump_schema_format
+ )
+ dump_schema.write_text(schema_content, encoding="utf-8")
+ console.print(
+ f"[green]Schema dumped to {dump_schema} "
+ f"({len(builder.resolved_schema)} table(s))[/green]"
+ )
+
+ # Process manifest if provided
+ if manifest:
+ builder.add_manifest(manifest, dialect=dialect)
+
+ # Process path-based files
+ if path_files:
+ builder.add_files(path_files, dialect=dialect)

  # Build and save graph
  graph = builder.build()
@@ -1124,6 +1227,10 @@ def graph_build(
  f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
  )

+ except SchemaResolutionError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
  except FileNotFoundError as e:
  err_console.print(f"[red]Error:[/red] {e}")
  raise typer.Exit(1)
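
The new options wire up a two-pass flow: schema is extracted from every input file (and optionally a catalog) before any lineage is analyzed. Below is a minimal sketch of driving the same flow through GraphBuilder directly, assuming sql-glider 0.1.13 is installed; the file paths are placeholders and the NodeFormat member is chosen arbitrarily because the valid values are not visible in this diff.

    from pathlib import Path

    from sqlglider.global_models import NodeFormat
    from sqlglider.graph.builder import GraphBuilder

    files = [Path("models/customers.sql"), Path("models/orders.sql")]  # placeholder inputs

    builder = GraphBuilder(
        node_format=next(iter(NodeFormat)),  # pick whichever output format you actually need
        dialect="spark",
        resolve_schema=True,    # enable the two-pass behavior
        strict_schema=False,    # True raises SchemaResolutionError on ambiguous columns
    )
    builder.extract_schemas(files, dialect="spark")  # pass 1: schema from all files
    builder.add_files(files, dialect="spark")        # pass 2: lineage with the shared schema
    graph = builder.build()
    print(f"{len(builder.resolved_schema)} table(s) resolved")
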
sqlglider/graph/builder.py CHANGED
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
  LineageGraph,
  Manifest,
  )
- from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
  from sqlglider.utils.file_utils import read_sql_file
+ from sqlglider.utils.schema import parse_ddl_to_schema

  console = Console(stderr=True)

@@ -34,6 +35,10 @@ class GraphBuilder:
  dialect: str = "spark",
  sql_preprocessor: Optional[SqlPreprocessor] = None,
  no_star: bool = False,
+ resolve_schema: bool = False,
+ catalog_type: Optional[str] = None,
+ catalog_config: Optional[Dict[str, object]] = None,
+ strict_schema: bool = False,
  ):
  """
  Initialize the graph builder.
@@ -45,16 +50,31 @@ class GraphBuilder:
  Takes (sql: str, file_path: Path) and returns processed SQL.
  Useful for templating (e.g., Jinja2 rendering).
  no_star: If True, fail when SELECT * cannot be resolved to columns
+ resolve_schema: If True, run a schema extraction pass across all
+ files before lineage analysis so that schema from any file is
+ available when analyzing every other file.
+ catalog_type: Optional catalog provider name (e.g. "databricks").
+ When set together with resolve_schema, DDL is pulled from the
+ catalog for tables whose schema could not be inferred from files.
+ catalog_config: Optional provider-specific configuration dict
+ passed to the catalog's configure() method.
+ strict_schema: If True, fail during schema extraction when an
+ unqualified column cannot be attributed to a table.
  """
  self.node_format = node_format
  self.dialect = dialect
  self.sql_preprocessor = sql_preprocessor
  self.no_star = no_star
+ self.resolve_schema = resolve_schema
+ self.catalog_type = catalog_type
+ self.catalog_config = catalog_config
+ self.strict_schema = strict_schema
  self.graph: rx.PyDiGraph = rx.PyDiGraph()
  self._node_index_map: Dict[str, int] = {} # identifier -> rustworkx node index
  self._source_files: Set[str] = set()
  self._edge_set: Set[tuple] = set() # (source, target) for dedup
  self._skipped_files: List[tuple[str, str]] = [] # (file_path, reason)
+ self._resolved_schema: Dict[str, Dict[str, str]] = {} # accumulated schema

  def add_file(
  self,
@@ -86,7 +106,10 @@ class GraphBuilder:
  sql_content = self.sql_preprocessor(sql_content, file_path)

  analyzer = LineageAnalyzer(
- sql_content, dialect=file_dialect, no_star=self.no_star
+ sql_content,
+ dialect=file_dialect,
+ no_star=self.no_star,
+ schema=self._resolved_schema if self._resolved_schema else None,
  )
  results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

@@ -209,23 +232,28 @@ class GraphBuilder:
  entry_dialect = entry.dialect or dialect or self.dialect
  files_with_dialects.append((file_path, entry_dialect))

- # Process with progress
- if files_with_dialects:
- total = len(files_with_dialects)
- with Progress(
- TextColumn("[progress.description]{task.description}"),
- BarColumn(),
- TaskProgressColumn(),
- console=console,
- transient=False,
- ) as progress:
- task = progress.add_task("Parsing", total=total)
- for i, (file_path, file_dialect) in enumerate(
- files_with_dialects, start=1
- ):
- console.print(f"Parsing file {i}/{total}: {file_path.name}")
- self.add_file(file_path, file_dialect)
- progress.advance(task)
+ if not files_with_dialects:
+ return self
+
+ # Two-pass schema resolution (skip if already resolved)
+ if self.resolve_schema and not self._resolved_schema:
+ file_paths_only = [fp for fp, _ in files_with_dialects]
+ self.extract_schemas(file_paths_only, dialect)
+
+ total = len(files_with_dialects)
+ description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+ with Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ TaskProgressColumn(),
+ console=console,
+ transient=False,
+ ) as progress:
+ task = progress.add_task(description, total=total)
+ for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
+ console.print(f"Parsing file {i}/{total}: {file_path.name}")
+ self.add_file(file_path, file_dialect)
+ progress.advance(task)

  return self

@@ -249,8 +277,15 @@ class GraphBuilder:
  if not file_paths:
  return self

+ # Two-pass schema resolution (skip if already resolved)
+ if self.resolve_schema and not self._resolved_schema:
+ self.extract_schemas(file_paths, dialect)
+
  if show_progress:
  total = len(file_paths)
+ description = (
+ "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+ )
  with Progress(
  TextColumn("[progress.description]{task.description}"),
  BarColumn(),
@@ -258,7 +293,7 @@ class GraphBuilder:
  console=console,
  transient=False,
  ) as progress:
- task = progress.add_task("Parsing", total=total)
+ task = progress.add_task(description, total=total)
  for i, file_path in enumerate(file_paths, start=1):
  console.print(f"Parsing file {i}/{total}: {file_path.name}")
  self.add_file(file_path, dialect)
@@ -268,6 +303,157 @@ class GraphBuilder:
  self.add_file(file_path, dialect)
  return self

+ def extract_schemas(
+ self,
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Run schema extraction pass and optionally fill from catalog.
+
+ Call this before add_files/add_manifest to resolve schema upfront.
+ The resolved schema is stored internally and also returned.
+
+ Args:
+ file_paths: SQL files to extract schema from
+ dialect: SQL dialect override
+
+ Returns:
+ Resolved schema dict
+ """
+ console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+ self._resolved_schema = self._extract_schemas(file_paths, dialect)
+ if self.catalog_type:
+ self._resolved_schema = self._fill_schema_from_catalog(
+ self._resolved_schema, file_paths, dialect
+ )
+ console.print(
+ f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
+ )
+ return self._resolved_schema.copy()
+
+ def _extract_schemas(
+ self,
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Run schema extraction pass across all files.
+
+ Parses each file and extracts schema from CREATE TABLE/VIEW
+ statements without performing lineage analysis.
+
+ Args:
+ file_paths: SQL files to extract schema from
+ dialect: SQL dialect override
+
+ Returns:
+ Accumulated schema dict from all files
+ """
+ schema: Dict[str, Dict[str, str]] = {}
+ total = len(file_paths)
+ with Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ TaskProgressColumn(),
+ console=console,
+ transient=False,
+ ) as progress:
+ task = progress.add_task("Pass 1: Extracting schema", total=total)
+ for i, file_path in enumerate(file_paths, start=1):
+ console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+ file_dialect = dialect or self.dialect
+ try:
+ sql_content = read_sql_file(file_path)
+ if self.sql_preprocessor:
+ sql_content = self.sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(
+ sql_content,
+ dialect=file_dialect,
+ schema=schema,
+ strict_schema=self.strict_schema,
+ )
+ file_schema = analyzer.extract_schema_only()
+ schema.update(file_schema)
+ except SchemaResolutionError:
+ raise
+ except Exception:
+ # Schema extraction failures are non-fatal; the file
+ # will be reported during the lineage pass if it also fails.
+ pass
+ progress.advance(task)
+ return schema
+
+ def _fill_schema_from_catalog(
+ self,
+ schema: Dict[str, Dict[str, str]],
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Pull DDL from catalog for tables not yet in schema.
+
+ Extracts all table names referenced across the files, identifies
+ those missing from the schema, and fetches their DDL from the
+ configured catalog provider.
+
+ Args:
+ schema: Schema dict already populated from file extraction
+ file_paths: SQL files to scan for table references
+ dialect: SQL dialect override
+
+ Returns:
+ Updated schema dict with catalog-sourced entries added
+ """
+ from sqlglider.catalog import get_catalog
+
+ catalog = get_catalog(self.catalog_type) # type: ignore[arg-type]
+ if self.catalog_config:
+ catalog.configure(self.catalog_config)
+
+ # Collect all referenced table names across files
+ all_tables: Set[str] = set()
+ for file_path in file_paths:
+ file_dialect = dialect or self.dialect
+ try:
+ sql_content = read_sql_file(file_path)
+ if self.sql_preprocessor:
+ sql_content = self.sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
+ tables_results = analyzer.analyze_tables()
+ for result in tables_results:
+ for table_info in result.tables:
+ # Skip CTEs — they don't exist in catalogs
+ from sqlglider.lineage.analyzer import ObjectType
+
+ if table_info.object_type != ObjectType.CTE:
+ all_tables.add(table_info.name)
+ except Exception:
+ pass
+
+ # Find tables missing from schema
+ missing = [t for t in all_tables if t not in schema]
+ if not missing:
+ return schema
+
+ console.print(
+ f"[blue]Pulling DDL from {self.catalog_type} "
+ f"for {len(missing)} table(s)...[/blue]"
+ )
+
+ ddl_results = catalog.get_ddl_batch(missing)
+ file_dialect = dialect or self.dialect
+ for table_name, ddl in ddl_results.items():
+ if ddl.startswith("ERROR:"):
+ console.print(
+ f"[yellow]Warning:[/yellow] Could not pull DDL "
+ f"for {table_name}: {ddl}"
+ )
+ continue
+ parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
+ for name, cols in parsed_schema.items():
+ if name not in schema:
+ schema[name] = cols
+
+ return schema
+
  def _ensure_node(
  self,
  identifier: str,
@@ -348,6 +534,11 @@ class GraphBuilder:
  """Get mapping from node identifiers to rustworkx indices."""
  return self._node_index_map.copy()

+ @property
+ def resolved_schema(self) -> Dict[str, Dict[str, str]]:
+ """Get the resolved schema dictionary from schema extraction pass."""
+ return self._resolved_schema.copy()
+
  @property
  def skipped_files(self) -> List[tuple[str, str]]:
  """Get list of files that were skipped during graph building."""
sqlglider/graph/formatters.py ADDED
@@ -0,0 +1,98 @@
+ """Output formatters for resolved schema data."""
+
+ import csv
+ import json
+ from io import StringIO
+ from typing import Dict
+
+ SchemaDict = Dict[str, Dict[str, str]]
+
+
+ def format_schema_text(schema: SchemaDict) -> str:
+ """Format resolved schema as human-readable text.
+
+ Output format:
+ customers
+ id
+ name
+
+ schema.orders
+ order_id
+ customer_id
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ Text-formatted string.
+ """
+ lines: list[str] = []
+ for table_name in sorted(schema):
+ if lines:
+ lines.append("")
+ lines.append(table_name)
+ for column_name in sorted(schema[table_name]):
+ lines.append(f" {column_name}")
+ return "\n".join(lines) + "\n" if lines else ""
+
+
+ def format_schema_json(schema: SchemaDict) -> str:
+ """Format resolved schema as JSON.
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ JSON-formatted string.
+ """
+ sorted_schema = {k: schema[k] for k in sorted(schema)}
+ return json.dumps(sorted_schema, indent=2)
+
+
+ def format_schema_csv(schema: SchemaDict) -> str:
+ """Format resolved schema as CSV.
+
+ Output format:
+ table,column,type
+ customers,id,UNKNOWN
+ customers,name,UNKNOWN
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ CSV-formatted string.
+ """
+ output = StringIO()
+ writer = csv.writer(output)
+ writer.writerow(["table", "column", "type"])
+ for table_name in sorted(schema):
+ for column_name in sorted(schema[table_name]):
+ writer.writerow([table_name, column_name, schema[table_name][column_name]])
+ return output.getvalue()
+
+
+ def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
+ """Format resolved schema in the specified format.
+
+ Args:
+ schema: Resolved schema dictionary.
+ output_format: One of "text", "json", or "csv".
+
+ Returns:
+ Formatted string.
+
+ Raises:
+ ValueError: If output_format is not recognized.
+ """
+ formatters = {
+ "text": format_schema_text,
+ "json": format_schema_json,
+ "csv": format_schema_csv,
+ }
+ formatter = formatters.get(output_format)
+ if formatter is None:
+ raise ValueError(
+ f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
+ )
+ return formatter(schema)
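
A quick check of the new formatters against a hand-written schema dict (the table and column names are invented for the example):

    from sqlglider.graph.formatters import format_schema

    schema = {"customers": {"id": "UNKNOWN", "name": "UNKNOWN"}}
    print(format_schema(schema))           # text: table name with indented columns
    print(format_schema(schema, "csv"))    # csv: table,column,type rows
    print(format_schema(schema, "json"))   # json: nested dict with sorted keys
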
sqlglider/lineage/analyzer.py CHANGED
@@ -15,6 +15,10 @@ class StarResolutionError(Exception):
  """Raised when SELECT * cannot be resolved and no_star mode is enabled."""


+ class SchemaResolutionError(Exception):
+ """Raised when a column's table cannot be identified and strict_schema is enabled."""
+
+
  class TableUsage(str, Enum):
  """How a table is used in a query."""

@@ -89,7 +93,14 @@ WarningCallback = Callable[[str], None]
  class LineageAnalyzer:
  """Analyze column and table lineage for SQL queries."""

- def __init__(self, sql: str, dialect: str = "spark", no_star: bool = False):
+ def __init__(
+ self,
+ sql: str,
+ dialect: str = "spark",
+ no_star: bool = False,
+ schema: Optional[Dict[str, Dict[str, str]]] = None,
+ strict_schema: bool = False,
+ ):
  """
  Initialize the lineage analyzer.

@@ -97,6 +108,12 @@ class LineageAnalyzer:
  sql: SQL query string to analyze (can contain multiple statements)
  dialect: SQL dialect (default: spark)
  no_star: If True, fail when SELECT * cannot be resolved to columns
+ schema: Optional external schema mapping table names to column
+ definitions (e.g. {"table": {"col": "UNKNOWN"}}). File-derived
+ schema from CREATE statements will merge on top.
+ strict_schema: If True, fail during schema extraction when an
+ unqualified column cannot be attributed to a table (e.g.
+ in a multi-table SELECT without table qualifiers).

  Raises:
  ParseError: If the SQL cannot be parsed
@@ -104,10 +121,12 @@ class LineageAnalyzer:
  self.sql = sql
  self.dialect = dialect
  self._no_star = no_star
+ self._strict_schema = strict_schema
  self._skipped_queries: List[SkippedQuery] = []
  # File-scoped schema context for cross-statement lineage
  # Maps table/view names to their column definitions
- self._file_schema: Dict[str, Dict[str, str]] = {}
+ self._initial_schema: Dict[str, Dict[str, str]] = dict(schema) if schema else {}
+ self._file_schema: Dict[str, Dict[str, str]] = dict(self._initial_schema)

  try:
  # Parse all statements in the SQL string
@@ -132,6 +151,27 @@ class LineageAnalyzer:
  """Get list of queries that were skipped during analysis."""
  return self._skipped_queries.copy()

+ def get_extracted_schema(self) -> Dict[str, Dict[str, str]]:
+ """Return the accumulated file schema after analysis."""
+ return dict(self._file_schema)
+
+ def extract_schema_only(self) -> Dict[str, Dict[str, str]]:
+ """Parse all statements and extract schema without running lineage.
+
+ Iterates through all expressions, extracting schema from:
+ 1. CREATE TABLE/VIEW AS SELECT statements (existing behavior)
+ 2. DQL statements by inferring table columns from qualified column
+ references (e.g., ``SELECT t.id FROM table t`` infers
+ ``table: {id: UNKNOWN}``)
+
+ Returns the accumulated schema dict.
+ """
+ self._file_schema = dict(self._initial_schema)
+ for expr in self.expressions:
+ self._extract_schema_from_statement(expr)
+ self._extract_schema_from_dql(expr)
+ return dict(self._file_schema)
+
  def get_output_columns(self) -> List[str]:
  """
  Extract all output column names from the query with full qualification.
@@ -426,7 +466,7 @@ class LineageAnalyzer:
  """
  results = []
  self._skipped_queries = [] # Reset skipped queries for this analysis
- self._file_schema = {} # Reset file schema for this analysis run
+ self._file_schema = dict(self._initial_schema) # Reset to external schema

  for query_index, expr, preview in self._iterate_queries(table_filter):
  # Temporarily swap self.expr to analyze this query
@@ -819,18 +859,31 @@ class LineageAnalyzer:
  else:
  current_query_sql = self.expr.sql(dialect=self.dialect)

+ # Prune schema to only tables referenced in this query to avoid
+ # sqlglot.lineage() performance degradation with large schema dicts
+ pruned_schema: Optional[Dict[str, Dict[str, str]]] = None
+ if self._file_schema:
+ referenced = {t.lower() for t in self._get_query_tables()}
+ pruned_schema = {
+ table: cols
+ for table, cols in self._file_schema.items()
+ if table.lower() in referenced
+ }
+ if not pruned_schema:
+ pruned_schema = None
+
  for col in columns_to_analyze:
  try:
  # Get the column name that lineage expects
  lineage_col = self._column_mapping.get(col, col)

  # Get lineage tree for this column using current query SQL only
- # Pass file schema to enable SELECT * expansion for known tables/views
+ # Pass pruned schema to enable SELECT * expansion for known tables/views
  node = lineage(
  lineage_col,
  current_query_sql,
  dialect=self.dialect,
- schema=self._file_schema if self._file_schema else None,
+ schema=pruned_schema,
  )

  # Collect all source columns
@@ -1427,6 +1480,119 @@ class LineageAnalyzer:
  # Store with UNKNOWN type - SQLGlot only needs column names for expansion
  self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}

+ def _extract_schema_from_dql(self, expr: exp.Expression) -> None:
+ """Infer table schemas from column references in DQL.
+
+ Walks SELECT statements and extracts table-column mappings from:
+ 1. Qualified column references (e.g., ``c.id``) — always resolved.
+ 2. Unqualified column references (e.g., ``id``) — only when the
+ SELECT has exactly one real table source (no joins), making
+ attribution unambiguous.
+
+ Aliases are resolved back to actual table names. CTEs and subquery
+ aliases are skipped since they don't represent external tables.
+
+ Args:
+ expr: The SQL expression to extract schema from.
+ """
+ # Find all SELECT nodes in the expression tree
+ selects = list(expr.find_all(exp.Select))
+ if not selects:
+ return
+
+ for select_node in selects:
+ # Build alias-to-table mapping for this SELECT scope
+ alias_map: Dict[str, str] = {}
+ cte_names: Set[str] = set()
+
+ # Collect CTE names so we can skip them
+ parent = select_node
+ while parent:
+ with_clause = parent.args.get("with")
+ if with_clause:
+ for cte in with_clause.expressions:
+ if isinstance(cte, exp.CTE) and cte.alias:
+ cte_names.add(cte.alias.lower())
+ parent = parent.parent if hasattr(parent, "parent") else None
+
+ # Collect subquery aliases so we can skip them too
+ subquery_aliases: Set[str] = set()
+ from_clause = select_node.args.get("from")
+ if from_clause and isinstance(from_clause, exp.From):
+ source = from_clause.this
+ if isinstance(source, exp.Subquery) and source.alias:
+ subquery_aliases.add(source.alias.lower())
+ for join in select_node.find_all(exp.Join):
+ if isinstance(join.this, exp.Subquery) and join.this.alias:
+ subquery_aliases.add(join.this.alias.lower())
+
+ # Build alias map from FROM/JOIN table references
+ real_tables: list[str] = [] # track non-CTE, non-subquery tables
+ for table_ref in select_node.find_all(exp.Table):
+ # Skip tables inside nested selects — they belong to inner scope
+ if table_ref.find_ancestor(exp.Select) is not select_node:
+ continue
+ qualified = self._get_qualified_table_name(table_ref)
+ if table_ref.alias:
+ alias_map[table_ref.alias.lower()] = qualified
+ else:
+ alias_map[table_ref.name.lower()] = qualified
+ # Track real tables (not CTEs or subqueries)
+ if (
+ qualified.lower() not in cte_names
+ and qualified.lower() not in subquery_aliases
+ ):
+ real_tables.append(qualified)
+
+ # Determine single-table target for unqualified columns
+ # Only set when exactly one real table source exists (unambiguous)
+ single_table: Optional[str] = (
+ real_tables[0] if len(real_tables) == 1 else None
+ )
+
+ # Walk all column references in this SELECT
+ for column in select_node.find_all(exp.Column):
+ if isinstance(column.this, exp.Star):
+ continue
+
+ table_ref_name = column.table
+ col_name = column.name
+
+ if table_ref_name:
+ # Qualified column — resolve alias to actual table
+ ref_lower = table_ref_name.lower()
+
+ # Skip CTE and subquery references
+ if ref_lower in cte_names or ref_lower in subquery_aliases:
+ continue
+
+ actual_table = alias_map.get(ref_lower)
+ if not actual_table:
+ continue
+
+ # Skip if it resolved to a CTE or subquery
+ if (
+ actual_table.lower() in cte_names
+ or actual_table.lower() in subquery_aliases
+ ):
+ continue
+ else:
+ # Unqualified column — attribute to single table if unambiguous
+ if not single_table:
+ if self._strict_schema:
+ preview = select_node.sql(dialect=self.dialect)[:80]
+ raise SchemaResolutionError(
+ f"Cannot resolve table for unqualified column "
+ f"'{col_name}' in multi-table query: {preview}"
+ )
+ continue
+ actual_table = single_table
+
+ if actual_table not in self._file_schema:
+ self._file_schema[actual_table] = {}
+ if col_name not in self._file_schema[actual_table]:
+ self._file_schema[actual_table][col_name] = "UNKNOWN"
+
  def _extract_columns_from_select(
  self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
  ) -> List[str]:
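
A small illustration of the DQL inference and strict mode, assuming sql-glider 0.1.13; the tables and columns are invented and the printed key order may differ:

    from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError

    sql = "SELECT c.id, c.name, o.amount FROM customers c JOIN orders o ON o.customer_id = c.id"
    print(LineageAnalyzer(sql, dialect="spark").extract_schema_only())
    # roughly: {'customers': {'id': 'UNKNOWN', 'name': 'UNKNOWN'},
    #           'orders': {'amount': 'UNKNOWN', 'customer_id': 'UNKNOWN'}}

    try:
        # Unqualified 'id' over two joined tables cannot be attributed to one table.
        LineageAnalyzer(
            "SELECT id FROM a JOIN b ON a.k = b.k", dialect="spark", strict_schema=True
        ).extract_schema_only()
    except SchemaResolutionError as exc:
        print("strict mode:", exc)
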
sqlglider/utils/config.py CHANGED
@@ -61,6 +61,10 @@ class ConfigSettings(BaseModel):
  ddl_folder: Optional[str] = None
  catalog: Optional[CatalogConfig] = None
  no_star: Optional[bool] = None
+ resolve_schema: Optional[bool] = None
+ dump_schema: Optional[str] = None
+ dump_schema_format: Optional[str] = None
+ strict_schema: Optional[bool] = None


  def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:
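
The same four knobs can come from the project config file; a minimal sketch of the equivalent settings object, assuming the remaining ConfigSettings fields all default to None the way the visible ones do:

    from sqlglider.utils.config import ConfigSettings

    settings = ConfigSettings(
        resolve_schema=True,
        dump_schema="resolved_schema.txt",  # hypothetical output path
        dump_schema_format="json",
        strict_schema=False,
    )
    print(settings.model_dump(exclude_none=True))
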
sqlglider/utils/schema.py ADDED
@@ -0,0 +1,62 @@
+ """Schema utilities for parsing DDL into schema dictionaries."""
+
+ from typing import Dict
+
+ from sqlglot import exp, parse
+
+
+ def parse_ddl_to_schema(ddl: str, dialect: str = "spark") -> Dict[str, Dict[str, str]]:
+ """Extract table schemas from DDL statements.
+
+ Parses CREATE TABLE/VIEW statements and extracts column names.
+ Only column names are needed — types are stored as "UNKNOWN" since
+ SQLGlot's lineage only uses names for star expansion.
+
+ Args:
+ ddl: SQL string containing one or more CREATE TABLE/VIEW statements
+ dialect: SQL dialect for parsing
+
+ Returns:
+ Schema dict mapping table names to column definitions,
+ e.g. {"my_table": {"id": "UNKNOWN", "name": "UNKNOWN"}}
+ """
+ schema: Dict[str, Dict[str, str]] = {}
+ expressions = parse(ddl, dialect=dialect)
+
+ for expr in expressions:
+ if expr is None:
+ continue
+ if not isinstance(expr, (exp.Create,)):
+ continue
+
+ # Get target table name
+ target = expr.this
+ if isinstance(target, exp.Schema):
+ # Schema node wraps the table and column definitions
+ columns = [
+ col.name for col in target.expressions if isinstance(col, exp.ColumnDef)
+ ]
+ target = target.this
+ else:
+ columns = []
+
+ if not isinstance(target, exp.Table):
+ continue
+
+ table_name = _get_qualified_name(target)
+
+ if columns:
+ schema[table_name] = {col: "UNKNOWN" for col in columns}
+
+ return schema
+
+
+ def _get_qualified_name(table: exp.Table) -> str:
+ """Build a qualified table name from a SQLGlot Table expression."""
+ parts = []
+ if table.catalog:
+ parts.append(table.catalog)
+ if table.db:
+ parts.append(table.db)
+ parts.append(table.name)
+ return ".".join(parts)