sql-glider 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.12
+Version: 0.1.14
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
RECORD CHANGED
@@ -1,6 +1,6 @@
 sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
-sqlglider/_version.py,sha256=cEPXLUpTV7EzqolnyXW8nf8Hr6IVyBji9CzB6Cq_Ar0,706
-sqlglider/cli.py,sha256=qEDLZ1a6yr-BzrtkBsJEHPByMmRERsGKZsYFTn9kaMY,55624
+sqlglider/_version.py,sha256=1asLxKIxr0ym19WewGl0URtkfnEuN7mK8ZckB9dZw6Q,706
+sqlglider/cli.py,sha256=UvDaeDhQRu98M1PaUtWsIL_F_LtulOf58kWGn6SxUzE,64175
 sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
 sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
 sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,15 +11,17 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
 sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
 sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
 sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
-sqlglider/graph/builder.py,sha256=o0SnH5eWUUPpzRSdsdCXEva3QTlhLDagJulJ2hRFQqA,19895
+sqlglider/graph/builder.py,sha256=fS6p-73zyjuYIHRzM3uXFTFZ8zyal0s7oBdyO2Fv8vQ,15224
 sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
 sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
 sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
 sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
 sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9ZlgTQ,2781
 sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
-sqlglider/lineage/analyzer.py,sha256=-LUeVNEsjfEWoKAJ2qVIiJO1noqwae4jQkwkkkVbAT8,75950
+sqlglider/lineage/analyzer.py,sha256=08pFR5aGFFPhSbRW6EqiX2d3mp91v-orcs6dm_T1FJg,76484
 sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
+sqlglider/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sqlglider/schema/extractor.py,sha256=iOi13ZStR4ngC2GkZGXjB0lsgmDqJ-OYwTRgH72hy1w,7082
 sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
 sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
 sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
@@ -29,8 +31,8 @@ sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,2
 sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
 sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
 sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
-sql_glider-0.1.12.dist-info/METADATA,sha256=73yuoWaAE5DKE9wobDXxbERSP2Pq-WpdqCnaswAa9fQ,28446
-sql_glider-0.1.12.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sql_glider-0.1.12.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
-sql_glider-0.1.12.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sql_glider-0.1.12.dist-info/RECORD,,
+sql_glider-0.1.14.dist-info/METADATA,sha256=SdedCDEPwWR2Kqrg8_mMgb0PNmNZj0OExHiapyKZ63A,28446
+sql_glider-0.1.14.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sql_glider-0.1.14.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+sql_glider-0.1.14.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sql_glider-0.1.14.dist-info/RECORD,,
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.12'
-__version_tuple__ = version_tuple = (0, 1, 12)
+__version__ = version = '0.1.14'
+__version_tuple__ = version_tuple = (0, 1, 14)
 
 __commit_id__ = commit_id = None
sqlglider/cli.py CHANGED
@@ -788,6 +788,274 @@ def tables_pull(
         raise typer.Exit(1)
 
 
+def _collect_sql_files(
+    paths: Optional[List[Path]],
+    manifest: Optional[Path],
+    recursive: bool,
+    glob_pattern: str,
+) -> tuple[list[Path], list[Path]]:
+    """Collect SQL files from paths and/or manifest.
+
+    Args:
+        paths: File or directory paths to scan.
+        manifest: Optional manifest CSV path.
+        recursive: Whether to recurse into directories.
+        glob_pattern: Glob pattern for directory scanning.
+
+    Returns:
+        Tuple of (manifest_files, path_files).
+    """
+    path_files: list[Path] = []
+    if paths:
+        for path in paths:
+            if path.is_dir():
+                pattern = f"**/{glob_pattern}" if recursive else glob_pattern
+                path_files.extend(f for f in sorted(path.glob(pattern)) if f.is_file())
+            elif path.is_file():
+                path_files.append(path)
+            else:
+                err_console.print(f"[red]Error:[/red] Path not found: {path}")
+                raise typer.Exit(1)
+
+    manifest_files: list[Path] = []
+    if manifest:
+        from sqlglider.graph.models import Manifest
+
+        manifest_data = Manifest.from_csv(manifest)
+        base_dir = manifest.parent
+        for entry in manifest_data.entries:
+            file_path = Path(entry.file_path)
+            if not file_path.is_absolute():
+                file_path = (base_dir / entry.file_path).resolve()
+            manifest_files.append(file_path)
+
+    return manifest_files, path_files
+
+
+@tables_app.command("scrape")
+def tables_scrape(
+    paths: List[Path] = typer.Argument(
+        None,
+        help="SQL file(s) or directory path to process",
+    ),
+    recursive: bool = typer.Option(
+        False,
+        "--recursive",
+        "-r",
+        help="Recursively search directories for SQL files",
+    ),
+    glob_pattern: str = typer.Option(
+        "*.sql",
+        "--glob",
+        "-g",
+        help="Glob pattern for matching SQL files in directories",
+    ),
+    manifest: Optional[Path] = typer.Option(
+        None,
+        "--manifest",
+        "-m",
+        exists=True,
+        help="Path to manifest CSV file with file_path and optional dialect columns",
+    ),
+    dialect: Optional[str] = typer.Option(
+        None,
+        "--dialect",
+        "-d",
+        help="SQL dialect (default: spark)",
+    ),
+    templater: Optional[str] = typer.Option(
+        None,
+        "--templater",
+        "-t",
+        help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
+    ),
+    var: Optional[List[str]] = typer.Option(
+        None,
+        "--var",
+        "-v",
+        help="Template variable in key=value format (repeatable)",
+    ),
+    vars_file: Optional[Path] = typer.Option(
+        None,
+        "--vars-file",
+        exists=True,
+        help="Path to variables file (JSON or YAML)",
+    ),
+    strict_schema: bool = typer.Option(
+        False,
+        "--strict-schema",
+        help="Fail if any column's table cannot be identified during schema extraction",
+    ),
+    catalog_type: Optional[str] = typer.Option(
+        None,
+        "--catalog-type",
+        "-c",
+        help="Catalog provider for pulling DDL of tables not found in files "
+        "(e.g. 'databricks')",
+    ),
+    output_format: Optional[str] = typer.Option(
+        None,
+        "--output-format",
+        "-f",
+        help="Output format: 'text' (default), 'json', or 'csv'",
+    ),
+    output_file: Optional[Path] = typer.Option(
+        None,
+        "--output-file",
+        "-o",
+        help="Output file path (prints to stdout if not provided)",
+    ),
+) -> None:
+    """
+    Scrape schema information from SQL files.
+
+    Infers table and column schemas from DDL statements and DQL column
+    references across one or more SQL files. Supports the same file input
+    modes as `graph build` (paths, directories, manifests).
+
+    Examples:
+
+        # Scrape schema from a directory
+        sqlglider tables scrape ./queries/ -r
+
+        # Output as JSON
+        sqlglider tables scrape ./queries/ -r -f json
+
+        # Save to file
+        sqlglider tables scrape ./queries/ -r -f csv -o schema.csv
+
+        # With Jinja2 templating
+        sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod
+
+        # With catalog fallback
+        sqlglider tables scrape ./queries/ -r -c databricks
+    """
+    from sqlglider.graph.formatters import format_schema
+    from sqlglider.lineage.analyzer import SchemaResolutionError
+    from sqlglider.schema.extractor import extract_and_resolve_schema
+
+    # Load config for defaults
+    config = load_config()
+    dialect = dialect or config.dialect or "spark"
+    templater = templater or config.templater
+    strict_schema = strict_schema or config.strict_schema or False
+    output_format = output_format or config.output_format or "text"
+
+    if output_format not in ("text", "json", "csv"):
+        err_console.print(
+            f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
+            "Use 'text', 'json', or 'csv'."
+        )
+        raise typer.Exit(1)
+
+    # Only inherit catalog_type from config when not provided via CLI
+    if not catalog_type:
+        catalog_type = config.catalog_type
+
+    # Validate inputs
+    if not paths and not manifest:
+        err_console.print(
+            "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
+        )
+        raise typer.Exit(1)
+
+    # Create SQL preprocessor if templating is enabled
+    sql_preprocessor: Optional[Callable[[str, Path], str]] = None
+    if templater:
+        config_vars_file = None
+        config_vars = None
+        if config.templating:
+            if config.templating.variables_file and not vars_file:
+                config_vars_file = Path(config.templating.variables_file)
+                if not config_vars_file.exists():
+                    err_console.print(
+                        f"[yellow]Warning:[/yellow] Variables file from config "
+                        f"not found: {config_vars_file}"
+                    )
+                    config_vars_file = None
+            config_vars = config.templating.variables
+
+        variables = load_all_variables(
+            cli_vars=var,
+            vars_file=vars_file or config_vars_file,
+            config_vars=config_vars,
+            use_env=True,
+        )
+
+        templater_instance = get_templater(templater)
+
+        def _preprocess(sql: str, file_path: Path) -> str:
+            return templater_instance.render(
+                sql, variables=variables, source_path=file_path
+            )
+
+        sql_preprocessor = _preprocess
+
+    try:
+        # Build catalog config from config file if available
+        catalog_config_dict = None
+        if catalog_type and config.catalog:
+            provider_config = getattr(config.catalog, catalog_type, None)
+            if provider_config:
+                catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
+        # Collect files
+        manifest_files, path_files = _collect_sql_files(
+            paths, manifest, recursive, glob_pattern
+        )
+        all_files = manifest_files + path_files
+
+        if not all_files:
+            err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
+            raise typer.Exit(0)
+
+        # Extract schema
+        schema = extract_and_resolve_schema(
+            all_files,
+            dialect=dialect,
+            sql_preprocessor=sql_preprocessor,
+            strict_schema=strict_schema,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config_dict,
+            console=err_console,
+        )
+
+        if not schema:
+            err_console.print("[yellow]No schema information found.[/yellow]")
+            raise typer.Exit(0)
+
+        # Format and output
+        formatted = format_schema(schema, output_format)
+        if output_file:
+            OutputWriter.write(formatted, output_file)
+            err_console.print(
+                f"[green]Schema written to {output_file} "
+                f"({len(schema)} table(s))[/green]"
+            )
+        else:
+            console.print(formatted, end="")
+
+    except SchemaResolutionError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except FileNotFoundError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except TemplaterError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except ValueError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except Exception as e:
+        err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
+        raise typer.Exit(1)
+
+
 @app.command()
 def template(
     sql_file: Annotated[
@@ -1166,36 +1434,35 @@ def graph_build(
         strict_schema=strict_schema,
     )
 
+    # Collect file paths for schema extraction
+    manifest_files, path_files = _collect_sql_files(
+        paths, manifest, recursive, glob_pattern
+    )
+
+    # Extract schema upfront if requested, then dump before graph building
+    all_files = manifest_files + path_files
+    if resolve_schema and all_files:
+        builder.extract_schemas(all_files, dialect=dialect)
+
+    if dump_schema:
+        from sqlglider.graph.formatters import format_schema
+
+        schema_content = format_schema(
+            builder.resolved_schema, dump_schema_format
+        )
+        dump_schema.write_text(schema_content, encoding="utf-8")
+        console.print(
+            f"[green]Schema dumped to {dump_schema} "
+            f"({len(builder.resolved_schema)} table(s))[/green]"
+        )
+
     # Process manifest if provided
     if manifest:
         builder.add_manifest(manifest, dialect=dialect)
 
-    # Process paths - collect all files first for progress tracking
-    if paths:
-        all_files: list[Path] = []
-        for path in paths:
-            if path.is_dir():
-                pattern = f"**/{glob_pattern}" if recursive else glob_pattern
-                all_files.extend(
-                    f for f in sorted(path.glob(pattern)) if f.is_file()
-                )
-            elif path.is_file():
-                all_files.append(path)
-            else:
-                err_console.print(f"[red]Error:[/red] Path not found: {path}")
-                raise typer.Exit(1)
-        builder.add_files(all_files, dialect=dialect)
-
-    # Dump resolved schema if requested
-    if dump_schema:
-        from sqlglider.graph.formatters import format_schema
-
-        schema_content = format_schema(builder.resolved_schema, dump_schema_format)
-        dump_schema.write_text(schema_content, encoding="utf-8")
-        console.print(
-            f"[green]Schema dumped to {dump_schema} "
-            f"({len(builder.resolved_schema)} table(s))[/green]"
-        )
+    # Process path-based files
+    if path_files:
+        builder.add_files(path_files, dialect=dialect)
 
     # Build and save graph
     graph = builder.build()
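
The `tables scrape` flow above is a thin composition of the new `_collect_sql_files` helper and `extract_and_resolve_schema`. A minimal sketch of the same flow called programmatically (the input directory is hypothetical, and importing the private helper from `sqlglider.cli` is an assumption made for illustration only):

    from pathlib import Path

    from sqlglider.cli import _collect_sql_files  # private helper added in this diff
    from sqlglider.schema.extractor import extract_and_resolve_schema

    # Mirrors `sqlglider tables scrape ./queries/ -r`: glob the directory
    # recursively, resolve manifest entries (none here), then extract schema.
    manifest_files, path_files = _collect_sql_files(
        paths=[Path("./queries")],  # hypothetical input directory
        manifest=None,
        recursive=True,
        glob_pattern="*.sql",
    )
    schema = extract_and_resolve_schema(manifest_files + path_files, dialect="spark")
    # schema maps each table name to a {column: type} dict, ready for format_schema()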
sqlglider/graph/builder.py CHANGED
@@ -16,9 +16,9 @@ from sqlglider.graph.models import (
     LineageGraph,
     Manifest,
 )
-from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.schema.extractor import extract_and_resolve_schema
 from sqlglider.utils.file_utils import read_sql_file
-from sqlglider.utils.schema import parse_ddl_to_schema
 
 console = Console(stderr=True)
 
@@ -235,19 +235,10 @@ class GraphBuilder:
         if not files_with_dialects:
             return self
 
-        # Two-pass schema resolution
-        if self.resolve_schema:
-            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+        # Two-pass schema resolution (skip if already resolved)
+        if self.resolve_schema and not self._resolved_schema:
             file_paths_only = [fp for fp, _ in files_with_dialects]
-            self._resolved_schema = self._extract_schemas(file_paths_only, dialect)
-            if self.catalog_type:
-                self._resolved_schema = self._fill_schema_from_catalog(
-                    self._resolved_schema, file_paths_only, dialect
-                )
-            console.print(
-                f"[blue]Schema resolved for "
-                f"{len(self._resolved_schema)} table(s)[/blue]"
-            )
+            self.extract_schemas(file_paths_only, dialect)
 
         total = len(files_with_dialects)
         description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
@@ -286,18 +277,9 @@ class GraphBuilder:
         if not file_paths:
             return self
 
-        # Two-pass schema resolution: extract schema from all files first
-        if self.resolve_schema:
-            console.print("[blue]Pass 1: Extracting schema from files[/blue]")
-            self._resolved_schema = self._extract_schemas(file_paths, dialect)
-            if self.catalog_type:
-                self._resolved_schema = self._fill_schema_from_catalog(
-                    self._resolved_schema, file_paths, dialect
-                )
-            console.print(
-                f"[blue]Schema resolved for "
-                f"{len(self._resolved_schema)} table(s)[/blue]"
-            )
+        # Two-pass schema resolution (skip if already resolved)
+        if self.resolve_schema and not self._resolved_schema:
+            self.extract_schemas(file_paths, dialect)
 
         if show_progress:
             total = len(file_paths)
@@ -321,128 +303,34 @@
             self.add_file(file_path, dialect)
         return self
 
-    def _extract_schemas(
+    def extract_schemas(
         self,
         file_paths: List[Path],
         dialect: Optional[str] = None,
     ) -> Dict[str, Dict[str, str]]:
-        """Run schema extraction pass across all files.
+        """Run schema extraction pass and optionally fill from catalog.
 
-        Parses each file and extracts schema from CREATE TABLE/VIEW
-        statements without performing lineage analysis.
+        Call this before add_files/add_manifest to resolve schema upfront.
+        The resolved schema is stored internally and also returned.
 
         Args:
             file_paths: SQL files to extract schema from
             dialect: SQL dialect override
 
         Returns:
-            Accumulated schema dict from all files
+            Resolved schema dict
         """
-        schema: Dict[str, Dict[str, str]] = {}
-        total = len(file_paths)
-        with Progress(
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TaskProgressColumn(),
+        file_dialect = dialect or self.dialect
+        self._resolved_schema = extract_and_resolve_schema(
+            file_paths,
+            dialect=file_dialect,
+            sql_preprocessor=self.sql_preprocessor,
+            strict_schema=self.strict_schema,
+            catalog_type=self.catalog_type,
+            catalog_config=self.catalog_config,
             console=console,
-            transient=False,
-        ) as progress:
-            task = progress.add_task("Pass 1: Extracting schema", total=total)
-            for i, file_path in enumerate(file_paths, start=1):
-                console.print(f"Extracting schema {i}/{total}: {file_path.name}")
-                file_dialect = dialect or self.dialect
-                try:
-                    sql_content = read_sql_file(file_path)
-                    if self.sql_preprocessor:
-                        sql_content = self.sql_preprocessor(sql_content, file_path)
-                    analyzer = LineageAnalyzer(
-                        sql_content,
-                        dialect=file_dialect,
-                        schema=schema,
-                        strict_schema=self.strict_schema,
-                    )
-                    file_schema = analyzer.extract_schema_only()
-                    schema.update(file_schema)
-                except SchemaResolutionError:
-                    raise
-                except Exception:
-                    # Schema extraction failures are non-fatal; the file
-                    # will be reported during the lineage pass if it also fails.
-                    pass
-                progress.advance(task)
-        return schema
-
-    def _fill_schema_from_catalog(
-        self,
-        schema: Dict[str, Dict[str, str]],
-        file_paths: List[Path],
-        dialect: Optional[str] = None,
-    ) -> Dict[str, Dict[str, str]]:
-        """Pull DDL from catalog for tables not yet in schema.
-
-        Extracts all table names referenced across the files, identifies
-        those missing from the schema, and fetches their DDL from the
-        configured catalog provider.
-
-        Args:
-            schema: Schema dict already populated from file extraction
-            file_paths: SQL files to scan for table references
-            dialect: SQL dialect override
-
-        Returns:
-            Updated schema dict with catalog-sourced entries added
-        """
-        from sqlglider.catalog import get_catalog
-
-        catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
-        if self.catalog_config:
-            catalog.configure(self.catalog_config)
-
-        # Collect all referenced table names across files
-        all_tables: Set[str] = set()
-        for file_path in file_paths:
-            file_dialect = dialect or self.dialect
-            try:
-                sql_content = read_sql_file(file_path)
-                if self.sql_preprocessor:
-                    sql_content = self.sql_preprocessor(sql_content, file_path)
-                analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
-                tables_results = analyzer.analyze_tables()
-                for result in tables_results:
-                    for table_info in result.tables:
-                        # Skip CTEs — they don't exist in catalogs
-                        from sqlglider.lineage.analyzer import ObjectType
-
-                        if table_info.object_type != ObjectType.CTE:
-                            all_tables.add(table_info.name)
-            except Exception:
-                pass
-
-        # Find tables missing from schema
-        missing = [t for t in all_tables if t not in schema]
-        if not missing:
-            return schema
-
-        console.print(
-            f"[blue]Pulling DDL from {self.catalog_type} "
-            f"for {len(missing)} table(s)...[/blue]"
         )
-
-        ddl_results = catalog.get_ddl_batch(missing)
-        file_dialect = dialect or self.dialect
-        for table_name, ddl in ddl_results.items():
-            if ddl.startswith("ERROR:"):
-                console.print(
-                    f"[yellow]Warning:[/yellow] Could not pull DDL "
-                    f"for {table_name}: {ddl}"
-                )
-                continue
-            parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
-            for name, cols in parsed_schema.items():
-                if name not in schema:
-                    schema[name] = cols
-
-        return schema
+        return self._resolved_schema.copy()
 
     def _ensure_node(
         self,
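
With `extract_schemas` now public and the two-pass guard checking `self._resolved_schema`, callers can resolve schema once up front and reuse it across `add_files`/`add_manifest`. A minimal sketch, assuming `GraphBuilder` accepts these constructor keywords (only `extract_schemas`, `add_files`, and `build` appear in this diff; file paths are hypothetical):

    from pathlib import Path

    from sqlglider.graph.builder import GraphBuilder

    files = [Path("models/orders.sql"), Path("models/daily_totals.sql")]

    builder = GraphBuilder(resolve_schema=True, dialect="spark")  # assumed signature
    builder.extract_schemas(files)  # Pass 1 runs once and stores the schema
    builder.add_files(files)        # Pass 1 is skipped: schema already resolved
    graph = builder.build()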
sqlglider/lineage/analyzer.py CHANGED
@@ -859,18 +859,31 @@ class LineageAnalyzer:
         else:
             current_query_sql = self.expr.sql(dialect=self.dialect)
 
+        # Prune schema to only tables referenced in this query to avoid
+        # sqlglot.lineage() performance degradation with large schema dicts
+        pruned_schema: Optional[Dict[str, Dict[str, str]]] = None
+        if self._file_schema:
+            referenced = {t.lower() for t in self._get_query_tables()}
+            pruned_schema = {
+                table: cols
+                for table, cols in self._file_schema.items()
+                if table.lower() in referenced
+            }
+            if not pruned_schema:
+                pruned_schema = None
+
         for col in columns_to_analyze:
             try:
                 # Get the column name that lineage expects
                 lineage_col = self._column_mapping.get(col, col)
 
                 # Get lineage tree for this column using current query SQL only
-                # Pass file schema to enable SELECT * expansion for known tables/views
+                # Pass pruned schema to enable SELECT * expansion for known tables/views
                 node = lineage(
                     lineage_col,
                     current_query_sql,
                     dialect=self.dialect,
-                    schema=self._file_schema if self._file_schema else None,
+                    schema=pruned_schema,
                 )
 
                 # Collect all source columns
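
The diff's comment attributes a slowdown in sqlglot's `lineage()` to large schema dicts, so the new guard passes only the tables a query actually references. A standalone sketch of the same pruning idea with plain sqlglot (table and column names are hypothetical):

    from sqlglot.lineage import lineage

    full_schema = {
        "orders": {"id": "INT", "total": "DOUBLE"},
        "events": {"ts": "TIMESTAMP"},  # unreferenced; would only add overhead
    }
    sql = "SELECT o.total FROM orders AS o"
    referenced = {"orders"}  # tables actually named in this query

    # Keep only referenced tables, falling back to None when nothing matches
    pruned = {t: c for t, c in full_schema.items() if t.lower() in referenced} or None
    node = lineage("total", sql, dialect="spark", schema=pruned)
    print(node.source.sql(dialect="spark"))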
sqlglider/schema/extractor.py ADDED
@@ -0,0 +1,202 @@
+"""Shared schema extraction logic for inferring table schemas from SQL files."""
+
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from rich.console import Console
+from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
+
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+from sqlglider.utils.file_utils import read_sql_file
+from sqlglider.utils.schema import parse_ddl_to_schema
+
+SchemaDict = Dict[str, Dict[str, str]]
+SqlPreprocessor = Callable[[str, Path], str]
+
+
+def extract_schemas_from_files(
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    initial_schema: Optional[SchemaDict] = None,
+    strict_schema: bool = False,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Extract schema from SQL files by parsing DDL and inferring from DQL.
+
+    Iterates through files, accumulating schema knowledge. Each file's
+    inferred schema is available when parsing subsequent files.
+
+    Args:
+        file_paths: SQL files to extract schema from.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional function to preprocess SQL (e.g., templating).
+        initial_schema: Optional starting schema to build upon.
+        strict_schema: If True, fail on ambiguous column attribution.
+        console: Rich console for output. Uses stderr if not provided.
+
+    Returns:
+        Accumulated schema dict mapping table names to column dicts.
+    """
+    if console is None:
+        console = Console(stderr=True)
+
+    schema: SchemaDict = dict(initial_schema) if initial_schema else {}
+    total = len(file_paths)
+
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        console=console,
+        transient=False,
+    ) as progress:
+        task = progress.add_task("Extracting schema", total=total)
+        for i, file_path in enumerate(file_paths, start=1):
+            console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+            try:
+                sql_content = read_sql_file(file_path)
+                if sql_preprocessor:
+                    sql_content = sql_preprocessor(sql_content, file_path)
+                analyzer = LineageAnalyzer(
+                    sql_content,
+                    dialect=dialect,
+                    schema=schema,
+                    strict_schema=strict_schema,
+                )
+                file_schema = analyzer.extract_schema_only()
+                schema.update(file_schema)
+            except SchemaResolutionError:
+                raise
+            except Exception:
+                # Schema extraction failures are non-fatal; the file
+                # will be reported during the lineage pass if it also fails.
+                pass
+            progress.advance(task)
+    return schema
+
+
+def fill_schema_from_catalog(
+    schema: SchemaDict,
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    catalog_type: str = "databricks",
+    catalog_config: Optional[Dict[str, object]] = None,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Pull DDL from catalog for tables not yet in schema.
+
+    Args:
+        schema: Schema dict already populated from file extraction.
+        file_paths: SQL files to scan for table references.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional SQL preprocessor.
+        catalog_type: Catalog provider name.
+        catalog_config: Provider-specific configuration dict.
+        console: Rich console for output.
+
+    Returns:
+        Updated schema dict with catalog-sourced entries added.
+    """
+    from sqlglider.catalog import get_catalog
+    from sqlglider.lineage.analyzer import ObjectType
+
+    if console is None:
+        console = Console(stderr=True)
+
+    catalog = get_catalog(catalog_type)
+    if catalog_config:
+        catalog.configure(catalog_config)
+
+    # Collect all referenced table names across files
+    all_tables: set[str] = set()
+    for file_path in file_paths:
+        try:
+            sql_content = read_sql_file(file_path)
+            if sql_preprocessor:
+                sql_content = sql_preprocessor(sql_content, file_path)
+            analyzer = LineageAnalyzer(sql_content, dialect=dialect)
+            tables_results = analyzer.analyze_tables()
+            for result in tables_results:
+                for table_info in result.tables:
+                    if table_info.object_type != ObjectType.CTE:
+                        all_tables.add(table_info.name)
+        except Exception:
+            pass
+
+    # Find tables missing from schema
+    missing = [t for t in all_tables if t not in schema]
+    if not missing:
+        return schema
+
+    console.print(
+        f"[blue]Pulling DDL from {catalog_type} for {len(missing)} table(s)...[/blue]"
+    )
+
+    ddl_results = catalog.get_ddl_batch(missing)
+    for table_name, ddl in ddl_results.items():
+        if ddl.startswith("ERROR:"):
+            console.print(
+                f"[yellow]Warning:[/yellow] Could not pull DDL for {table_name}: {ddl}"
+            )
+            continue
+        parsed_schema = parse_ddl_to_schema(ddl, dialect=dialect)
+        for name, cols in parsed_schema.items():
+            if name not in schema:
+                schema[name] = cols
+
+    return schema
+
+
+def extract_and_resolve_schema(
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    strict_schema: bool = False,
+    catalog_type: Optional[str] = None,
+    catalog_config: Optional[Dict[str, object]] = None,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Extract schema from files and optionally fill from catalog.
+
+    High-level orchestrator that runs file-based extraction followed
+    by optional catalog resolution.
+
+    Args:
+        file_paths: SQL files to extract schema from.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional SQL preprocessor.
+        strict_schema: If True, fail on ambiguous column attribution.
+        catalog_type: Optional catalog provider name.
+        catalog_config: Optional provider-specific configuration dict.
+        console: Rich console for output.
+
+    Returns:
+        Resolved schema dict.
+    """
+    if console is None:
+        console = Console(stderr=True)
+
+    console.print("[blue]Extracting schema from files[/blue]")
+    schema = extract_schemas_from_files(
+        file_paths,
+        dialect=dialect,
+        sql_preprocessor=sql_preprocessor,
+        strict_schema=strict_schema,
+        console=console,
+    )
+
+    if catalog_type:
+        schema = fill_schema_from_catalog(
+            schema,
+            file_paths,
+            dialect=dialect,
+            sql_preprocessor=sql_preprocessor,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config,
+            console=console,
+        )
+
+    console.print(f"[blue]Schema resolved for {len(schema)} table(s)[/blue]")
+    return schema
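
A minimal usage sketch of the new module (file paths are hypothetical; catalog fallback stays off unless `catalog_type` is given):

    from pathlib import Path

    from sqlglider.schema.extractor import extract_and_resolve_schema

    files = [Path("ddl/create_orders.sql"), Path("queries/daily_totals.sql")]

    # File-only resolution; pass catalog_type="databricks" (plus provider
    # config) to also pull DDL for tables the files never define.
    schema = extract_and_resolve_schema(files, dialect="spark")

    for table, columns in schema.items():
        print(table, "->", ", ".join(columns))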