sql-glider 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sql-glider
- Version: 0.1.13
+ Version: 0.1.15
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -1,6 +1,6 @@
  sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
- sqlglider/_version.py,sha256=Xz5RLbyPcCHHXte393JYfUy4Dt7uaeWyrGVw9SmJ0eg,706
- sqlglider/cli.py,sha256=FDTjRmor_cQlcwfiD_uHTrQao2sMf3ev21IUyUSt7Qs,56401
+ sqlglider/_version.py,sha256=HPqQHR9pVxIxlFt4vovkyoe7k6UO3ag2isBN2lHFL8g,706
+ sqlglider/cli.py,sha256=9zNMaw3rgcqb6uG05VJTYbLUXmZzdX87gAOJ4Zg3xjY,65319
  sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
  sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
  sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,8 +11,8 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
  sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
  sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
  sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
- sqlglider/graph/builder.py,sha256=suxc_hymHvHnkgltgXqwwIoxlay7zhy1Enbs6HNC3m8,20107
- sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
+ sqlglider/graph/builder.py,sha256=VNBdsDlkiaId3JGvr2G4h6OIFek_9zPsGMIYL9GpJlk,15796
+ sqlglider/graph/formatters.py,sha256=p85-WN9oPmEETsAtWSo1sIQELF36w85QoFEJyfBZGoM,4800
  sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
  sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
  sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
@@ -20,6 +20,8 @@ sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9Zl
  sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
  sqlglider/lineage/analyzer.py,sha256=08pFR5aGFFPhSbRW6EqiX2d3mp91v-orcs6dm_T1FJg,76484
  sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
+ sqlglider/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sqlglider/schema/extractor.py,sha256=WW31wbHkL-V749pLb7EAyUOJuziZQK-5hLZVW6f970U,7234
  sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
  sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
  sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
@@ -29,8 +31,8 @@ sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,2
  sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
  sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
  sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
- sql_glider-0.1.13.dist-info/METADATA,sha256=z-utivkULH1BBhygNpLcWN9UdU1DbwfF3EzUhGtWXes,28446
- sql_glider-0.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- sql_glider-0.1.13.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
- sql_glider-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sql_glider-0.1.13.dist-info/RECORD,,
+ sql_glider-0.1.15.dist-info/METADATA,sha256=IF0dZD6rOriyausbDZhHPMfYnhHyRlxyi9v_ihTgCUo,28446
+ sql_glider-0.1.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ sql_glider-0.1.15.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+ sql_glider-0.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sql_glider-0.1.15.dist-info/RECORD,,
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.13'
- __version_tuple__ = version_tuple = (0, 1, 13)
+ __version__ = version = '0.1.15'
+ __version_tuple__ = version_tuple = (0, 1, 15)

  __commit_id__ = commit_id = None
sqlglider/cli.py CHANGED
@@ -171,6 +171,12 @@ def lineage(
  "--no-star",
  help="Fail if SELECT * cannot be resolved to actual columns",
  ),
+ provide_schema: Optional[Path] = typer.Option(
+ None,
+ "--provide-schema",
+ exists=True,
+ help="Path to a schema file (JSON, CSV, or text) for star resolution",
+ ),
  ) -> None:
  """
  Analyze column or table lineage for a SQL file.
@@ -266,8 +272,15 @@
  source_path=source_path,
  )

+ # Load provided schema if specified
+ schema = None
+ if provide_schema:
+ from sqlglider.graph.formatters import load_schema_file
+
+ schema = load_schema_file(provide_schema)
+
  # Create analyzer
- analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)
+ analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star, schema=schema)

  # Unified lineage analysis (handles both single and multi-query files)
  results = analyzer.analyze_queries(
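For reference, a file passed to `--provide-schema` (e.g. `sqlglider lineage my_query.sql --provide-schema schema.json`) follows the table -> {column -> type} layout that `parse_schema_json` in graph/formatters.py (added further down) expects. A minimal sketch with invented table and column names:

# Hypothetical schema.json consumed by --provide-schema; names are illustrative only.
from pathlib import Path

from sqlglider.graph.formatters import load_schema_file

Path("schema.json").write_text(
    '{"orders": {"order_id": "BIGINT", "amount": "DECIMAL(10,2)"},'
    ' "customers": {"customer_id": "BIGINT", "name": "STRING"}}'
)
schema = load_schema_file(Path("schema.json"))
print(schema["orders"])  # {'order_id': 'BIGINT', 'amount': 'DECIMAL(10,2)'}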
@@ -788,6 +801,274 @@ def tables_pull(
  raise typer.Exit(1)


+ def _collect_sql_files(
+ paths: Optional[List[Path]],
+ manifest: Optional[Path],
+ recursive: bool,
+ glob_pattern: str,
+ ) -> tuple[list[Path], list[Path]]:
+ """Collect SQL files from paths and/or manifest.
+
+ Args:
+ paths: File or directory paths to scan.
+ manifest: Optional manifest CSV path.
+ recursive: Whether to recurse into directories.
+ glob_pattern: Glob pattern for directory scanning.
+
+ Returns:
+ Tuple of (manifest_files, path_files).
+ """
+ path_files: list[Path] = []
+ if paths:
+ for path in paths:
+ if path.is_dir():
+ pattern = f"**/{glob_pattern}" if recursive else glob_pattern
+ path_files.extend(f for f in sorted(path.glob(pattern)) if f.is_file())
+ elif path.is_file():
+ path_files.append(path)
+ else:
+ err_console.print(f"[red]Error:[/red] Path not found: {path}")
+ raise typer.Exit(1)
+
+ manifest_files: list[Path] = []
+ if manifest:
+ from sqlglider.graph.models import Manifest
+
+ manifest_data = Manifest.from_csv(manifest)
+ base_dir = manifest.parent
+ for entry in manifest_data.entries:
+ file_path = Path(entry.file_path)
+ if not file_path.is_absolute():
+ file_path = (base_dir / entry.file_path).resolve()
+ manifest_files.append(file_path)
+
+ return manifest_files, path_files
+
+
+ @tables_app.command("scrape")
+ def tables_scrape(
+ paths: List[Path] = typer.Argument(
+ None,
+ help="SQL file(s) or directory path to process",
+ ),
+ recursive: bool = typer.Option(
+ False,
+ "--recursive",
+ "-r",
+ help="Recursively search directories for SQL files",
+ ),
+ glob_pattern: str = typer.Option(
+ "*.sql",
+ "--glob",
+ "-g",
+ help="Glob pattern for matching SQL files in directories",
+ ),
+ manifest: Optional[Path] = typer.Option(
+ None,
+ "--manifest",
+ "-m",
+ exists=True,
+ help="Path to manifest CSV file with file_path and optional dialect columns",
+ ),
+ dialect: Optional[str] = typer.Option(
+ None,
+ "--dialect",
+ "-d",
+ help="SQL dialect (default: spark)",
+ ),
+ templater: Optional[str] = typer.Option(
+ None,
+ "--templater",
+ "-t",
+ help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
+ ),
+ var: Optional[List[str]] = typer.Option(
+ None,
+ "--var",
+ "-v",
+ help="Template variable in key=value format (repeatable)",
+ ),
+ vars_file: Optional[Path] = typer.Option(
+ None,
+ "--vars-file",
+ exists=True,
+ help="Path to variables file (JSON or YAML)",
+ ),
+ strict_schema: bool = typer.Option(
+ False,
+ "--strict-schema",
+ help="Fail if any column's table cannot be identified during schema extraction",
+ ),
+ catalog_type: Optional[str] = typer.Option(
+ None,
+ "--catalog-type",
+ "-c",
+ help="Catalog provider for pulling DDL of tables not found in files "
+ "(e.g. 'databricks')",
+ ),
+ output_format: Optional[str] = typer.Option(
+ None,
+ "--output-format",
+ "-f",
+ help="Output format: 'text' (default), 'json', or 'csv'",
+ ),
+ output_file: Optional[Path] = typer.Option(
+ None,
+ "--output-file",
+ "-o",
+ help="Output file path (prints to stdout if not provided)",
+ ),
+ ) -> None:
+ """
+ Scrape schema information from SQL files.
+
+ Infers table and column schemas from DDL statements and DQL column
+ references across one or more SQL files. Supports the same file input
+ modes as `graph build` (paths, directories, manifests).
+
+ Examples:
+
+ # Scrape schema from a directory
+ sqlglider tables scrape ./queries/ -r
+
+ # Output as JSON
+ sqlglider tables scrape ./queries/ -r -f json
+
+ # Save to file
+ sqlglider tables scrape ./queries/ -r -f csv -o schema.csv
+
+ # With Jinja2 templating
+ sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod
+
+ # With catalog fallback
+ sqlglider tables scrape ./queries/ -r -c databricks
+ """
+ from sqlglider.graph.formatters import format_schema
+ from sqlglider.lineage.analyzer import SchemaResolutionError
+ from sqlglider.schema.extractor import extract_and_resolve_schema
+
+ # Load config for defaults
+ config = load_config()
+ dialect = dialect or config.dialect or "spark"
+ templater = templater or config.templater
+ strict_schema = strict_schema or config.strict_schema or False
+ output_format = output_format or config.output_format or "text"
+
+ if output_format not in ("text", "json", "csv"):
+ err_console.print(
+ f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
+ "Use 'text', 'json', or 'csv'."
+ )
+ raise typer.Exit(1)
+
+ # Only inherit catalog_type from config when not provided via CLI
+ if not catalog_type:
+ catalog_type = config.catalog_type
+
+ # Validate inputs
+ if not paths and not manifest:
+ err_console.print(
+ "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
+ )
+ raise typer.Exit(1)
+
+ # Create SQL preprocessor if templating is enabled
+ sql_preprocessor: Optional[Callable[[str, Path], str]] = None
+ if templater:
+ config_vars_file = None
+ config_vars = None
+ if config.templating:
+ if config.templating.variables_file and not vars_file:
+ config_vars_file = Path(config.templating.variables_file)
+ if not config_vars_file.exists():
+ err_console.print(
+ f"[yellow]Warning:[/yellow] Variables file from config "
+ f"not found: {config_vars_file}"
+ )
+ config_vars_file = None
+ config_vars = config.templating.variables
+
+ variables = load_all_variables(
+ cli_vars=var,
+ vars_file=vars_file or config_vars_file,
+ config_vars=config_vars,
+ use_env=True,
+ )
+
+ templater_instance = get_templater(templater)
+
+ def _preprocess(sql: str, file_path: Path) -> str:
+ return templater_instance.render(
+ sql, variables=variables, source_path=file_path
+ )
+
+ sql_preprocessor = _preprocess
+
+ try:
+ # Build catalog config from config file if available
+ catalog_config_dict = None
+ if catalog_type and config.catalog:
+ provider_config = getattr(config.catalog, catalog_type, None)
+ if provider_config:
+ catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
+ # Collect files
+ manifest_files, path_files = _collect_sql_files(
+ paths, manifest, recursive, glob_pattern
+ )
+ all_files = manifest_files + path_files
+
+ if not all_files:
+ err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
+ raise typer.Exit(0)
+
+ # Extract schema
+ schema = extract_and_resolve_schema(
+ all_files,
+ dialect=dialect,
+ sql_preprocessor=sql_preprocessor,
+ strict_schema=strict_schema,
+ catalog_type=catalog_type,
+ catalog_config=catalog_config_dict,
+ console=err_console,
+ )
+
+ if not schema:
+ err_console.print("[yellow]No schema information found.[/yellow]")
+ raise typer.Exit(0)
+
+ # Format and output
+ formatted = format_schema(schema, output_format)
+ if output_file:
+ OutputWriter.write(formatted, output_file)
+ err_console.print(
+ f"[green]Schema written to {output_file} "
+ f"({len(schema)} table(s))[/green]"
+ )
+ else:
+ console.print(formatted, end="")
+
+ except SchemaResolutionError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
+ except FileNotFoundError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
+ except TemplaterError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
+ except ValueError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
+ except Exception as e:
+ err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
+ raise typer.Exit(1)
+
+
  @app.command()
  def template(
  sql_file: Annotated[
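The `--manifest` option feeds `_collect_sql_files` through `Manifest.from_csv`. A minimal sketch of what such a manifest might contain and how the helper resolves it; the SQL file names are invented, and relative file_path entries are resolved against the manifest's parent directory:

# Hypothetical manifest.csv (per the --manifest help text: a file_path column plus
# an optional dialect column); the listed SQL files are illustrative.
#
#   file_path,dialect
#   models/orders.sql,spark
#   models/customers.sql,snowflake
from pathlib import Path

manifest_files, path_files = _collect_sql_files(
    paths=None,
    manifest=Path("manifest.csv"),
    recursive=False,
    glob_pattern="*.sql",
)
# manifest_files holds the resolved absolute paths; path_files stays empty
# because no positional paths were given.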
@@ -1024,6 +1305,13 @@ def graph_build(
  "--dump-schema-format",
  help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
  ),
+ provide_schema: Optional[Path] = typer.Option(
+ None,
+ "--provide-schema",
+ exists=True,
+ help="Path to a schema file (JSON, CSV, or text) to use for star resolution. "
+ "Can be combined with --resolve-schema to merge file-extracted schema on top.",
+ ),
  strict_schema: bool = typer.Option(
  False,
  "--strict-schema",
@@ -1166,32 +1454,21 @@
  strict_schema=strict_schema,
  )

- # Collect file paths for schema extraction
- path_files: list[Path] = []
- if paths:
- for path in paths:
- if path.is_dir():
- pattern = f"**/{glob_pattern}" if recursive else glob_pattern
- path_files.extend(
- f for f in sorted(path.glob(pattern)) if f.is_file()
- )
- elif path.is_file():
- path_files.append(path)
- else:
- err_console.print(f"[red]Error:[/red] Path not found: {path}")
- raise typer.Exit(1)
+ # Load provided schema file if specified
+ if provide_schema:
+ from sqlglider.graph.formatters import load_schema_file

- manifest_files: list[Path] = []
- if manifest:
- from sqlglider.graph.models import Manifest
-
- manifest_data = Manifest.from_csv(manifest)
- base_dir = manifest.parent
- for entry in manifest_data.entries:
- file_path = Path(entry.file_path)
- if not file_path.is_absolute():
- file_path = (base_dir / entry.file_path).resolve()
- manifest_files.append(file_path)
+ loaded_schema = load_schema_file(provide_schema)
+ builder.set_schema(loaded_schema)
+ console.print(
+ f"[green]Loaded schema from {provide_schema} "
+ f"({len(loaded_schema)} table(s))[/green]"
+ )
+
+ # Collect file paths for schema extraction
+ manifest_files, path_files = _collect_sql_files(
+ paths, manifest, recursive, glob_pattern
+ )

  # Extract schema upfront if requested, then dump before graph building
  all_files = manifest_files + path_files
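Based on the extractor code further down, `--provide-schema` seeds the builder via `set_schema` and a later extraction pass merges file-derived tables on top of it (the provided schema becomes `initial_schema`, and file-extracted entries win for colliding tables). A hedged sketch of that layering outside the CLI, with illustrative paths and dialect:

# Sketch only: how a provided schema and file-extracted schema combine.
from pathlib import Path

from sqlglider.graph.formatters import load_schema_file
from sqlglider.schema.extractor import extract_and_resolve_schema

provided = load_schema_file(Path("schema.json"))   # base layer (--provide-schema)
sql_files = sorted(Path("queries").glob("*.sql"))
resolved = extract_and_resolve_schema(
    sql_files,
    dialect="spark",
    initial_schema=provided,  # file-extracted tables override colliding entries
)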
sqlglider/graph/builder.py CHANGED
@@ -16,9 +16,9 @@ from sqlglider.graph.models import (
  LineageGraph,
  Manifest,
  )
- from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+ from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.schema.extractor import extract_and_resolve_schema
  from sqlglider.utils.file_utils import read_sql_file
- from sqlglider.utils.schema import parse_ddl_to_schema

  console = Console(stderr=True)

@@ -303,156 +303,50 @@ class GraphBuilder:
  self.add_file(file_path, dialect)
  return self

- def extract_schemas(
- self,
- file_paths: List[Path],
- dialect: Optional[str] = None,
- ) -> Dict[str, Dict[str, str]]:
- """Run schema extraction pass and optionally fill from catalog.
+ def set_schema(self, schema: Dict[str, Dict[str, str]]) -> "GraphBuilder":
+ """Pre-seed the resolved schema from an external source.

- Call this before add_files/add_manifest to resolve schema upfront.
- The resolved schema is stored internally and also returned.
+ This allows skipping the schema extraction pass when the schema
+ is already known (e.g., loaded from a file).

  Args:
- file_paths: SQL files to extract schema from
- dialect: SQL dialect override
+ schema: Schema dictionary mapping table names to column dicts.

  Returns:
- Resolved schema dict
+ self for method chaining
  """
- console.print("[blue]Pass 1: Extracting schema from files[/blue]")
- self._resolved_schema = self._extract_schemas(file_paths, dialect)
- if self.catalog_type:
- self._resolved_schema = self._fill_schema_from_catalog(
- self._resolved_schema, file_paths, dialect
- )
- console.print(
- f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
- )
- return self._resolved_schema.copy()
+ self._resolved_schema = schema
+ return self

- def _extract_schemas(
+ def extract_schemas(
  self,
  file_paths: List[Path],
  dialect: Optional[str] = None,
  ) -> Dict[str, Dict[str, str]]:
- """Run schema extraction pass across all files.
+ """Run schema extraction pass and optionally fill from catalog.

- Parses each file and extracts schema from CREATE TABLE/VIEW
- statements without performing lineage analysis.
+ Call this before add_files/add_manifest to resolve schema upfront.
+ The resolved schema is stored internally and also returned.

  Args:
  file_paths: SQL files to extract schema from
  dialect: SQL dialect override

  Returns:
- Accumulated schema dict from all files
+ Resolved schema dict
  """
- schema: Dict[str, Dict[str, str]] = {}
- total = len(file_paths)
- with Progress(
- TextColumn("[progress.description]{task.description}"),
- BarColumn(),
- TaskProgressColumn(),
+ file_dialect = dialect or self.dialect
+ self._resolved_schema = extract_and_resolve_schema(
+ file_paths,
+ dialect=file_dialect,
+ sql_preprocessor=self.sql_preprocessor,
+ initial_schema=self._resolved_schema if self._resolved_schema else None,
+ strict_schema=self.strict_schema,
+ catalog_type=self.catalog_type,
+ catalog_config=self.catalog_config,
  console=console,
- transient=False,
- ) as progress:
- task = progress.add_task("Pass 1: Extracting schema", total=total)
- for i, file_path in enumerate(file_paths, start=1):
- console.print(f"Extracting schema {i}/{total}: {file_path.name}")
- file_dialect = dialect or self.dialect
- try:
- sql_content = read_sql_file(file_path)
- if self.sql_preprocessor:
- sql_content = self.sql_preprocessor(sql_content, file_path)
- analyzer = LineageAnalyzer(
- sql_content,
- dialect=file_dialect,
- schema=schema,
- strict_schema=self.strict_schema,
- )
- file_schema = analyzer.extract_schema_only()
- schema.update(file_schema)
- except SchemaResolutionError:
- raise
- except Exception:
- # Schema extraction failures are non-fatal; the file
- # will be reported during the lineage pass if it also fails.
- pass
- progress.advance(task)
- return schema
-
- def _fill_schema_from_catalog(
- self,
- schema: Dict[str, Dict[str, str]],
- file_paths: List[Path],
- dialect: Optional[str] = None,
- ) -> Dict[str, Dict[str, str]]:
- """Pull DDL from catalog for tables not yet in schema.
-
- Extracts all table names referenced across the files, identifies
- those missing from the schema, and fetches their DDL from the
- configured catalog provider.
-
- Args:
- schema: Schema dict already populated from file extraction
- file_paths: SQL files to scan for table references
- dialect: SQL dialect override
-
- Returns:
- Updated schema dict with catalog-sourced entries added
- """
- from sqlglider.catalog import get_catalog
-
- catalog = get_catalog(self.catalog_type) # type: ignore[arg-type]
- if self.catalog_config:
- catalog.configure(self.catalog_config)
-
- # Collect all referenced table names across files
- all_tables: Set[str] = set()
- for file_path in file_paths:
- file_dialect = dialect or self.dialect
- try:
- sql_content = read_sql_file(file_path)
- if self.sql_preprocessor:
- sql_content = self.sql_preprocessor(sql_content, file_path)
- analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
- tables_results = analyzer.analyze_tables()
- for result in tables_results:
- for table_info in result.tables:
- # Skip CTEs — they don't exist in catalogs
- from sqlglider.lineage.analyzer import ObjectType
-
- if table_info.object_type != ObjectType.CTE:
- all_tables.add(table_info.name)
- except Exception:
- pass
-
- # Find tables missing from schema
- missing = [t for t in all_tables if t not in schema]
- if not missing:
- return schema
-
- console.print(
- f"[blue]Pulling DDL from {self.catalog_type} "
- f"for {len(missing)} table(s)...[/blue]"
  )
-
- ddl_results = catalog.get_ddl_batch(missing)
- file_dialect = dialect or self.dialect
- for table_name, ddl in ddl_results.items():
- if ddl.startswith("ERROR:"):
- console.print(
- f"[yellow]Warning:[/yellow] Could not pull DDL "
- f"for {table_name}: {ddl}"
- )
- continue
- parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
- for name, cols in parsed_schema.items():
- if name not in schema:
- schema[name] = cols
-
- return schema
+ return self._resolved_schema.copy()

  def _ensure_node(
  self,
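A short usage sketch of the reshaped builder surface; how `GraphBuilder` itself is constructed is not part of this diff, so `builder` is assumed to already exist (created the way `graph build` creates it):

# Hedged sketch only: `builder` is an existing GraphBuilder instance.
from pathlib import Path

from sqlglider.graph.formatters import load_schema_file

sql_files = [Path("queries/orders.sql")]                    # illustrative path
builder.set_schema(load_schema_file(Path("schema.json")))   # returns self, so calls chain
resolved = builder.extract_schemas(sql_files)               # now delegates to extract_and_resolve_schema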
sqlglider/graph/formatters.py CHANGED
@@ -1,8 +1,9 @@
- """Output formatters for resolved schema data."""
+ """Output formatters and parsers for resolved schema data."""

  import csv
  import json
  from io import StringIO
+ from pathlib import Path
  from typing import Dict

  SchemaDict = Dict[str, Dict[str, str]]
@@ -96,3 +97,93 @@ def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
  f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
  )
  return formatter(schema)
+
+
+ def parse_schema_json(content: str) -> SchemaDict:
+ """Parse schema from JSON format.
+
+ Args:
+ content: JSON string with table -> {column -> type} structure.
+
+ Returns:
+ Parsed schema dictionary.
+ """
+ return json.loads(content) # type: ignore[no-any-return]
+
+
+ def parse_schema_csv(content: str) -> SchemaDict:
+ """Parse schema from CSV format.
+
+ Expects columns: table, column, type.
+
+ Args:
+ content: CSV string with header row.
+
+ Returns:
+ Parsed schema dictionary.
+ """
+ schema: SchemaDict = {}
+ reader = csv.DictReader(StringIO(content))
+ for row in reader:
+ table = row["table"]
+ column = row["column"]
+ col_type = row.get("type", "UNKNOWN")
+ if table not in schema:
+ schema[table] = {}
+ schema[table][column] = col_type
+ return schema
+
+
+ def parse_schema_text(content: str) -> SchemaDict:
+ """Parse schema from indented text format.
+
+ Expected format:
+ table_name
+ column1
+ column2
+
+ other_table
+ col_a
+
+ Args:
+ content: Text-formatted schema string.
+
+ Returns:
+ Parsed schema dictionary.
+ """
+ schema: SchemaDict = {}
+ current_table: str | None = None
+ for line in content.splitlines():
+ if not line or not line.strip():
+ continue
+ if line.startswith(" "):
+ if current_table is not None:
+ schema[current_table][line.strip()] = "UNKNOWN"
+ else:
+ current_table = line.strip()
+ schema[current_table] = {}
+ return schema
+
+
+ def load_schema_file(path: Path) -> SchemaDict:
+ """Load a schema file, auto-detecting format from extension.
+
+ `.json` → JSON, `.csv` → CSV, otherwise text.
+
+ Args:
+ path: Path to schema file.
+
+ Returns:
+ Parsed schema dictionary.
+
+ Raises:
+ FileNotFoundError: If the file does not exist.
+ """
+ content = path.read_text(encoding="utf-8")
+ suffix = path.suffix.lower()
+ if suffix == ".json":
+ return parse_schema_json(content)
+ elif suffix == ".csv":
+ return parse_schema_csv(content)
+ else:
+ return parse_schema_text(content)
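A quick, self-contained check of the new parsers using made-up table names; `format_schema` above remains the counterpart for writing these formats back out:

# Sanity check of the new parsers; table and column names are invented.
from sqlglider.graph.formatters import parse_schema_csv, parse_schema_text

csv_content = "table,column,type\norders,order_id,BIGINT\norders,amount,DECIMAL(10,2)\n"
print(parse_schema_csv(csv_content))
# {'orders': {'order_id': 'BIGINT', 'amount': 'DECIMAL(10,2)'}}

text_content = "orders\n  order_id\n  amount\n"
print(parse_schema_text(text_content))
# {'orders': {'order_id': 'UNKNOWN', 'amount': 'UNKNOWN'}}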
sqlglider/schema/__init__.py ADDED
File without changes
sqlglider/schema/extractor.py ADDED
@@ -0,0 +1,205 @@
+ """Shared schema extraction logic for inferring table schemas from SQL files."""
+
+ from pathlib import Path
+ from typing import Callable, Dict, List, Optional
+
+ from rich.console import Console
+ from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
+
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+ from sqlglider.utils.file_utils import read_sql_file
+ from sqlglider.utils.schema import parse_ddl_to_schema
+
+ SchemaDict = Dict[str, Dict[str, str]]
+ SqlPreprocessor = Callable[[str, Path], str]
+
+
+ def extract_schemas_from_files(
+ file_paths: List[Path],
+ dialect: str = "spark",
+ sql_preprocessor: Optional[SqlPreprocessor] = None,
+ initial_schema: Optional[SchemaDict] = None,
+ strict_schema: bool = False,
+ console: Optional[Console] = None,
+ ) -> SchemaDict:
+ """Extract schema from SQL files by parsing DDL and inferring from DQL.
+
+ Iterates through files, accumulating schema knowledge. Each file's
+ inferred schema is available when parsing subsequent files.
+
+ Args:
+ file_paths: SQL files to extract schema from.
+ dialect: SQL dialect.
+ sql_preprocessor: Optional function to preprocess SQL (e.g., templating).
+ initial_schema: Optional starting schema to build upon.
+ strict_schema: If True, fail on ambiguous column attribution.
+ console: Rich console for output. Uses stderr if not provided.
+
+ Returns:
+ Accumulated schema dict mapping table names to column dicts.
+ """
+ if console is None:
+ console = Console(stderr=True)
+
+ schema: SchemaDict = dict(initial_schema) if initial_schema else {}
+ total = len(file_paths)
+
+ with Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ TaskProgressColumn(),
+ console=console,
+ transient=False,
+ ) as progress:
+ task = progress.add_task("Extracting schema", total=total)
+ for i, file_path in enumerate(file_paths, start=1):
+ console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+ try:
+ sql_content = read_sql_file(file_path)
+ if sql_preprocessor:
+ sql_content = sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(
+ sql_content,
+ dialect=dialect,
+ schema=schema,
+ strict_schema=strict_schema,
+ )
+ file_schema = analyzer.extract_schema_only()
+ schema.update(file_schema)
+ except SchemaResolutionError:
+ raise
+ except Exception:
+ # Schema extraction failures are non-fatal; the file
+ # will be reported during the lineage pass if it also fails.
+ pass
+ progress.advance(task)
+ return schema
+
+
+ def fill_schema_from_catalog(
+ schema: SchemaDict,
+ file_paths: List[Path],
+ dialect: str = "spark",
+ sql_preprocessor: Optional[SqlPreprocessor] = None,
+ catalog_type: str = "databricks",
+ catalog_config: Optional[Dict[str, object]] = None,
+ console: Optional[Console] = None,
+ ) -> SchemaDict:
+ """Pull DDL from catalog for tables not yet in schema.
+
+ Args:
+ schema: Schema dict already populated from file extraction.
+ file_paths: SQL files to scan for table references.
+ dialect: SQL dialect.
+ sql_preprocessor: Optional SQL preprocessor.
+ catalog_type: Catalog provider name.
+ catalog_config: Provider-specific configuration dict.
+ console: Rich console for output.
+
+ Returns:
+ Updated schema dict with catalog-sourced entries added.
+ """
+ from sqlglider.catalog import get_catalog
+ from sqlglider.lineage.analyzer import ObjectType
+
+ if console is None:
+ console = Console(stderr=True)
+
+ catalog = get_catalog(catalog_type)
+ if catalog_config:
+ catalog.configure(catalog_config)
+
+ # Collect all referenced table names across files
+ all_tables: set[str] = set()
+ for file_path in file_paths:
+ try:
+ sql_content = read_sql_file(file_path)
+ if sql_preprocessor:
+ sql_content = sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(sql_content, dialect=dialect)
+ tables_results = analyzer.analyze_tables()
+ for result in tables_results:
+ for table_info in result.tables:
+ if table_info.object_type != ObjectType.CTE:
+ all_tables.add(table_info.name)
+ except Exception:
+ pass
+
+ # Find tables missing from schema
+ missing = [t for t in all_tables if t not in schema]
+ if not missing:
+ return schema
+
+ console.print(
+ f"[blue]Pulling DDL from {catalog_type} for {len(missing)} table(s)...[/blue]"
+ )
+
+ ddl_results = catalog.get_ddl_batch(missing)
+ for table_name, ddl in ddl_results.items():
+ if ddl.startswith("ERROR:"):
+ console.print(
+ f"[yellow]Warning:[/yellow] Could not pull DDL for {table_name}: {ddl}"
+ )
+ continue
+ parsed_schema = parse_ddl_to_schema(ddl, dialect=dialect)
+ for name, cols in parsed_schema.items():
+ if name not in schema:
+ schema[name] = cols
+
+ return schema
+
+
+ def extract_and_resolve_schema(
+ file_paths: List[Path],
+ dialect: str = "spark",
+ sql_preprocessor: Optional[SqlPreprocessor] = None,
+ initial_schema: Optional[SchemaDict] = None,
+ strict_schema: bool = False,
+ catalog_type: Optional[str] = None,
+ catalog_config: Optional[Dict[str, object]] = None,
+ console: Optional[Console] = None,
+ ) -> SchemaDict:
+ """Extract schema from files and optionally fill from catalog.
+
+ High-level orchestrator that runs file-based extraction followed
+ by optional catalog resolution.
+
+ Args:
+ file_paths: SQL files to extract schema from.
+ dialect: SQL dialect.
+ sql_preprocessor: Optional SQL preprocessor.
+ initial_schema: Optional starting schema to build upon.
+ strict_schema: If True, fail on ambiguous column attribution.
+ catalog_type: Optional catalog provider name.
+ catalog_config: Optional provider-specific configuration dict.
+ console: Rich console for output.
+
+ Returns:
+ Resolved schema dict.
+ """
+ if console is None:
+ console = Console(stderr=True)
+
+ console.print("[blue]Extracting schema from files[/blue]")
+ schema = extract_schemas_from_files(
+ file_paths,
+ dialect=dialect,
+ sql_preprocessor=sql_preprocessor,
+ initial_schema=initial_schema,
+ strict_schema=strict_schema,
+ console=console,
+ )
+
+ if catalog_type:
+ schema = fill_schema_from_catalog(
+ schema,
+ file_paths,
+ dialect=dialect,
+ sql_preprocessor=sql_preprocessor,
+ catalog_type=catalog_type,
+ catalog_config=catalog_config,
+ console=console,
+ )
+
+ console.print(f"[blue]Schema resolved for {len(schema)} table(s)[/blue]")
+ return schema
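End to end, the new module can be driven directly; a minimal sketch assuming a local ./queries/ directory of SQL files (the directory, dialect, and catalog choice are illustrative):

# Minimal sketch of the new orchestrator plus the existing formatter.
from pathlib import Path

from sqlglider.graph.formatters import format_schema
from sqlglider.schema.extractor import extract_and_resolve_schema

sql_files = sorted(Path("queries").glob("*.sql"))
schema = extract_and_resolve_schema(
    sql_files,
    dialect="spark",
    catalog_type=None,  # e.g. "databricks" to pull DDL for tables missing from the files
)
print(format_schema(schema, "json"))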