sql-glider 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/METADATA +1 -1
- {sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/RECORD +11 -9
- sqlglider/_version.py +2 -2
- sqlglider/cli.py +303 -26
- sqlglider/graph/builder.py +25 -131
- sqlglider/graph/formatters.py +92 -1
- sqlglider/schema/__init__.py +0 -0
- sqlglider/schema/extractor.py +205 -0
- {sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/WHEEL +0 -0
- {sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/entry_points.txt +0 -0
- {sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/licenses/LICENSE +0 -0
{sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.13
+Version: 0.1.15
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
{sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
-sqlglider/_version.py,sha256=
-sqlglider/cli.py,sha256=
+sqlglider/_version.py,sha256=HPqQHR9pVxIxlFt4vovkyoe7k6UO3ag2isBN2lHFL8g,706
+sqlglider/cli.py,sha256=9zNMaw3rgcqb6uG05VJTYbLUXmZzdX87gAOJ4Zg3xjY,65319
 sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
 sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
 sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,8 +11,8 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
 sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
 sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
 sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
-sqlglider/graph/builder.py,sha256=
-sqlglider/graph/formatters.py,sha256=
+sqlglider/graph/builder.py,sha256=VNBdsDlkiaId3JGvr2G4h6OIFek_9zPsGMIYL9GpJlk,15796
+sqlglider/graph/formatters.py,sha256=p85-WN9oPmEETsAtWSo1sIQELF36w85QoFEJyfBZGoM,4800
 sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
 sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
 sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
@@ -20,6 +20,8 @@ sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9Zl
 sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
 sqlglider/lineage/analyzer.py,sha256=08pFR5aGFFPhSbRW6EqiX2d3mp91v-orcs6dm_T1FJg,76484
 sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
+sqlglider/schema/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sqlglider/schema/extractor.py,sha256=WW31wbHkL-V749pLb7EAyUOJuziZQK-5hLZVW6f970U,7234
 sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
 sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
 sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
@@ -29,8 +31,8 @@ sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,2
 sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
 sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
 sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
-sql_glider-0.1.
-sql_glider-0.1.
-sql_glider-0.1.
-sql_glider-0.1.
-sql_glider-0.1.
+sql_glider-0.1.15.dist-info/METADATA,sha256=IF0dZD6rOriyausbDZhHPMfYnhHyRlxyi9v_ihTgCUo,28446
+sql_glider-0.1.15.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+sql_glider-0.1.15.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+sql_glider-0.1.15.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sql_glider-0.1.15.dist-info/RECORD,,
sqlglider/_version.py
CHANGED

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.1.13'
-__version_tuple__ = version_tuple = (0, 1, 13)
+__version__ = version = '0.1.15'
+__version_tuple__ = version_tuple = (0, 1, 15)

 __commit_id__ = commit_id = None
sqlglider/cli.py
CHANGED

@@ -171,6 +171,12 @@ def lineage(
         "--no-star",
         help="Fail if SELECT * cannot be resolved to actual columns",
     ),
+    provide_schema: Optional[Path] = typer.Option(
+        None,
+        "--provide-schema",
+        exists=True,
+        help="Path to a schema file (JSON, CSV, or text) for star resolution",
+    ),
 ) -> None:
     """
     Analyze column or table lineage for a SQL file.
@@ -266,8 +272,15 @@ def lineage(
         source_path=source_path,
     )

+    # Load provided schema if specified
+    schema = None
+    if provide_schema:
+        from sqlglider.graph.formatters import load_schema_file
+
+        schema = load_schema_file(provide_schema)
+
     # Create analyzer
-    analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star)
+    analyzer = LineageAnalyzer(sql, dialect=dialect, no_star=no_star, schema=schema)

     # Unified lineage analysis (handles both single and multi-query files)
     results = analyzer.analyze_queries(
@@ -788,6 +801,274 @@ def tables_pull(
         raise typer.Exit(1)


+def _collect_sql_files(
+    paths: Optional[List[Path]],
+    manifest: Optional[Path],
+    recursive: bool,
+    glob_pattern: str,
+) -> tuple[list[Path], list[Path]]:
+    """Collect SQL files from paths and/or manifest.
+
+    Args:
+        paths: File or directory paths to scan.
+        manifest: Optional manifest CSV path.
+        recursive: Whether to recurse into directories.
+        glob_pattern: Glob pattern for directory scanning.
+
+    Returns:
+        Tuple of (manifest_files, path_files).
+    """
+    path_files: list[Path] = []
+    if paths:
+        for path in paths:
+            if path.is_dir():
+                pattern = f"**/{glob_pattern}" if recursive else glob_pattern
+                path_files.extend(f for f in sorted(path.glob(pattern)) if f.is_file())
+            elif path.is_file():
+                path_files.append(path)
+            else:
+                err_console.print(f"[red]Error:[/red] Path not found: {path}")
+                raise typer.Exit(1)
+
+    manifest_files: list[Path] = []
+    if manifest:
+        from sqlglider.graph.models import Manifest
+
+        manifest_data = Manifest.from_csv(manifest)
+        base_dir = manifest.parent
+        for entry in manifest_data.entries:
+            file_path = Path(entry.file_path)
+            if not file_path.is_absolute():
+                file_path = (base_dir / entry.file_path).resolve()
+            manifest_files.append(file_path)
+
+    return manifest_files, path_files
+
+
+@tables_app.command("scrape")
+def tables_scrape(
+    paths: List[Path] = typer.Argument(
+        None,
+        help="SQL file(s) or directory path to process",
+    ),
+    recursive: bool = typer.Option(
+        False,
+        "--recursive",
+        "-r",
+        help="Recursively search directories for SQL files",
+    ),
+    glob_pattern: str = typer.Option(
+        "*.sql",
+        "--glob",
+        "-g",
+        help="Glob pattern for matching SQL files in directories",
+    ),
+    manifest: Optional[Path] = typer.Option(
+        None,
+        "--manifest",
+        "-m",
+        exists=True,
+        help="Path to manifest CSV file with file_path and optional dialect columns",
+    ),
+    dialect: Optional[str] = typer.Option(
+        None,
+        "--dialect",
+        "-d",
+        help="SQL dialect (default: spark)",
+    ),
+    templater: Optional[str] = typer.Option(
+        None,
+        "--templater",
+        "-t",
+        help="Templater for SQL preprocessing (e.g., 'jinja', 'none')",
+    ),
+    var: Optional[List[str]] = typer.Option(
+        None,
+        "--var",
+        "-v",
+        help="Template variable in key=value format (repeatable)",
+    ),
+    vars_file: Optional[Path] = typer.Option(
+        None,
+        "--vars-file",
+        exists=True,
+        help="Path to variables file (JSON or YAML)",
+    ),
+    strict_schema: bool = typer.Option(
+        False,
+        "--strict-schema",
+        help="Fail if any column's table cannot be identified during schema extraction",
+    ),
+    catalog_type: Optional[str] = typer.Option(
+        None,
+        "--catalog-type",
+        "-c",
+        help="Catalog provider for pulling DDL of tables not found in files "
+        "(e.g. 'databricks')",
+    ),
+    output_format: Optional[str] = typer.Option(
+        None,
+        "--output-format",
+        "-f",
+        help="Output format: 'text' (default), 'json', or 'csv'",
+    ),
+    output_file: Optional[Path] = typer.Option(
+        None,
+        "--output-file",
+        "-o",
+        help="Output file path (prints to stdout if not provided)",
+    ),
+) -> None:
+    """
+    Scrape schema information from SQL files.
+
+    Infers table and column schemas from DDL statements and DQL column
+    references across one or more SQL files. Supports the same file input
+    modes as `graph build` (paths, directories, manifests).
+
+    Examples:
+
+        # Scrape schema from a directory
+        sqlglider tables scrape ./queries/ -r
+
+        # Output as JSON
+        sqlglider tables scrape ./queries/ -r -f json
+
+        # Save to file
+        sqlglider tables scrape ./queries/ -r -f csv -o schema.csv
+
+        # With Jinja2 templating
+        sqlglider tables scrape ./queries/ -r --templater jinja --var schema=prod
+
+        # With catalog fallback
+        sqlglider tables scrape ./queries/ -r -c databricks
+    """
+    from sqlglider.graph.formatters import format_schema
+    from sqlglider.lineage.analyzer import SchemaResolutionError
+    from sqlglider.schema.extractor import extract_and_resolve_schema
+
+    # Load config for defaults
+    config = load_config()
+    dialect = dialect or config.dialect or "spark"
+    templater = templater or config.templater
+    strict_schema = strict_schema or config.strict_schema or False
+    output_format = output_format or config.output_format or "text"
+
+    if output_format not in ("text", "json", "csv"):
+        err_console.print(
+            f"[red]Error:[/red] Invalid --output-format '{output_format}'. "
+            "Use 'text', 'json', or 'csv'."
+        )
+        raise typer.Exit(1)
+
+    # Only inherit catalog_type from config when not provided via CLI
+    if not catalog_type:
+        catalog_type = config.catalog_type
+
+    # Validate inputs
+    if not paths and not manifest:
+        err_console.print(
+            "[red]Error:[/red] Must provide either file/directory paths or --manifest option."
+        )
+        raise typer.Exit(1)
+
+    # Create SQL preprocessor if templating is enabled
+    sql_preprocessor: Optional[Callable[[str, Path], str]] = None
+    if templater:
+        config_vars_file = None
+        config_vars = None
+        if config.templating:
+            if config.templating.variables_file and not vars_file:
+                config_vars_file = Path(config.templating.variables_file)
+                if not config_vars_file.exists():
+                    err_console.print(
+                        f"[yellow]Warning:[/yellow] Variables file from config "
+                        f"not found: {config_vars_file}"
+                    )
+                    config_vars_file = None
+            config_vars = config.templating.variables
+
+        variables = load_all_variables(
+            cli_vars=var,
+            vars_file=vars_file or config_vars_file,
+            config_vars=config_vars,
+            use_env=True,
+        )
+
+        templater_instance = get_templater(templater)
+
+        def _preprocess(sql: str, file_path: Path) -> str:
+            return templater_instance.render(
+                sql, variables=variables, source_path=file_path
+            )
+
+        sql_preprocessor = _preprocess
+
+    try:
+        # Build catalog config from config file if available
+        catalog_config_dict = None
+        if catalog_type and config.catalog:
+            provider_config = getattr(config.catalog, catalog_type, None)
+            if provider_config:
+                catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
+        # Collect files
+        manifest_files, path_files = _collect_sql_files(
+            paths, manifest, recursive, glob_pattern
+        )
+        all_files = manifest_files + path_files
+
+        if not all_files:
+            err_console.print("[yellow]Warning:[/yellow] No SQL files found.")
+            raise typer.Exit(0)
+
+        # Extract schema
+        schema = extract_and_resolve_schema(
+            all_files,
+            dialect=dialect,
+            sql_preprocessor=sql_preprocessor,
+            strict_schema=strict_schema,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config_dict,
+            console=err_console,
+        )
+
+        if not schema:
+            err_console.print("[yellow]No schema information found.[/yellow]")
+            raise typer.Exit(0)
+
+        # Format and output
+        formatted = format_schema(schema, output_format)
+        if output_file:
+            OutputWriter.write(formatted, output_file)
+            err_console.print(
+                f"[green]Schema written to {output_file} "
+                f"({len(schema)} table(s))[/green]"
+            )
+        else:
+            console.print(formatted, end="")
+
+    except SchemaResolutionError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except FileNotFoundError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except TemplaterError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except ValueError as e:
+        err_console.print(f"[red]Error:[/red] {e}")
+        raise typer.Exit(1)
+
+    except Exception as e:
+        err_console.print(f"[red]Error:[/red] Unexpected error: {e}")
+        raise typer.Exit(1)
+
+
 @app.command()
 def template(
     sql_file: Annotated[
@@ -1024,6 +1305,13 @@ def graph_build(
         "--dump-schema-format",
         help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
     ),
+    provide_schema: Optional[Path] = typer.Option(
+        None,
+        "--provide-schema",
+        exists=True,
+        help="Path to a schema file (JSON, CSV, or text) to use for star resolution. "
+        "Can be combined with --resolve-schema to merge file-extracted schema on top.",
+    ),
     strict_schema: bool = typer.Option(
         False,
         "--strict-schema",
@@ -1166,32 +1454,21 @@ def graph_build(
         strict_schema=strict_schema,
     )

-    #
-
-
-    for path in paths:
-        if path.is_dir():
-            pattern = f"**/{glob_pattern}" if recursive else glob_pattern
-            path_files.extend(
-                f for f in sorted(path.glob(pattern)) if f.is_file()
-            )
-        elif path.is_file():
-            path_files.append(path)
-        else:
-            err_console.print(f"[red]Error:[/red] Path not found: {path}")
-            raise typer.Exit(1)
+    # Load provided schema file if specified
+    if provide_schema:
+        from sqlglider.graph.formatters import load_schema_file

-
-
-
-
-
-
-
-
-
-
-
+        loaded_schema = load_schema_file(provide_schema)
+        builder.set_schema(loaded_schema)
+        console.print(
+            f"[green]Loaded schema from {provide_schema} "
+            f"({len(loaded_schema)} table(s))[/green]"
+        )
+
+    # Collect file paths for schema extraction
+    manifest_files, path_files = _collect_sql_files(
+        paths, manifest, recursive, glob_pattern
+    )

     # Extract schema upfront if requested, then dump before graph building
     all_files = manifest_files + path_files
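
For illustration, a minimal sketch of what the new --provide-schema path in the lineage command boils down to, using only names that appear in this diff (load_schema_file, LineageAnalyzer); the SQL text and the schema file name are hypothetical, and the CLI forwards further options to analyze_queries that are not shown here:

from pathlib import Path

from sqlglider.graph.formatters import load_schema_file
from sqlglider.lineage.analyzer import LineageAnalyzer

# Hypothetical schema file; load_schema_file picks the parser by extension.
schema = load_schema_file(Path("schema.csv"))

# Mirrors the CLI wiring: pass the loaded schema so SELECT * can resolve.
analyzer = LineageAnalyzer(
    "SELECT * FROM orders",
    dialect="spark",
    no_star=True,
    schema=schema,
)
# The CLI then calls analyzer.analyze_queries(...) with the remaining options.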
sqlglider/graph/builder.py
CHANGED

@@ -16,9 +16,9 @@ from sqlglider.graph.models import (
     LineageGraph,
     Manifest,
 )
-from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.lineage.analyzer import LineageAnalyzer
+from sqlglider.schema.extractor import extract_and_resolve_schema
 from sqlglider.utils.file_utils import read_sql_file
-from sqlglider.utils.schema import parse_ddl_to_schema

 console = Console(stderr=True)

@@ -303,156 +303,50 @@ class GraphBuilder:
         self.add_file(file_path, dialect)
         return self

-    def
-
-        file_paths: List[Path],
-        dialect: Optional[str] = None,
-    ) -> Dict[str, Dict[str, str]]:
-        """Run schema extraction pass and optionally fill from catalog.
+    def set_schema(self, schema: Dict[str, Dict[str, str]]) -> "GraphBuilder":
+        """Pre-seed the resolved schema from an external source.

-
-
+        This allows skipping the schema extraction pass when the schema
+        is already known (e.g., loaded from a file).

         Args:
-
-            dialect: SQL dialect override
+            schema: Schema dictionary mapping table names to column dicts.

         Returns:
-
+            self for method chaining
         """
-
-
-        if self.catalog_type:
-            self._resolved_schema = self._fill_schema_from_catalog(
-                self._resolved_schema, file_paths, dialect
-            )
-        console.print(
-            f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
-        )
-        return self._resolved_schema.copy()
+        self._resolved_schema = schema
+        return self

-    def
+    def extract_schemas(
         self,
         file_paths: List[Path],
         dialect: Optional[str] = None,
     ) -> Dict[str, Dict[str, str]]:
-        """Run schema extraction pass
+        """Run schema extraction pass and optionally fill from catalog.

-
-
+        Call this before add_files/add_manifest to resolve schema upfront.
+        The resolved schema is stored internally and also returned.

         Args:
             file_paths: SQL files to extract schema from
             dialect: SQL dialect override

         Returns:
-
+            Resolved schema dict
         """
-
-
-
-
-
-
+        file_dialect = dialect or self.dialect
+        self._resolved_schema = extract_and_resolve_schema(
+            file_paths,
+            dialect=file_dialect,
+            sql_preprocessor=self.sql_preprocessor,
+            initial_schema=self._resolved_schema if self._resolved_schema else None,
+            strict_schema=self.strict_schema,
+            catalog_type=self.catalog_type,
+            catalog_config=self.catalog_config,
             console=console,
-            transient=False,
-        ) as progress:
-            task = progress.add_task("Pass 1: Extracting schema", total=total)
-            for i, file_path in enumerate(file_paths, start=1):
-                console.print(f"Extracting schema {i}/{total}: {file_path.name}")
-                file_dialect = dialect or self.dialect
-                try:
-                    sql_content = read_sql_file(file_path)
-                    if self.sql_preprocessor:
-                        sql_content = self.sql_preprocessor(sql_content, file_path)
-                    analyzer = LineageAnalyzer(
-                        sql_content,
-                        dialect=file_dialect,
-                        schema=schema,
-                        strict_schema=self.strict_schema,
-                    )
-                    file_schema = analyzer.extract_schema_only()
-                    schema.update(file_schema)
-                except SchemaResolutionError:
-                    raise
-                except Exception:
-                    # Schema extraction failures are non-fatal; the file
-                    # will be reported during the lineage pass if it also fails.
-                    pass
-                progress.advance(task)
-            return schema
-
-    def _fill_schema_from_catalog(
-        self,
-        schema: Dict[str, Dict[str, str]],
-        file_paths: List[Path],
-        dialect: Optional[str] = None,
-    ) -> Dict[str, Dict[str, str]]:
-        """Pull DDL from catalog for tables not yet in schema.
-
-        Extracts all table names referenced across the files, identifies
-        those missing from the schema, and fetches their DDL from the
-        configured catalog provider.
-
-        Args:
-            schema: Schema dict already populated from file extraction
-            file_paths: SQL files to scan for table references
-            dialect: SQL dialect override
-
-        Returns:
-            Updated schema dict with catalog-sourced entries added
-        """
-        from sqlglider.catalog import get_catalog
-
-        catalog = get_catalog(self.catalog_type)  # type: ignore[arg-type]
-        if self.catalog_config:
-            catalog.configure(self.catalog_config)
-
-        # Collect all referenced table names across files
-        all_tables: Set[str] = set()
-        for file_path in file_paths:
-            file_dialect = dialect or self.dialect
-            try:
-                sql_content = read_sql_file(file_path)
-                if self.sql_preprocessor:
-                    sql_content = self.sql_preprocessor(sql_content, file_path)
-                analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
-                tables_results = analyzer.analyze_tables()
-                for result in tables_results:
-                    for table_info in result.tables:
-                        # Skip CTEs — they don't exist in catalogs
-                        from sqlglider.lineage.analyzer import ObjectType
-
-                        if table_info.object_type != ObjectType.CTE:
-                            all_tables.add(table_info.name)
-            except Exception:
-                pass
-
-        # Find tables missing from schema
-        missing = [t for t in all_tables if t not in schema]
-        if not missing:
-            return schema
-
-        console.print(
-            f"[blue]Pulling DDL from {self.catalog_type} "
-            f"for {len(missing)} table(s)...[/blue]"
         )
-
-        ddl_results = catalog.get_ddl_batch(missing)
-        file_dialect = dialect or self.dialect
-        for table_name, ddl in ddl_results.items():
-            if ddl.startswith("ERROR:"):
-                console.print(
-                    f"[yellow]Warning:[/yellow] Could not pull DDL "
-                    f"for {table_name}: {ddl}"
-                )
-                continue
-            parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
-            for name, cols in parsed_schema.items():
-                if name not in schema:
-                    schema[name] = cols
-
-        return schema
+        return self._resolved_schema.copy()

     def _ensure_node(
         self,
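
A minimal sketch of how the new set_schema hook might be combined with extract_schemas; set_schema and extract_schemas come from the diff above, while the GraphBuilder constructor arguments and the file paths are assumptions for illustration only:

from pathlib import Path

from sqlglider.graph.builder import GraphBuilder
from sqlglider.graph.formatters import load_schema_file

builder = GraphBuilder(dialect="spark")  # assumed constructor signature

# Pre-seed the resolved schema from a file, then let extraction merge on top of it.
builder.set_schema(load_schema_file(Path("schema.json")))
builder.extract_schemas([Path("models/orders.sql")])  # seed is passed as initial_schema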
sqlglider/graph/formatters.py
CHANGED

@@ -1,8 +1,9 @@
-"""Output formatters for resolved schema data."""
+"""Output formatters and parsers for resolved schema data."""

 import csv
 import json
 from io import StringIO
+from pathlib import Path
 from typing import Dict

 SchemaDict = Dict[str, Dict[str, str]]
@@ -96,3 +97,93 @@ def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
         f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
     )
     return formatter(schema)
+
+
+def parse_schema_json(content: str) -> SchemaDict:
+    """Parse schema from JSON format.
+
+    Args:
+        content: JSON string with table -> {column -> type} structure.
+
+    Returns:
+        Parsed schema dictionary.
+    """
+    return json.loads(content)  # type: ignore[no-any-return]
+
+
+def parse_schema_csv(content: str) -> SchemaDict:
+    """Parse schema from CSV format.
+
+    Expects columns: table, column, type.
+
+    Args:
+        content: CSV string with header row.
+
+    Returns:
+        Parsed schema dictionary.
+    """
+    schema: SchemaDict = {}
+    reader = csv.DictReader(StringIO(content))
+    for row in reader:
+        table = row["table"]
+        column = row["column"]
+        col_type = row.get("type", "UNKNOWN")
+        if table not in schema:
+            schema[table] = {}
+        schema[table][column] = col_type
+    return schema
+
+
+def parse_schema_text(content: str) -> SchemaDict:
+    """Parse schema from indented text format.
+
+    Expected format:
+        table_name
+            column1
+            column2
+
+        other_table
+            col_a
+
+    Args:
+        content: Text-formatted schema string.
+
+    Returns:
+        Parsed schema dictionary.
+    """
+    schema: SchemaDict = {}
+    current_table: str | None = None
+    for line in content.splitlines():
+        if not line or not line.strip():
+            continue
+        if line.startswith(" "):
+            if current_table is not None:
+                schema[current_table][line.strip()] = "UNKNOWN"
+        else:
+            current_table = line.strip()
+            schema[current_table] = {}
+    return schema
+
+
+def load_schema_file(path: Path) -> SchemaDict:
+    """Load a schema file, auto-detecting format from extension.
+
+    `.json` → JSON, `.csv` → CSV, otherwise text.
+
+    Args:
+        path: Path to schema file.
+
+    Returns:
+        Parsed schema dictionary.
+
+    Raises:
+        FileNotFoundError: If the file does not exist.
+    """
+    content = path.read_text(encoding="utf-8")
+    suffix = path.suffix.lower()
+    if suffix == ".json":
+        return parse_schema_json(content)
+    elif suffix == ".csv":
+        return parse_schema_csv(content)
+    else:
+        return parse_schema_text(content)
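
A small sketch of the new schema-file round trip, relying only on the signatures added above; the file name, tables, and columns are hypothetical:

from pathlib import Path

from sqlglider.graph.formatters import format_schema, load_schema_file

# Text format: table names flush left, column names indented beneath them.
Path("schema.txt").write_text(
    "orders\n  order_id\n  customer_id\n\ncustomers\n  customer_id\n  name\n",
    encoding="utf-8",
)

schema = load_schema_file(Path("schema.txt"))  # ".txt" falls through to the text parser
print(schema["orders"])               # {'order_id': 'UNKNOWN', 'customer_id': 'UNKNOWN'}
print(format_schema(schema, "json"))  # same data, re-emitted as JSON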
sqlglider/schema/__init__.py
File without changes
sqlglider/schema/extractor.py
ADDED

@@ -0,0 +1,205 @@
+"""Shared schema extraction logic for inferring table schemas from SQL files."""
+
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from rich.console import Console
+from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
+
+from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
+from sqlglider.utils.file_utils import read_sql_file
+from sqlglider.utils.schema import parse_ddl_to_schema
+
+SchemaDict = Dict[str, Dict[str, str]]
+SqlPreprocessor = Callable[[str, Path], str]
+
+
+def extract_schemas_from_files(
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    initial_schema: Optional[SchemaDict] = None,
+    strict_schema: bool = False,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Extract schema from SQL files by parsing DDL and inferring from DQL.
+
+    Iterates through files, accumulating schema knowledge. Each file's
+    inferred schema is available when parsing subsequent files.
+
+    Args:
+        file_paths: SQL files to extract schema from.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional function to preprocess SQL (e.g., templating).
+        initial_schema: Optional starting schema to build upon.
+        strict_schema: If True, fail on ambiguous column attribution.
+        console: Rich console for output. Uses stderr if not provided.
+
+    Returns:
+        Accumulated schema dict mapping table names to column dicts.
+    """
+    if console is None:
+        console = Console(stderr=True)
+
+    schema: SchemaDict = dict(initial_schema) if initial_schema else {}
+    total = len(file_paths)
+
+    with Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        TaskProgressColumn(),
+        console=console,
+        transient=False,
+    ) as progress:
+        task = progress.add_task("Extracting schema", total=total)
+        for i, file_path in enumerate(file_paths, start=1):
+            console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+            try:
+                sql_content = read_sql_file(file_path)
+                if sql_preprocessor:
+                    sql_content = sql_preprocessor(sql_content, file_path)
+                analyzer = LineageAnalyzer(
+                    sql_content,
+                    dialect=dialect,
+                    schema=schema,
+                    strict_schema=strict_schema,
+                )
+                file_schema = analyzer.extract_schema_only()
+                schema.update(file_schema)
+            except SchemaResolutionError:
+                raise
+            except Exception:
+                # Schema extraction failures are non-fatal; the file
+                # will be reported during the lineage pass if it also fails.
+                pass
+            progress.advance(task)
+    return schema
+
+
+def fill_schema_from_catalog(
+    schema: SchemaDict,
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    catalog_type: str = "databricks",
+    catalog_config: Optional[Dict[str, object]] = None,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Pull DDL from catalog for tables not yet in schema.
+
+    Args:
+        schema: Schema dict already populated from file extraction.
+        file_paths: SQL files to scan for table references.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional SQL preprocessor.
+        catalog_type: Catalog provider name.
+        catalog_config: Provider-specific configuration dict.
+        console: Rich console for output.
+
+    Returns:
+        Updated schema dict with catalog-sourced entries added.
+    """
+    from sqlglider.catalog import get_catalog
+    from sqlglider.lineage.analyzer import ObjectType
+
+    if console is None:
+        console = Console(stderr=True)
+
+    catalog = get_catalog(catalog_type)
+    if catalog_config:
+        catalog.configure(catalog_config)
+
+    # Collect all referenced table names across files
+    all_tables: set[str] = set()
+    for file_path in file_paths:
+        try:
+            sql_content = read_sql_file(file_path)
+            if sql_preprocessor:
+                sql_content = sql_preprocessor(sql_content, file_path)
+            analyzer = LineageAnalyzer(sql_content, dialect=dialect)
+            tables_results = analyzer.analyze_tables()
+            for result in tables_results:
+                for table_info in result.tables:
+                    if table_info.object_type != ObjectType.CTE:
+                        all_tables.add(table_info.name)
+        except Exception:
+            pass
+
+    # Find tables missing from schema
+    missing = [t for t in all_tables if t not in schema]
+    if not missing:
+        return schema
+
+    console.print(
+        f"[blue]Pulling DDL from {catalog_type} for {len(missing)} table(s)...[/blue]"
+    )
+
+    ddl_results = catalog.get_ddl_batch(missing)
+    for table_name, ddl in ddl_results.items():
+        if ddl.startswith("ERROR:"):
+            console.print(
+                f"[yellow]Warning:[/yellow] Could not pull DDL for {table_name}: {ddl}"
+            )
+            continue
+        parsed_schema = parse_ddl_to_schema(ddl, dialect=dialect)
+        for name, cols in parsed_schema.items():
+            if name not in schema:
+                schema[name] = cols
+
+    return schema
+
+
+def extract_and_resolve_schema(
+    file_paths: List[Path],
+    dialect: str = "spark",
+    sql_preprocessor: Optional[SqlPreprocessor] = None,
+    initial_schema: Optional[SchemaDict] = None,
+    strict_schema: bool = False,
+    catalog_type: Optional[str] = None,
+    catalog_config: Optional[Dict[str, object]] = None,
+    console: Optional[Console] = None,
+) -> SchemaDict:
+    """Extract schema from files and optionally fill from catalog.
+
+    High-level orchestrator that runs file-based extraction followed
+    by optional catalog resolution.
+
+    Args:
+        file_paths: SQL files to extract schema from.
+        dialect: SQL dialect.
+        sql_preprocessor: Optional SQL preprocessor.
+        initial_schema: Optional starting schema to build upon.
+        strict_schema: If True, fail on ambiguous column attribution.
+        catalog_type: Optional catalog provider name.
+        catalog_config: Optional provider-specific configuration dict.
+        console: Rich console for output.
+
+    Returns:
+        Resolved schema dict.
+    """
+    if console is None:
+        console = Console(stderr=True)
+
+    console.print("[blue]Extracting schema from files[/blue]")
+    schema = extract_schemas_from_files(
+        file_paths,
+        dialect=dialect,
+        sql_preprocessor=sql_preprocessor,
+        initial_schema=initial_schema,
+        strict_schema=strict_schema,
+        console=console,
+    )
+
+    if catalog_type:
+        schema = fill_schema_from_catalog(
+            schema,
+            file_paths,
+            dialect=dialect,
+            sql_preprocessor=sql_preprocessor,
+            catalog_type=catalog_type,
+            catalog_config=catalog_config,
+            console=console,
+        )
+
+    console.print(f"[blue]Schema resolved for {len(schema)} table(s)[/blue]")
+    return schema
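
For reference, a minimal sketch of calling the new orchestrator directly; the signature comes from the module above, while the file paths are hypothetical placeholders and the catalog pass is left off (catalog_type=None skips it):

from pathlib import Path

from sqlglider.schema.extractor import extract_and_resolve_schema

files = [Path("models/orders.sql"), Path("models/customers.sql")]  # hypothetical paths

schema = extract_and_resolve_schema(
    files,
    dialect="spark",        # module default
    strict_schema=False,    # do not fail on ambiguous column attribution
    catalog_type=None,      # e.g. "databricks" would pull DDL for missing tables
)
for table, columns in schema.items():
    print(table, sorted(columns))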
{sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/WHEEL
File without changes

{sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/entry_points.txt
File without changes

{sql_glider-0.1.13.dist-info → sql_glider-0.1.15.dist-info}/licenses/LICENSE
File without changes