sql_glider-0.1.11-py3-none-any.whl → sql_glider-0.1.13-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: sql-glider
- Version: 0.1.11
+ Version: 0.1.13
  Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
  Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
  Project-URL: Repository, https://github.com/rycowhi/sql-glider/
@@ -1,6 +1,6 @@
  sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
- sqlglider/_version.py,sha256=0-Ruc52ECccw_8Ef0d7jMkzrb8fkobUkZLqGGvcm1ik,706
- sqlglider/cli.py,sha256=DMCMw5dxDHB2MuxBXuJMNeDSlIGAfKDz1Renp0YwGGM,52224
+ sqlglider/_version.py,sha256=Xz5RLbyPcCHHXte393JYfUy4Dt7uaeWyrGVw9SmJ0eg,706
+ sqlglider/cli.py,sha256=FDTjRmor_cQlcwfiD_uHTrQao2sMf3ev21IUyUSt7Qs,56401
  sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
  sqlglider/catalog/__init__.py,sha256=2PqFPyzFXJ14FpSUcBmVK2L-a_ypWQHAbHFHxLDk_LE,814
  sqlglider/catalog/base.py,sha256=R7htHC43InpH4uRjYk33dMYYji6oylHns7Ye_mgfjJE,3116
@@ -11,13 +11,14 @@ sqlglider/dissection/analyzer.py,sha256=-GD3-lTbfBthq1BW6HiDjvJx2y4LDmnUVHIVIb0H
  sqlglider/dissection/formatters.py,sha256=M7gsmTNljRIeLIRv4D0vHvqJVrTqWSpsg7vem83zSzY,7302
  sqlglider/dissection/models.py,sha256=RRD3RIteqbUBY6e-74skKDvMH3qeAUaqA2sFcrjP5GQ,3618
  sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
- sqlglider/graph/builder.py,sha256=HdkMcuZkxdEFO0CXMAaqGQSyhvzuaIQTaFscQdO2GSI,12146
+ sqlglider/graph/builder.py,sha256=suxc_hymHvHnkgltgXqwwIoxlay7zhy1Enbs6HNC3m8,20107
+ sqlglider/graph/formatters.py,sha256=EGgdxTr9Mctz9tTN54XIjoX0KGNcpiSKsW3o27dhMxo,2549
  sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
  sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
  sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
  sqlglider/graph/serialization.py,sha256=vMXn7s35jA499e7l90vNVaJE_3QR_VHf3rEfQ9ZlgTQ,2781
  sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
- sqlglider/lineage/analyzer.py,sha256=gjJtJU-sxFokoSVxcHpcIdbP3H8GD_KQaubbbcG0UCM,68982
+ sqlglider/lineage/analyzer.py,sha256=08pFR5aGFFPhSbRW6EqiX2d3mp91v-orcs6dm_T1FJg,76484
  sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
  sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
  sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
@@ -25,10 +26,11 @@ sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g
  sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
  sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
  sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
- sqlglider/utils/config.py,sha256=rbbiDCWA_h29vgWJZ1z3zQmGcei0KcxhTPcymSCYeFo,4796
+ sqlglider/utils/config.py,sha256=qx5zE9pjLCCzHQDFVPLVd7LgJ-lghxUa2x-aZOAHByY,4962
  sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
- sql_glider-0.1.11.dist-info/METADATA,sha256=JxQakiYUUzvldsEzjdXQLV63ud07Gw_bcZ2BIi29nuQ,28446
- sql_glider-0.1.11.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- sql_glider-0.1.11.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
- sql_glider-0.1.11.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- sql_glider-0.1.11.dist-info/RECORD,,
+ sqlglider/utils/schema.py,sha256=-0Vd1A3EggBH3reXTiabO0zFeTENROgmDg861X1D7Qs,1867
+ sql_glider-0.1.13.dist-info/METADATA,sha256=z-utivkULH1BBhygNpLcWN9UdU1DbwfF3EzUhGtWXes,28446
+ sql_glider-0.1.13.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ sql_glider-0.1.13.dist-info/entry_points.txt,sha256=HDuakHqHS5C0HFKsMIxMYmDU7-BLBGrnIJcYaVRu-s0,251
+ sql_glider-0.1.13.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sql_glider-0.1.13.dist-info/RECORD,,
sqlglider/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.11'
- __version_tuple__ = version_tuple = (0, 1, 11)
+ __version__ = version = '0.1.13'
+ __version_tuple__ = version_tuple = (0, 1, 13)

  __commit_id__ = commit_id = None
sqlglider/cli.py CHANGED
@@ -12,7 +12,7 @@ from sqlglot.errors import ParseError
  from typing_extensions import Annotated

  from sqlglider.global_models import AnalysisLevel, NodeFormat
- from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
  from sqlglider.lineage.formatters import (
  CsvFormatter,
  JsonFormatter,
@@ -1001,6 +1001,35 @@ def graph_build(
  "--no-star",
  help="Fail if SELECT * cannot be resolved to actual columns",
  ),
+ resolve_schema: bool = typer.Option(
+ False,
+ "--resolve-schema",
+ help="Extract schema from all files before lineage analysis, "
+ "enabling cross-file star resolution",
+ ),
+ catalog_type: Optional[str] = typer.Option(
+ None,
+ "--catalog-type",
+ "-c",
+ help="Catalog provider for pulling DDL of tables not found in files "
+ "(requires --resolve-schema). E.g. 'databricks'",
+ ),
+ dump_schema: Optional[Path] = typer.Option(
+ None,
+ "--dump-schema",
+ help="Dump resolved schema to file (requires --resolve-schema)",
+ ),
+ dump_schema_format: Optional[str] = typer.Option(
+ None,
+ "--dump-schema-format",
+ help="Format for dumped schema: 'text' (default), 'json', or 'csv'",
+ ),
+ strict_schema: bool = typer.Option(
+ False,
+ "--strict-schema",
+ help="Fail if any column's table cannot be identified during schema extraction "
+ "(requires --resolve-schema)",
+ ),
  ) -> None:
  """
  Build a lineage graph from SQL files.
@@ -1036,6 +1065,37 @@ def graph_build(
  dialect = dialect or config.dialect or "spark"
  templater = templater or config.templater # None means no templating
  no_star = no_star or config.no_star or False
+ resolve_schema = resolve_schema or config.resolve_schema or False
+ strict_schema = strict_schema or config.strict_schema or False
+
+ if strict_schema and not resolve_schema:
+ err_console.print("[red]Error:[/red] --strict-schema requires --resolve-schema")
+ raise typer.Exit(1)
+
+ if catalog_type and not resolve_schema:
+ err_console.print("[red]Error:[/red] --catalog-type requires --resolve-schema")
+ raise typer.Exit(1)
+
+ # Resolve dump_schema options from config
+ dump_schema = dump_schema or (
+ Path(config.dump_schema) if config.dump_schema else None
+ )
+ dump_schema_format = dump_schema_format or config.dump_schema_format or "text"
+
+ if dump_schema and not resolve_schema:
+ err_console.print("[red]Error:[/red] --dump-schema requires --resolve-schema")
+ raise typer.Exit(1)
+
+ if dump_schema_format not in ("text", "json", "csv"):
+ err_console.print(
+ f"[red]Error:[/red] Invalid --dump-schema-format '{dump_schema_format}'. "
+ "Use 'text', 'json', or 'csv'."
+ )
+ raise typer.Exit(1)
+
+ # Only inherit catalog_type from config when resolve_schema is active
+ if resolve_schema and not catalog_type:
+ catalog_type = config.catalog_type

  # Validate and convert node format to enum
  try:
@@ -1088,32 +1148,75 @@ def graph_build(
  sql_preprocessor = _preprocess

  try:
+ # Build catalog config from config file if available
+ catalog_config_dict = None
+ if catalog_type and config.catalog:
+ provider_config = getattr(config.catalog, catalog_type, None)
+ if provider_config:
+ catalog_config_dict = provider_config.model_dump(exclude_none=True)
+
  builder = GraphBuilder(
  node_format=node_format_enum,
  dialect=dialect,
  sql_preprocessor=sql_preprocessor,
  no_star=no_star,
+ resolve_schema=resolve_schema,
+ catalog_type=catalog_type,
+ catalog_config=catalog_config_dict,
+ strict_schema=strict_schema,
  )

- # Process manifest if provided
- if manifest:
- builder.add_manifest(manifest, dialect=dialect)
-
- # Process paths - collect all files first for progress tracking
+ # Collect file paths for schema extraction
+ path_files: list[Path] = []
  if paths:
- all_files: list[Path] = []
  for path in paths:
  if path.is_dir():
  pattern = f"**/{glob_pattern}" if recursive else glob_pattern
- all_files.extend(
+ path_files.extend(
  f for f in sorted(path.glob(pattern)) if f.is_file()
  )
  elif path.is_file():
- all_files.append(path)
+ path_files.append(path)
  else:
  err_console.print(f"[red]Error:[/red] Path not found: {path}")
  raise typer.Exit(1)
- builder.add_files(all_files, dialect=dialect)
+
+ manifest_files: list[Path] = []
+ if manifest:
+ from sqlglider.graph.models import Manifest
+
+ manifest_data = Manifest.from_csv(manifest)
+ base_dir = manifest.parent
+ for entry in manifest_data.entries:
+ file_path = Path(entry.file_path)
+ if not file_path.is_absolute():
+ file_path = (base_dir / entry.file_path).resolve()
+ manifest_files.append(file_path)
+
+ # Extract schema upfront if requested, then dump before graph building
+ all_files = manifest_files + path_files
+ if resolve_schema and all_files:
+ builder.extract_schemas(all_files, dialect=dialect)
+
+ if dump_schema:
+ from sqlglider.graph.formatters import format_schema
+
+ schema_content = format_schema(
+ builder.resolved_schema, dump_schema_format
+ )
+ dump_schema.write_text(schema_content, encoding="utf-8")
+ console.print(
+ f"[green]Schema dumped to {dump_schema} "
+ f"({len(builder.resolved_schema)} table(s))[/green]"
+ )
+
+ # Process manifest if provided
+ if manifest:
+ builder.add_manifest(manifest, dialect=dialect)
+
+ # Process path-based files
+ if path_files:
+ builder.add_files(path_files, dialect=dialect)

  # Build and save graph
  graph = builder.build()
@@ -1124,6 +1227,10 @@ def graph_build(
  f"({graph.metadata.total_nodes} nodes, {graph.metadata.total_edges} edges)"
  )

+ except SchemaResolutionError as e:
+ err_console.print(f"[red]Error:[/red] {e}")
+ raise typer.Exit(1)
+
  except FileNotFoundError as e:
  err_console.print(f"[red]Error:[/red] {e}")
  raise typer.Exit(1)
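
The new options wire up a two-pass flow: schema is extracted from every input file (and optionally a catalog) before any lineage is analyzed. Below is a minimal sketch of driving the same flow through GraphBuilder directly, assuming sql-glider 0.1.13 is installed; the file paths are placeholders and the NodeFormat member is chosen arbitrarily because the valid values are not visible in this diff.

    from pathlib import Path

    from sqlglider.global_models import NodeFormat
    from sqlglider.graph.builder import GraphBuilder

    files = [Path("models/customers.sql"), Path("models/orders.sql")]  # placeholder inputs

    builder = GraphBuilder(
        node_format=next(iter(NodeFormat)),  # pick whichever output format you actually need
        dialect="spark",
        resolve_schema=True,    # enable the two-pass behavior
        strict_schema=False,    # True raises SchemaResolutionError on ambiguous columns
    )
    builder.extract_schemas(files, dialect="spark")  # pass 1: schema from all files
    builder.add_files(files, dialect="spark")        # pass 2: lineage with the shared schema
    graph = builder.build()
    print(f"{len(builder.resolved_schema)} table(s) resolved")
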
sqlglider/graph/builder.py CHANGED
@@ -16,8 +16,9 @@ from sqlglider.graph.models import (
  LineageGraph,
  Manifest,
  )
- from sqlglider.lineage.analyzer import LineageAnalyzer
+ from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError
  from sqlglider.utils.file_utils import read_sql_file
+ from sqlglider.utils.schema import parse_ddl_to_schema

  console = Console(stderr=True)

@@ -34,6 +35,10 @@ class GraphBuilder:
  dialect: str = "spark",
  sql_preprocessor: Optional[SqlPreprocessor] = None,
  no_star: bool = False,
+ resolve_schema: bool = False,
+ catalog_type: Optional[str] = None,
+ catalog_config: Optional[Dict[str, object]] = None,
+ strict_schema: bool = False,
  ):
  """
  Initialize the graph builder.
@@ -45,16 +50,31 @@ class GraphBuilder:
  Takes (sql: str, file_path: Path) and returns processed SQL.
  Useful for templating (e.g., Jinja2 rendering).
  no_star: If True, fail when SELECT * cannot be resolved to columns
+ resolve_schema: If True, run a schema extraction pass across all
+ files before lineage analysis so that schema from any file is
+ available when analyzing every other file.
+ catalog_type: Optional catalog provider name (e.g. "databricks").
+ When set together with resolve_schema, DDL is pulled from the
+ catalog for tables whose schema could not be inferred from files.
+ catalog_config: Optional provider-specific configuration dict
+ passed to the catalog's configure() method.
+ strict_schema: If True, fail during schema extraction when an
+ unqualified column cannot be attributed to a table.
  """
  self.node_format = node_format
  self.dialect = dialect
  self.sql_preprocessor = sql_preprocessor
  self.no_star = no_star
+ self.resolve_schema = resolve_schema
+ self.catalog_type = catalog_type
+ self.catalog_config = catalog_config
+ self.strict_schema = strict_schema
  self.graph: rx.PyDiGraph = rx.PyDiGraph()
  self._node_index_map: Dict[str, int] = {} # identifier -> rustworkx node index
  self._source_files: Set[str] = set()
  self._edge_set: Set[tuple] = set() # (source, target) for dedup
  self._skipped_files: List[tuple[str, str]] = [] # (file_path, reason)
+ self._resolved_schema: Dict[str, Dict[str, str]] = {} # accumulated schema

  def add_file(
  self,
@@ -86,7 +106,10 @@ class GraphBuilder:
  sql_content = self.sql_preprocessor(sql_content, file_path)

  analyzer = LineageAnalyzer(
- sql_content, dialect=file_dialect, no_star=self.no_star
+ sql_content,
+ dialect=file_dialect,
+ no_star=self.no_star,
+ schema=self._resolved_schema if self._resolved_schema else None,
  )
  results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

@@ -209,23 +232,28 @@ class GraphBuilder:
  entry_dialect = entry.dialect or dialect or self.dialect
  files_with_dialects.append((file_path, entry_dialect))

- # Process with progress
- if files_with_dialects:
- total = len(files_with_dialects)
- with Progress(
- TextColumn("[progress.description]{task.description}"),
- BarColumn(),
- TaskProgressColumn(),
- console=console,
- transient=False,
- ) as progress:
- task = progress.add_task("Parsing", total=total)
- for i, (file_path, file_dialect) in enumerate(
- files_with_dialects, start=1
- ):
- console.print(f"Parsing file {i}/{total}: {file_path.name}")
- self.add_file(file_path, file_dialect)
- progress.advance(task)
+ if not files_with_dialects:
+ return self
+
+ # Two-pass schema resolution (skip if already resolved)
+ if self.resolve_schema and not self._resolved_schema:
+ file_paths_only = [fp for fp, _ in files_with_dialects]
+ self.extract_schemas(file_paths_only, dialect)
+
+ total = len(files_with_dialects)
+ description = "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+ with Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ TaskProgressColumn(),
+ console=console,
+ transient=False,
+ ) as progress:
+ task = progress.add_task(description, total=total)
+ for i, (file_path, file_dialect) in enumerate(files_with_dialects, start=1):
+ console.print(f"Parsing file {i}/{total}: {file_path.name}")
+ self.add_file(file_path, file_dialect)
+ progress.advance(task)

  return self

@@ -249,8 +277,15 @@ class GraphBuilder:
  if not file_paths:
  return self

+ # Two-pass schema resolution (skip if already resolved)
+ if self.resolve_schema and not self._resolved_schema:
+ self.extract_schemas(file_paths, dialect)
+
  if show_progress:
  total = len(file_paths)
+ description = (
+ "Pass 2: Analyzing lineage" if self.resolve_schema else "Parsing"
+ )
  with Progress(
  TextColumn("[progress.description]{task.description}"),
  BarColumn(),
@@ -258,7 +293,7 @@ class GraphBuilder:
  console=console,
  transient=False,
  ) as progress:
- task = progress.add_task("Parsing", total=total)
+ task = progress.add_task(description, total=total)
  for i, file_path in enumerate(file_paths, start=1):
  console.print(f"Parsing file {i}/{total}: {file_path.name}")
  self.add_file(file_path, dialect)
@@ -268,6 +303,157 @@ class GraphBuilder:
  self.add_file(file_path, dialect)
  return self

+ def extract_schemas(
+ self,
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Run schema extraction pass and optionally fill from catalog.
+
+ Call this before add_files/add_manifest to resolve schema upfront.
+ The resolved schema is stored internally and also returned.
+
+ Args:
+ file_paths: SQL files to extract schema from
+ dialect: SQL dialect override
+
+ Returns:
+ Resolved schema dict
+ """
+ console.print("[blue]Pass 1: Extracting schema from files[/blue]")
+ self._resolved_schema = self._extract_schemas(file_paths, dialect)
+ if self.catalog_type:
+ self._resolved_schema = self._fill_schema_from_catalog(
+ self._resolved_schema, file_paths, dialect
+ )
+ console.print(
+ f"[blue]Schema resolved for {len(self._resolved_schema)} table(s)[/blue]"
+ )
+ return self._resolved_schema.copy()
+
+ def _extract_schemas(
+ self,
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Run schema extraction pass across all files.
+
+ Parses each file and extracts schema from CREATE TABLE/VIEW
+ statements without performing lineage analysis.
+
+ Args:
+ file_paths: SQL files to extract schema from
+ dialect: SQL dialect override
+
+ Returns:
+ Accumulated schema dict from all files
+ """
+ schema: Dict[str, Dict[str, str]] = {}
+ total = len(file_paths)
+ with Progress(
+ TextColumn("[progress.description]{task.description}"),
+ BarColumn(),
+ TaskProgressColumn(),
+ console=console,
+ transient=False,
+ ) as progress:
+ task = progress.add_task("Pass 1: Extracting schema", total=total)
+ for i, file_path in enumerate(file_paths, start=1):
+ console.print(f"Extracting schema {i}/{total}: {file_path.name}")
+ file_dialect = dialect or self.dialect
+ try:
+ sql_content = read_sql_file(file_path)
+ if self.sql_preprocessor:
+ sql_content = self.sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(
+ sql_content,
+ dialect=file_dialect,
+ schema=schema,
+ strict_schema=self.strict_schema,
+ )
+ file_schema = analyzer.extract_schema_only()
+ schema.update(file_schema)
+ except SchemaResolutionError:
+ raise
+ except Exception:
+ # Schema extraction failures are non-fatal; the file
+ # will be reported during the lineage pass if it also fails.
+ pass
+ progress.advance(task)
+ return schema
+
+ def _fill_schema_from_catalog(
+ self,
+ schema: Dict[str, Dict[str, str]],
+ file_paths: List[Path],
+ dialect: Optional[str] = None,
+ ) -> Dict[str, Dict[str, str]]:
+ """Pull DDL from catalog for tables not yet in schema.
+
+ Extracts all table names referenced across the files, identifies
+ those missing from the schema, and fetches their DDL from the
+ configured catalog provider.
+
+ Args:
+ schema: Schema dict already populated from file extraction
+ file_paths: SQL files to scan for table references
+ dialect: SQL dialect override
+
+ Returns:
+ Updated schema dict with catalog-sourced entries added
+ """
+ from sqlglider.catalog import get_catalog
+
+ catalog = get_catalog(self.catalog_type) # type: ignore[arg-type]
+ if self.catalog_config:
+ catalog.configure(self.catalog_config)
+
+ # Collect all referenced table names across files
+ all_tables: Set[str] = set()
+ for file_path in file_paths:
+ file_dialect = dialect or self.dialect
+ try:
+ sql_content = read_sql_file(file_path)
+ if self.sql_preprocessor:
+ sql_content = self.sql_preprocessor(sql_content, file_path)
+ analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
+ tables_results = analyzer.analyze_tables()
+ for result in tables_results:
+ for table_info in result.tables:
+ # Skip CTEs — they don't exist in catalogs
+ from sqlglider.lineage.analyzer import ObjectType
+
+ if table_info.object_type != ObjectType.CTE:
+ all_tables.add(table_info.name)
+ except Exception:
+ pass
+
+ # Find tables missing from schema
+ missing = [t for t in all_tables if t not in schema]
+ if not missing:
+ return schema
+
+ console.print(
+ f"[blue]Pulling DDL from {self.catalog_type} "
+ f"for {len(missing)} table(s)...[/blue]"
+ )
+
+ ddl_results = catalog.get_ddl_batch(missing)
+ file_dialect = dialect or self.dialect
+ for table_name, ddl in ddl_results.items():
+ if ddl.startswith("ERROR:"):
+ console.print(
+ f"[yellow]Warning:[/yellow] Could not pull DDL "
+ f"for {table_name}: {ddl}"
+ )
+ continue
+ parsed_schema = parse_ddl_to_schema(ddl, dialect=file_dialect)
+ for name, cols in parsed_schema.items():
+ if name not in schema:
+ schema[name] = cols
+
+ return schema
+
  def _ensure_node(
  self,
  identifier: str,
@@ -348,6 +534,11 @@ class GraphBuilder:
  """Get mapping from node identifiers to rustworkx indices."""
  return self._node_index_map.copy()

+ @property
+ def resolved_schema(self) -> Dict[str, Dict[str, str]]:
+ """Get the resolved schema dictionary from schema extraction pass."""
+ return self._resolved_schema.copy()
+
  @property
  def skipped_files(self) -> List[tuple[str, str]]:
  """Get list of files that were skipped during graph building."""
sqlglider/graph/formatters.py ADDED
@@ -0,0 +1,98 @@
+ """Output formatters for resolved schema data."""
+
+ import csv
+ import json
+ from io import StringIO
+ from typing import Dict
+
+ SchemaDict = Dict[str, Dict[str, str]]
+
+
+ def format_schema_text(schema: SchemaDict) -> str:
+ """Format resolved schema as human-readable text.
+
+ Output format:
+ customers
+ id
+ name
+
+ schema.orders
+ order_id
+ customer_id
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ Text-formatted string.
+ """
+ lines: list[str] = []
+ for table_name in sorted(schema):
+ if lines:
+ lines.append("")
+ lines.append(table_name)
+ for column_name in sorted(schema[table_name]):
+ lines.append(f" {column_name}")
+ return "\n".join(lines) + "\n" if lines else ""
+
+
+ def format_schema_json(schema: SchemaDict) -> str:
+ """Format resolved schema as JSON.
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ JSON-formatted string.
+ """
+ sorted_schema = {k: schema[k] for k in sorted(schema)}
+ return json.dumps(sorted_schema, indent=2)
+
+
+ def format_schema_csv(schema: SchemaDict) -> str:
+ """Format resolved schema as CSV.
+
+ Output format:
+ table,column,type
+ customers,id,UNKNOWN
+ customers,name,UNKNOWN
+
+ Args:
+ schema: Resolved schema dictionary mapping table names to column dicts.
+
+ Returns:
+ CSV-formatted string.
+ """
+ output = StringIO()
+ writer = csv.writer(output)
+ writer.writerow(["table", "column", "type"])
+ for table_name in sorted(schema):
+ for column_name in sorted(schema[table_name]):
+ writer.writerow([table_name, column_name, schema[table_name][column_name]])
+ return output.getvalue()
+
+
+ def format_schema(schema: SchemaDict, output_format: str = "text") -> str:
+ """Format resolved schema in the specified format.
+
+ Args:
+ schema: Resolved schema dictionary.
+ output_format: One of "text", "json", or "csv".
+
+ Returns:
+ Formatted string.
+
+ Raises:
+ ValueError: If output_format is not recognized.
+ """
+ formatters = {
+ "text": format_schema_text,
+ "json": format_schema_json,
+ "csv": format_schema_csv,
+ }
+ formatter = formatters.get(output_format)
+ if formatter is None:
+ raise ValueError(
+ f"Invalid schema format '{output_format}'. Use 'text', 'json', or 'csv'."
+ )
+ return formatter(schema)
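
A quick check of the new formatters against a hand-written schema dict (the table and column names are invented for the example):

    from sqlglider.graph.formatters import format_schema

    schema = {"customers": {"id": "UNKNOWN", "name": "UNKNOWN"}}
    print(format_schema(schema))           # text: table name with indented columns
    print(format_schema(schema, "csv"))    # csv: table,column,type rows
    print(format_schema(schema, "json"))   # json: nested dict with sorted keys
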
sqlglider/lineage/analyzer.py CHANGED
@@ -15,6 +15,10 @@ class StarResolutionError(Exception):
  """Raised when SELECT * cannot be resolved and no_star mode is enabled."""


+ class SchemaResolutionError(Exception):
+ """Raised when a column's table cannot be identified and strict_schema is enabled."""
+
+
  class TableUsage(str, Enum):
  """How a table is used in a query."""

@@ -89,7 +93,14 @@ WarningCallback = Callable[[str], None]
  class LineageAnalyzer:
  """Analyze column and table lineage for SQL queries."""

- def __init__(self, sql: str, dialect: str = "spark", no_star: bool = False):
+ def __init__(
+ self,
+ sql: str,
+ dialect: str = "spark",
+ no_star: bool = False,
+ schema: Optional[Dict[str, Dict[str, str]]] = None,
+ strict_schema: bool = False,
+ ):
  """
  Initialize the lineage analyzer.

@@ -97,6 +108,12 @@ class LineageAnalyzer:
  sql: SQL query string to analyze (can contain multiple statements)
  dialect: SQL dialect (default: spark)
  no_star: If True, fail when SELECT * cannot be resolved to columns
+ schema: Optional external schema mapping table names to column
+ definitions (e.g. {"table": {"col": "UNKNOWN"}}). File-derived
+ schema from CREATE statements will merge on top.
+ strict_schema: If True, fail during schema extraction when an
+ unqualified column cannot be attributed to a table (e.g.
+ in a multi-table SELECT without table qualifiers).

  Raises:
  ParseError: If the SQL cannot be parsed
@@ -104,10 +121,12 @@ class LineageAnalyzer:
  self.sql = sql
  self.dialect = dialect
  self._no_star = no_star
+ self._strict_schema = strict_schema
  self._skipped_queries: List[SkippedQuery] = []
  # File-scoped schema context for cross-statement lineage
  # Maps table/view names to their column definitions
- self._file_schema: Dict[str, Dict[str, str]] = {}
+ self._initial_schema: Dict[str, Dict[str, str]] = dict(schema) if schema else {}
+ self._file_schema: Dict[str, Dict[str, str]] = dict(self._initial_schema)

  try:
  # Parse all statements in the SQL string
@@ -132,6 +151,27 @@ class LineageAnalyzer:
  """Get list of queries that were skipped during analysis."""
  return self._skipped_queries.copy()

+ def get_extracted_schema(self) -> Dict[str, Dict[str, str]]:
+ """Return the accumulated file schema after analysis."""
+ return dict(self._file_schema)
+
+ def extract_schema_only(self) -> Dict[str, Dict[str, str]]:
+ """Parse all statements and extract schema without running lineage.
+
+ Iterates through all expressions, extracting schema from:
+ 1. CREATE TABLE/VIEW AS SELECT statements (existing behavior)
+ 2. DQL statements by inferring table columns from qualified column
+ references (e.g., ``SELECT t.id FROM table t`` infers
+ ``table: {id: UNKNOWN}``)
+
+ Returns the accumulated schema dict.
+ """
+ self._file_schema = dict(self._initial_schema)
+ for expr in self.expressions:
+ self._extract_schema_from_statement(expr)
+ self._extract_schema_from_dql(expr)
+ return dict(self._file_schema)
+
  def get_output_columns(self) -> List[str]:
  """
  Extract all output column names from the query with full qualification.
@@ -426,7 +466,7 @@ class LineageAnalyzer:
  """
  results = []
  self._skipped_queries = [] # Reset skipped queries for this analysis
- self._file_schema = {} # Reset file schema for this analysis run
+ self._file_schema = dict(self._initial_schema) # Reset to external schema

  for query_index, expr, preview in self._iterate_queries(table_filter):
  # Temporarily swap self.expr to analyze this query
@@ -819,18 +859,31 @@ class LineageAnalyzer:
  else:
  current_query_sql = self.expr.sql(dialect=self.dialect)

+ # Prune schema to only tables referenced in this query to avoid
+ # sqlglot.lineage() performance degradation with large schema dicts
+ pruned_schema: Optional[Dict[str, Dict[str, str]]] = None
+ if self._file_schema:
+ referenced = {t.lower() for t in self._get_query_tables()}
+ pruned_schema = {
+ table: cols
+ for table, cols in self._file_schema.items()
+ if table.lower() in referenced
+ }
+ if not pruned_schema:
+ pruned_schema = None
+
  for col in columns_to_analyze:
  try:
  # Get the column name that lineage expects
  lineage_col = self._column_mapping.get(col, col)

  # Get lineage tree for this column using current query SQL only
- # Pass file schema to enable SELECT * expansion for known tables/views
+ # Pass pruned schema to enable SELECT * expansion for known tables/views
  node = lineage(
  lineage_col,
  current_query_sql,
  dialect=self.dialect,
- schema=self._file_schema if self._file_schema else None,
+ schema=pruned_schema,
  )

  # Collect all source columns
@@ -1427,6 +1480,119 @@ class LineageAnalyzer:
  # Store with UNKNOWN type - SQLGlot only needs column names for expansion
  self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}

+ def _extract_schema_from_dql(self, expr: exp.Expression) -> None:
+ """Infer table schemas from column references in DQL.
+
+ Walks SELECT statements and extracts table-column mappings from:
+ 1. Qualified column references (e.g., ``c.id``) — always resolved.
+ 2. Unqualified column references (e.g., ``id``) — only when the
+ SELECT has exactly one real table source (no joins), making
+ attribution unambiguous.
+
+ Aliases are resolved back to actual table names. CTEs and subquery
+ aliases are skipped since they don't represent external tables.
+
+ Args:
+ expr: The SQL expression to extract schema from.
+ """
+ # Find all SELECT nodes in the expression tree
+ selects = list(expr.find_all(exp.Select))
+ if not selects:
+ return
+
+ for select_node in selects:
+ # Build alias-to-table mapping for this SELECT scope
+ alias_map: Dict[str, str] = {}
+ cte_names: Set[str] = set()
+
+ # Collect CTE names so we can skip them
+ parent = select_node
+ while parent:
+ with_clause = parent.args.get("with")
+ if with_clause:
+ for cte in with_clause.expressions:
+ if isinstance(cte, exp.CTE) and cte.alias:
+ cte_names.add(cte.alias.lower())
+ parent = parent.parent if hasattr(parent, "parent") else None
+
+ # Collect subquery aliases so we can skip them too
+ subquery_aliases: Set[str] = set()
+ from_clause = select_node.args.get("from")
+ if from_clause and isinstance(from_clause, exp.From):
+ source = from_clause.this
+ if isinstance(source, exp.Subquery) and source.alias:
+ subquery_aliases.add(source.alias.lower())
+ for join in select_node.find_all(exp.Join):
+ if isinstance(join.this, exp.Subquery) and join.this.alias:
+ subquery_aliases.add(join.this.alias.lower())
+
+ # Build alias map from FROM/JOIN table references
+ real_tables: list[str] = [] # track non-CTE, non-subquery tables
+ for table_ref in select_node.find_all(exp.Table):
+ # Skip tables inside nested selects — they belong to inner scope
+ if table_ref.find_ancestor(exp.Select) is not select_node:
+ continue
+ qualified = self._get_qualified_table_name(table_ref)
+ if table_ref.alias:
+ alias_map[table_ref.alias.lower()] = qualified
+ else:
+ alias_map[table_ref.name.lower()] = qualified
+ # Track real tables (not CTEs or subqueries)
+ if (
+ qualified.lower() not in cte_names
+ and qualified.lower() not in subquery_aliases
+ ):
+ real_tables.append(qualified)
+
+ # Determine single-table target for unqualified columns
+ # Only set when exactly one real table source exists (unambiguous)
+ single_table: Optional[str] = (
+ real_tables[0] if len(real_tables) == 1 else None
+ )
+
+ # Walk all column references in this SELECT
+ for column in select_node.find_all(exp.Column):
+ if isinstance(column.this, exp.Star):
+ continue
+
+ table_ref_name = column.table
+ col_name = column.name
+
+ if table_ref_name:
+ # Qualified column — resolve alias to actual table
+ ref_lower = table_ref_name.lower()
+
+ # Skip CTE and subquery references
+ if ref_lower in cte_names or ref_lower in subquery_aliases:
+ continue
+
+ actual_table = alias_map.get(ref_lower)
+ if not actual_table:
+ continue
+
+ # Skip if it resolved to a CTE or subquery
+ if (
+ actual_table.lower() in cte_names
+ or actual_table.lower() in subquery_aliases
+ ):
+ continue
+ else:
+ # Unqualified column — attribute to single table if unambiguous
+ if not single_table:
+ if self._strict_schema:
+ preview = select_node.sql(dialect=self.dialect)[:80]
+ raise SchemaResolutionError(
+ f"Cannot resolve table for unqualified column "
+ f"'{col_name}' in multi-table query: {preview}"
+ )
+ continue
+ actual_table = single_table
+
+ if actual_table not in self._file_schema:
+ self._file_schema[actual_table] = {}
+ if col_name not in self._file_schema[actual_table]:
+ self._file_schema[actual_table][col_name] = "UNKNOWN"
+
  def _extract_columns_from_select(
  self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
  ) -> List[str]:
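
A small illustration of the DQL inference and strict mode, assuming sql-glider 0.1.13; the tables and columns are invented and the printed key order may differ:

    from sqlglider.lineage.analyzer import LineageAnalyzer, SchemaResolutionError

    sql = "SELECT c.id, c.name, o.amount FROM customers c JOIN orders o ON o.customer_id = c.id"
    print(LineageAnalyzer(sql, dialect="spark").extract_schema_only())
    # roughly: {'customers': {'id': 'UNKNOWN', 'name': 'UNKNOWN'},
    #           'orders': {'amount': 'UNKNOWN', 'customer_id': 'UNKNOWN'}}

    try:
        # Unqualified 'id' over two joined tables cannot be attributed to one table.
        LineageAnalyzer(
            "SELECT id FROM a JOIN b ON a.k = b.k", dialect="spark", strict_schema=True
        ).extract_schema_only()
    except SchemaResolutionError as exc:
        print("strict mode:", exc)
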
sqlglider/utils/config.py CHANGED
@@ -61,6 +61,10 @@ class ConfigSettings(BaseModel):
  ddl_folder: Optional[str] = None
  catalog: Optional[CatalogConfig] = None
  no_star: Optional[bool] = None
+ resolve_schema: Optional[bool] = None
+ dump_schema: Optional[str] = None
+ dump_schema_format: Optional[str] = None
+ strict_schema: Optional[bool] = None


  def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:
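
The same four knobs can come from the project config file; a minimal sketch of the equivalent settings object, assuming the remaining ConfigSettings fields all default to None the way the visible ones do:

    from sqlglider.utils.config import ConfigSettings

    settings = ConfigSettings(
        resolve_schema=True,
        dump_schema="resolved_schema.txt",  # hypothetical output path
        dump_schema_format="json",
        strict_schema=False,
    )
    print(settings.model_dump(exclude_none=True))
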
sqlglider/utils/schema.py ADDED
@@ -0,0 +1,62 @@
+ """Schema utilities for parsing DDL into schema dictionaries."""
+
+ from typing import Dict
+
+ from sqlglot import exp, parse
+
+
+ def parse_ddl_to_schema(ddl: str, dialect: str = "spark") -> Dict[str, Dict[str, str]]:
+ """Extract table schemas from DDL statements.
+
+ Parses CREATE TABLE/VIEW statements and extracts column names.
+ Only column names are needed — types are stored as "UNKNOWN" since
+ SQLGlot's lineage only uses names for star expansion.
+
+ Args:
+ ddl: SQL string containing one or more CREATE TABLE/VIEW statements
+ dialect: SQL dialect for parsing
+
+ Returns:
+ Schema dict mapping table names to column definitions,
+ e.g. {"my_table": {"id": "UNKNOWN", "name": "UNKNOWN"}}
+ """
+ schema: Dict[str, Dict[str, str]] = {}
+ expressions = parse(ddl, dialect=dialect)
+
+ for expr in expressions:
+ if expr is None:
+ continue
+ if not isinstance(expr, (exp.Create,)):
+ continue
+
+ # Get target table name
+ target = expr.this
+ if isinstance(target, exp.Schema):
+ # Schema node wraps the table and column definitions
+ columns = [
+ col.name for col in target.expressions if isinstance(col, exp.ColumnDef)
+ ]
+ target = target.this
+ else:
+ columns = []
+
+ if not isinstance(target, exp.Table):
+ continue
+
+ table_name = _get_qualified_name(target)
+
+ if columns:
+ schema[table_name] = {col: "UNKNOWN" for col in columns}
+
+ return schema
+
+
+ def _get_qualified_name(table: exp.Table) -> str:
+ """Build a qualified table name from a SQLGlot Table expression."""
+ parts = []
+ if table.catalog:
+ parts.append(table.catalog)
+ if table.db:
+ parts.append(table.db)
+ parts.append(table.name)
+ return ".".join(parts)