PyPI - sql-glider - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

sql-glider 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/METADATA +177 -5
sql_glider-0.1.4.dist-info/RECORD +34 -0
{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/entry_points.txt +3 -0
sqlglider/_version.py +2 -2
sqlglider/catalog/__init__.py +30 -0
sqlglider/catalog/base.py +99 -0
sqlglider/catalog/databricks.py +255 -0
sqlglider/catalog/registry.py +121 -0
sqlglider/cli.py +467 -15
sqlglider/dissection/__init__.py +17 -0
sqlglider/dissection/analyzer.py +767 -0
sqlglider/dissection/formatters.py +222 -0
sqlglider/dissection/models.py +112 -0
sqlglider/graph/builder.py +46 -8
sqlglider/lineage/analyzer.py +281 -13
sqlglider/utils/config.py +25 -0
sql_glider-0.1.2.dist-info/RECORD +0 -26
{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/WHEEL +0 -0
{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/licenses/LICENSE +0 -0

sqlglider/lineage/analyzer.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Core lineage analysis using SQLGlot."""
 from enum import Enum
-from typing import Callable, Iterator, List, Optional, Set, Tuple
+from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
 from pydantic import BaseModel, Field
 from sqlglot import exp, parse
@@ -99,6 +99,9 @@ class LineageAnalyzer:
         self.sql = sql
         self.dialect = dialect
         self._skipped_queries: List[SkippedQuery] = []
+        # File-scoped schema context for cross-statement lineage
+        # Maps table/view names to their column definitions
+        self._file_schema: Dict[str, Dict[str, str]] = {}
         try:
             # Parse all statements in the SQL string
@@ -155,7 +158,25 @@ class LineageAnalyzer:
         if target_table:
             # DML/DDL: Use target table for output column qualification
             # The columns are from the SELECT, but qualified with the target table
-            for projection in select_node.expressions:
+            projections = self._get_select_projections(select_node)
+            first_select = self._get_first_select(select_node)
+            for projection in projections:
+                # Handle SELECT * by resolving from file schema
+                if isinstance(projection, exp.Star):
+                    if first_select:
+                        star_columns = self._resolve_star_columns(first_select)
+                        for star_col in star_columns:
+                            qualified_name = f"{target_table}.{star_col}"
+                            columns.append(qualified_name)
+                            self._column_mapping[qualified_name] = star_col
+                    if not columns:
+                        # Fallback: can't resolve *, use * as column name
+                        qualified_name = f"{target_table}.*"
+                        columns.append(qualified_name)
+                        self._column_mapping[qualified_name] = "*"
+                    continue
                 # Get the underlying expression (unwrap alias if present)
                 if isinstance(projection, exp.Alias):
                     # For aliased columns, use the alias as the column name
@@ -178,7 +199,10 @@ class LineageAnalyzer:
         else:
             # DQL (pure SELECT): Use the SELECT columns as output
-            for projection in select_node.expressions:
+            projections = self._get_select_projections(select_node)
+            # Get the first SELECT for table resolution (handles UNION case)
+            first_select = self._get_first_select(select_node)
+            for projection in projections:
                 # Get the underlying expression (unwrap alias if present)
                 if isinstance(projection, exp.Alias):
                     source_expr = projection.this
@@ -195,20 +219,20 @@ class LineageAnalyzer:
                     table_name = source_expr.table
                     col_name = column_name or source_expr.name
-                    if table_name:
+                    if table_name and first_select:
                         # Resolve table reference (could be table, CTE, or subquery alias)
                         # This works at any nesting level because we're only looking at the immediate context
                         resolved_table = self._resolve_table_reference(
-                            table_name, select_node
+                            table_name, first_select
                         )
                         qualified_name = f"{resolved_table}.{col_name}"
                         columns.append(qualified_name)
                         # Map qualified name to what lineage expects
                         self._column_mapping[qualified_name] = lineage_name or col_name
-                    else:
+                    elif first_select:
                         # No table qualifier - try to infer from FROM clause
                         # This handles "SELECT col FROM single_source" cases
-                        inferred_table = self._infer_single_table_source(select_node)
+                        inferred_table = self._infer_single_table_source(first_select)
                         if inferred_table:
                             qualified_name = f"{inferred_table}.{col_name}"
                             columns.append(qualified_name)
@@ -219,6 +243,10 @@ class LineageAnalyzer:
                             # Can't infer table, just use column name
                             columns.append(col_name)
                             self._column_mapping[col_name] = lineage_name or col_name
+                    else:
+                        # No SELECT found, just use column name
+                        columns.append(col_name)
+                        self._column_mapping[col_name] = lineage_name or col_name
                 else:
                     # For other expressions (literals, functions, etc.)
                     # Use the alias if available, otherwise the SQL representation
@@ -232,6 +260,46 @@ class LineageAnalyzer:
         return columns
+    def _get_select_projections(self, node: exp.Expression) -> List[exp.Expression]:
+        """
+        Get the SELECT projections from a SELECT or set operation node.
+        For set operations (UNION, INTERSECT, EXCEPT), returns projections from
+        the first branch since all branches must have the same number of columns
+        with compatible types.
+        Args:
+            node: A SELECT or set operation (UNION/INTERSECT/EXCEPT) expression
+        Returns:
+            List of projection expressions from the SELECT clause
+        """
+        if isinstance(node, exp.Select):
+            return list(node.expressions)
+        elif isinstance(node, (exp.Union, exp.Intersect, exp.Except)):
+            # Recursively get from the left branch (could be nested set operations)
+            return self._get_select_projections(node.left)
+        return []
+    def _get_first_select(self, node: exp.Expression) -> Optional[exp.Select]:
+        """
+        Get the first SELECT node from a SELECT or set operation expression.
+        For set operations (UNION, INTERSECT, EXCEPT), returns the leftmost
+        SELECT branch.
+        Args:
+            node: A SELECT or set operation (UNION/INTERSECT/EXCEPT) expression
+        Returns:
+            The first SELECT node, or None if not found
+        """
+        if isinstance(node, exp.Select):
+            return node
+        elif isinstance(node, (exp.Union, exp.Intersect, exp.Except)):
+            return self._get_first_select(node.left)
+        return None
     def analyze_queries(
         self,
         level: AnalysisLevel = AnalysisLevel.COLUMN,
@@ -276,6 +344,7 @@ class LineageAnalyzer:
         """
         results = []
         self._skipped_queries = []  # Reset skipped queries for this analysis
+        self._file_schema = {}  # Reset file schema for this analysis run
         for query_index, expr, preview in self._iterate_queries(table_filter):
             # Temporarily swap self.expr to analyze this query
@@ -327,6 +396,9 @@ class LineageAnalyzer:
                     )
                 )
             finally:
+                # Extract schema from this statement AFTER analysis
+                # This builds up context for subsequent statements to use
+                self._extract_schema_from_statement(expr)
                 # Restore original expression
                 self.expr = original_expr
@@ -654,7 +726,13 @@ class LineageAnalyzer:
                 lineage_col = self._column_mapping.get(col, col)
                 # Get lineage tree for this column using current query SQL only
-                node = lineage(lineage_col, current_query_sql, dialect=self.dialect)
+                # Pass file schema to enable SELECT * expansion for known tables/views
+                node = lineage(
+                    lineage_col,
+                    current_query_sql,
+                    dialect=self.dialect,
+                    schema=self._file_schema if self._file_schema else None,
+                )
                 # Collect all source columns
                 sources: Set[str] = set()
@@ -795,7 +873,9 @@ class LineageAnalyzer:
     def _get_target_and_select(
         self,
-    ) -> Optional[tuple[Optional[str], exp.Select]]:
+    ) -> Optional[
+        tuple[Optional[str], Union[exp.Select, exp.Union, exp.Intersect, exp.Except]]
+    ]:
         """
         Detect if this is a DML/DDL statement and extract the target table and SELECT node.
@@ -817,9 +897,11 @@ class LineageAnalyzer:
             target = self.expr.this
             if isinstance(target, exp.Table):
                 target_name = self._get_qualified_table_name(target)
-                # Find the SELECT within the INSERT
+                # Find the SELECT within the INSERT (may be a set operation)
                 select_node = self.expr.expression
-                if isinstance(select_node, exp.Select):
+                if isinstance(
+                    select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+                ):
                     return (target_name, select_node)
         # Check for CREATE TABLE AS SELECT (CTAS) or CREATE VIEW AS SELECT
@@ -831,9 +913,11 @@ class LineageAnalyzer:
                     target = target.this
                 if isinstance(target, exp.Table):
                     target_name = self._get_qualified_table_name(target)
-                    # Find the SELECT in the expression
+                    # Find the SELECT in the expression (may be a set operation)
                     select_node = self.expr.expression
-                    if isinstance(select_node, exp.Select):
+                    if isinstance(
+                        select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+                    ):
                         return (target_name, select_node)
         # Check for MERGE statement
@@ -1181,3 +1265,187 @@ class LineageAnalyzer:
             preview = self._generate_query_preview(expr)
             yield idx, expr, preview
+    # -------------------------------------------------------------------------
+    # File-scoped schema context methods
+    # -------------------------------------------------------------------------
+    def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
+        """
+        Extract column definitions from CREATE VIEW/TABLE AS SELECT statements.
+        This method builds up file-scoped schema context as statements are processed,
+        enabling SQLGlot to correctly expand SELECT * and trace cross-statement references.
+        Args:
+            expr: The SQL expression to extract schema from
+        """
+        # Only handle CREATE VIEW or CREATE TABLE (AS SELECT)
+        if not isinstance(expr, exp.Create):
+            return
+        if expr.kind not in ("VIEW", "TABLE"):
+            return
+        # Get target table/view name
+        target = expr.this
+        if isinstance(target, exp.Schema):
+            target = target.this
+        if not isinstance(target, exp.Table):
+            return
+        target_name = self._get_qualified_table_name(target)
+        # Get the SELECT node from the CREATE statement
+        select_node = expr.expression
+        if select_node is None:
+            return
+        # Handle Subquery wrapper (e.g., CREATE VIEW AS (SELECT ...))
+        if isinstance(select_node, exp.Subquery):
+            select_node = select_node.this
+        if not isinstance(
+            select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+        ):
+            return
+        # Extract column names from the SELECT
+        columns = self._extract_columns_from_select(select_node)
+        if columns:
+            # Store with UNKNOWN type - SQLGlot only needs column names for expansion
+            self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}
+    def _extract_columns_from_select(
+        self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
+    ) -> List[str]:
+        """
+        Extract column names from a SELECT statement.
+        Handles aliases, direct column references, and SELECT * by resolving
+        against the known file schema.
+        Args:
+            select_node: The SELECT or set operation expression
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+        # Get projections (for UNION, use first branch)
+        projections = self._get_select_projections(select_node)
+        first_select = self._get_first_select(select_node)
+        for projection in projections:
+            if isinstance(projection, exp.Alias):
+                # Use the alias name as the column name
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                # Use the column name
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * from known schema
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    columns.extend(star_columns)
+            else:
+                # For expressions without alias, use SQL representation
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+        return columns
+    def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
+        """
+        Resolve SELECT * to actual column names from known file schema or CTEs.
+        Args:
+            select_node: The SELECT node containing the * reference
+        Returns:
+            List of column names if source is known, empty list otherwise
+        """
+        columns: List[str] = []
+        # Get the source table(s) from FROM clause
+        from_clause = select_node.args.get("from")
+        if not from_clause or not isinstance(from_clause, exp.From):
+            return columns
+        source = from_clause.this
+        # Handle table reference
+        if isinstance(source, exp.Table):
+            source_name = self._get_qualified_table_name(source)
+            # First check file schema (views/tables from previous statements)
+            if source_name in self._file_schema:
+                columns.extend(self._file_schema[source_name].keys())
+            else:
+                # Check if this is a CTE reference within the same statement
+                cte_columns = self._resolve_cte_columns(source_name, select_node)
+                columns.extend(cte_columns)
+        # Handle subquery - can't resolve without deeper analysis
+        elif isinstance(source, exp.Subquery) and source.alias:
+            # Check if this subquery alias is in file schema (unlikely)
+            if source.alias in self._file_schema:
+                columns.extend(self._file_schema[source.alias].keys())
+        return columns
+    def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
+        """
+        Resolve columns from a CTE definition within the same statement.
+        Args:
+            cte_name: Name of the CTE to resolve
+            select_node: The SELECT node that references the CTE
+        Returns:
+            List of column names from the CTE, empty if CTE not found
+        """
+        # Walk up the tree to find the WITH clause containing this CTE
+        parent = select_node
+        while parent:
+            if hasattr(parent, "args") and parent.args.get("with"):
+                with_clause = parent.args["with"]
+                for cte in with_clause.expressions:
+                    if isinstance(cte, exp.CTE) and cte.alias == cte_name:
+                        # Found the CTE - extract its columns
+                        cte_select = cte.this
+                        if isinstance(cte_select, exp.Select):
+                            return self._extract_cte_select_columns(cte_select)
+            parent = parent.parent if hasattr(parent, "parent") else None
+        return []
+    def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
+        """
+        Extract column names from a CTE's SELECT statement.
+        This handles SELECT * within the CTE by resolving against file schema.
+        Args:
+            cte_select: The SELECT expression within the CTE
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+        for projection in cte_select.expressions:
+            if isinstance(projection, exp.Alias):
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * in CTE from file schema
+                star_columns = self._resolve_star_columns(cte_select)
+                columns.extend(star_columns)
+            else:
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+        return columns

sqlglider/utils/config.py CHANGED Viewed

@@ -23,6 +23,28 @@ class TemplatingConfig(BaseModel):
     variables: Optional[Dict[str, Any]] = None
+class DatabricksCatalogConfig(BaseModel):
+    """Configuration for Databricks catalog provider.
+    All fields are optional - they can also be set via environment variables.
+    The SDK supports unified authentication with multiple methods.
+    """
+    warehouse_id: Optional[str] = None
+    profile: Optional[str] = None  # Databricks CLI profile from ~/.databrickscfg
+    host: Optional[str] = None
+    token: Optional[str] = None  # Legacy PAT, prefer OAuth or profile
+class CatalogConfig(BaseModel):
+    """Configuration for catalog providers.
+    Contains provider-specific configuration under sub-keys.
+    """
+    databricks: Optional[DatabricksCatalogConfig] = None
 class ConfigSettings(BaseModel):
     """Configuration settings for SQL Glider.
@@ -35,6 +57,9 @@ class ConfigSettings(BaseModel):
     output_format: Optional[str] = None
     templater: Optional[str] = None
     templating: Optional[TemplatingConfig] = None
+    catalog_type: Optional[str] = None
+    ddl_folder: Optional[str] = None
+    catalog: Optional[CatalogConfig] = None
 def find_config_file(start_path: Optional[Path] = None) -> Optional[Path]:

sql_glider-0.1.2.dist-info/RECORD DELETED Viewed

@@ -1,26 +0,0 @@
-sqlglider/__init__.py,sha256=gDf7s52dMcX7JuCZ1SLawcB1vb3U0yJCohu9RQAATBY,125
-sqlglider/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
-sqlglider/cli.py,sha256=POWIhv0jfvoNtwSoURpxJydco1rvxX9rAvyjuA9FGC8,36445
-sqlglider/global_models.py,sha256=2vyJXAuXOsXQpE-D3F0ejj7eR9z0nDWFjTkielhzM8k,356
-sqlglider/graph/__init__.py,sha256=4DDdrPM75CmeQWt7wHdBsjCm1s70BHGLYdijIbaUEKY,871
-sqlglider/graph/builder.py,sha256=rrcpGAXLz-VHZ1Y73uw6R7kMXHpzBz7tQ2tdV5BY05w,10202
-sqlglider/graph/merge.py,sha256=uUZlm4BN3S9gRL66Cc2mzhbtuh4SVAv2n4cN4eUEQBU,4077
-sqlglider/graph/models.py,sha256=EYmjv_WzDSNp_WfhJ6H-qBIOkAcoNKS7GRUryfKrHuY,9330
-sqlglider/graph/query.py,sha256=LHU8Cvn7ZPPSEnqdDn2pF8f1_LQjIvNIrZqs8cFlb6U,9433
-sqlglider/graph/serialization.py,sha256=7JJo31rwSlxnDhdqdTJdK4Dr_ZcSYetXfx3_CmndSac,2662
-sqlglider/lineage/__init__.py,sha256=llXMeI5_PIZaiBo8tKk3-wOubF4m_6QBHbn1FtWxT7k,256
-sqlglider/lineage/analyzer.py,sha256=58lyrUc0XsCUrYSb23A02OSBmq7eCtJwc477PbjS3c0,45905
-sqlglider/lineage/formatters.py,sha256=_Y9wcTX4JXn1vVnZ1xI656g1FF2rMjcAVc-GHjbd9QA,10389
-sqlglider/templating/__init__.py,sha256=g3_wb6rSDI0usq2UUMDpn-J5kVwlAw3NtLdwbxL6UHs,1435
-sqlglider/templating/base.py,sha256=y5bWAW7qXl_4pPyo5KycfHwNVvt1-7slZ63DAsvTE1s,2902
-sqlglider/templating/jinja.py,sha256=o01UG72N4G1-tOT5LKK1Wkccv4nJH2VN4VFaMi5c1-g,5220
-sqlglider/templating/registry.py,sha256=BJU3N2qNVMTUtkgbibyqo8Wme_acXQRw5XI-6ZVgyac,3476
-sqlglider/templating/variables.py,sha256=5593PtLBcOxsnMCSRm2pGAD5I0Y9f__VV3_J_HfXVlQ,8010
-sqlglider/utils/__init__.py,sha256=KGp9-UzKz_OFBOTFoSy-g-NXDZsvyWXG_9-1zcC6ePE,276
-sqlglider/utils/config.py,sha256=mkven_CcE_dNfKiHi0h2CsE5TMQDX9XqbU7GGEELwEY,3959
-sqlglider/utils/file_utils.py,sha256=5_ff28E0r1R7emZzsOnRuHd-7zIX6873eyr1SuPEr4E,1093
-sql_glider-0.1.2.dist-info/METADATA,sha256=JUXRDvhfnJBj2owWMaupDugZj4Y6uDv1R7RCCkaEWlw,22349
-sql_glider-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-sql_glider-0.1.2.dist-info/entry_points.txt,sha256=LWVdQEfvDT5uZ2RQ4Rse8m0HxBCOMbbqDkxdwUh9d78,169
-sql_glider-0.1.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sql_glider-0.1.2.dist-info/RECORD,,

{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{sql_glider-0.1.2.dist-info → sql_glider-0.1.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

sql-glider 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

sql-glider 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl