sql-glider 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_glider-0.1.3 → sql_glider-0.1.4}/PKG-INFO +1 -1
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/_version.py +2 -2
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/analyzer.py +216 -2
- sql_glider-0.1.4/tests/fixtures/original_queries/test_view_window_cte.sql +27 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_builder.py +150 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/test_analyzer.py +287 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.github/workflows/ci.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.github/workflows/publish.yml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.gitignore +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/.python-version +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/ARCHITECTURE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/CLAUDE.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/LICENSE +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-05-column-level-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-05-reverse-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-config-file-support.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-graph-lineage.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-06-unify-single-multi-query.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-07-sample-data-model.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-07-sql-templating.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-08-tables-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-09-graph-query-paths.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-13-dissect-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2025-12-14-tables-pull-command.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/plans/2026-01-25-fix-union-lineage-chain.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/pyproject.toml +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/README.md +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/expire_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/load_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/load_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/merge_dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/merge_dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/business/update_dim_customer_metrics.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/conditional_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/cte_insert.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/complex/multi_table_transform.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/dim_customer.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/dim_product.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_addresses.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_order_items.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/raw_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/ddl/stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_fact_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_fact_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/incremental/incr_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/maintenance/delete_expired_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/maintenance/update_product_status.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_customer_360.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_customer_cohort.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_product_performance.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/presentation/load_pres_sales_summary.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_payments.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sample_data_model/staging/load_stg_products.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/sqlglider.toml.example +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/catalog/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/dissection/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/global_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/builder.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/graph/serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/templating/variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/utils/file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/analytics_pipeline.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/analytics_pipeline_union_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/customers.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/orders.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/reports.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/multi_file_queries/view_based_merge.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_cte.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_cte_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_generated_column_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_multi.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_multi_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_single_query.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_subquery.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_tables.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/original_queries/test_view.sql +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/fixtures/sample_manifest.csv +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_databricks.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/catalog/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_analyzer.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/dissection/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_merge.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_models.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_query.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_serialization.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/lineage/test_formatters.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_base.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_jinja.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_registry.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/templating/test_variables.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/test_cli.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/__init__.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/test_config.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/utils/test_file_utils.py +0 -0
- {sql_glider-0.1.3 → sql_glider-0.1.4}/uv.lock +0 -0
{sql_glider-0.1.3 → sql_glider-0.1.4}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sql-glider
-Version: 0.1.3
+Version: 0.1.4
 Summary: SQL Utility Toolkit for better understanding, use, and governance of your queries in a native environment.
 Project-URL: Homepage, https://github.com/rycowhi/sql-glider/
 Project-URL: Repository, https://github.com/rycowhi/sql-glider/
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/_version.py

```diff
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.3'
-__version_tuple__ = version_tuple = (0, 1, 3)
+__version__ = version = '0.1.4'
+__version_tuple__ = version_tuple = (0, 1, 4)
 
 __commit_id__ = commit_id = None
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/src/sqlglider/lineage/analyzer.py

```diff
@@ -1,7 +1,7 @@
 """Core lineage analysis using SQLGlot."""
 
 from enum import Enum
-from typing import Callable, Iterator, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, Iterator, List, Optional, Set, Tuple, Union
 
 from pydantic import BaseModel, Field
 from sqlglot import exp, parse
```
```diff
@@ -99,6 +99,9 @@ class LineageAnalyzer:
         self.sql = sql
         self.dialect = dialect
         self._skipped_queries: List[SkippedQuery] = []
+        # File-scoped schema context for cross-statement lineage
+        # Maps table/view names to their column definitions
+        self._file_schema: Dict[str, Dict[str, str]] = {}
 
         try:
             # Parse all statements in the SQL string
```
```diff
@@ -156,7 +159,24 @@
         # DML/DDL: Use target table for output column qualification
         # The columns are from the SELECT, but qualified with the target table
         projections = self._get_select_projections(select_node)
+        first_select = self._get_first_select(select_node)
+
         for projection in projections:
+            # Handle SELECT * by resolving from file schema
+            if isinstance(projection, exp.Star):
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    for star_col in star_columns:
+                        qualified_name = f"{target_table}.{star_col}"
+                        columns.append(qualified_name)
+                        self._column_mapping[qualified_name] = star_col
+                if not columns:
+                    # Fallback: can't resolve *, use * as column name
+                    qualified_name = f"{target_table}.*"
+                    columns.append(qualified_name)
+                    self._column_mapping[qualified_name] = "*"
+                continue
+
             # Get the underlying expression (unwrap alias if present)
             if isinstance(projection, exp.Alias):
                 # For aliased columns, use the alias as the column name
```
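The new `exp.Star` branch above keys off how SQLGlot parses a bare `*` projection in the SELECT that feeds a DML statement. A minimal sketch of what the branch matches (illustrative only; the table names are made up):

```python
from sqlglot import exp, parse_one

# A bare `*` in the SELECT feeding an INSERT parses as exp.Star, which the
# Alias/Column handling further down does not cover.
stmt = parse_one("INSERT INTO target_table SELECT * FROM some_view", dialect="spark")
select_node = stmt.expression  # the SELECT inside the INSERT

for projection in select_node.expressions:
    print(isinstance(projection, exp.Star))  # True
```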
```diff
@@ -324,6 +344,7 @@
         """
         results = []
         self._skipped_queries = []  # Reset skipped queries for this analysis
+        self._file_schema = {}  # Reset file schema for this analysis run
 
         for query_index, expr, preview in self._iterate_queries(table_filter):
             # Temporarily swap self.expr to analyze this query
```
```diff
@@ -375,6 +396,9 @@
                     )
                 )
             finally:
+                # Extract schema from this statement AFTER analysis
+                # This builds up context for subsequent statements to use
+                self._extract_schema_from_statement(expr)
                 # Restore original expression
                 self.expr = original_expr
 
```
```diff
@@ -702,7 +726,13 @@
             lineage_col = self._column_mapping.get(col, col)
 
             # Get lineage tree for this column using current query SQL only
-            node = lineage(lineage_col, current_query_sql, dialect=self.dialect)
+            # Pass file schema to enable SELECT * expansion for known tables/views
+            node = lineage(
+                lineage_col,
+                current_query_sql,
+                dialect=self.dialect,
+                schema=self._file_schema if self._file_schema else None,
+            )
 
             # Collect all source columns
             sources: Set[str] = set()
```
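The `schema=` argument is what lets SQLGlot expand `SELECT *` over tables it would otherwise know nothing about. A standalone sketch of the effect, using the same `table -> {column: type}` mapping shape that `_file_schema` stores (the view and columns are borrowed from the new test fixture below):

```python
from sqlglot.lineage import lineage

# "UNKNOWN" suffices as a type because SQLGlot only needs column names
# to expand the star.
schema = {"first_view": {"a": "UNKNOWN", "b": "UNKNOWN", "c": "UNKNOWN"}}

node = lineage("a", "SELECT * FROM first_view", dialect="spark", schema=schema)
for n in node.walk():
    print(n.name)  # walks from the output column down to first_view.a
```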
```diff
@@ -1235,3 +1265,187 @@
         preview = self._generate_query_preview(expr)
 
         yield idx, expr, preview
+
+    # -------------------------------------------------------------------------
+    # File-scoped schema context methods
+    # -------------------------------------------------------------------------
+
+    def _extract_schema_from_statement(self, expr: exp.Expression) -> None:
+        """
+        Extract column definitions from CREATE VIEW/TABLE AS SELECT statements.
+
+        This method builds up file-scoped schema context as statements are processed,
+        enabling SQLGlot to correctly expand SELECT * and trace cross-statement references.
+
+        Args:
+            expr: The SQL expression to extract schema from
+        """
+        # Only handle CREATE VIEW or CREATE TABLE (AS SELECT)
+        if not isinstance(expr, exp.Create):
+            return
+        if expr.kind not in ("VIEW", "TABLE"):
+            return
+
+        # Get target table/view name
+        target = expr.this
+        if isinstance(target, exp.Schema):
+            target = target.this
+        if not isinstance(target, exp.Table):
+            return
+
+        target_name = self._get_qualified_table_name(target)
+
+        # Get the SELECT node from the CREATE statement
+        select_node = expr.expression
+        if select_node is None:
+            return
+
+        # Handle Subquery wrapper (e.g., CREATE VIEW AS (SELECT ...))
+        if isinstance(select_node, exp.Subquery):
+            select_node = select_node.this
+
+        if not isinstance(
+            select_node, (exp.Select, exp.Union, exp.Intersect, exp.Except)
+        ):
+            return
+
+        # Extract column names from the SELECT
+        columns = self._extract_columns_from_select(select_node)
+
+        if columns:
+            # Store with UNKNOWN type - SQLGlot only needs column names for expansion
+            self._file_schema[target_name] = {col: "UNKNOWN" for col in columns}
+
+    def _extract_columns_from_select(
+        self, select_node: Union[exp.Select, exp.Union, exp.Intersect, exp.Except]
+    ) -> List[str]:
+        """
+        Extract column names from a SELECT statement.
+
+        Handles aliases, direct column references, and SELECT * by resolving
+        against the known file schema.
+
+        Args:
+            select_node: The SELECT or set operation expression
+
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+
+        # Get projections (for UNION, use first branch)
+        projections = self._get_select_projections(select_node)
+        first_select = self._get_first_select(select_node)
+
+        for projection in projections:
+            if isinstance(projection, exp.Alias):
+                # Use the alias name as the column name
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                # Use the column name
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * from known schema
+                if first_select:
+                    star_columns = self._resolve_star_columns(first_select)
+                    columns.extend(star_columns)
+            else:
+                # For expressions without alias, use SQL representation
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+
+        return columns
+
+    def _resolve_star_columns(self, select_node: exp.Select) -> List[str]:
+        """
+        Resolve SELECT * to actual column names from known file schema or CTEs.
+
+        Args:
+            select_node: The SELECT node containing the * reference
+
+        Returns:
+            List of column names if source is known, empty list otherwise
+        """
+        columns: List[str] = []
+
+        # Get the source table(s) from FROM clause
+        from_clause = select_node.args.get("from")
+        if not from_clause or not isinstance(from_clause, exp.From):
+            return columns
+
+        source = from_clause.this
+
+        # Handle table reference
+        if isinstance(source, exp.Table):
+            source_name = self._get_qualified_table_name(source)
+
+            # First check file schema (views/tables from previous statements)
+            if source_name in self._file_schema:
+                columns.extend(self._file_schema[source_name].keys())
+            else:
+                # Check if this is a CTE reference within the same statement
+                cte_columns = self._resolve_cte_columns(source_name, select_node)
+                columns.extend(cte_columns)
+
+        # Handle subquery - can't resolve without deeper analysis
+        elif isinstance(source, exp.Subquery) and source.alias:
+            # Check if this subquery alias is in file schema (unlikely)
+            if source.alias in self._file_schema:
+                columns.extend(self._file_schema[source.alias].keys())
+
+        return columns
+
+    def _resolve_cte_columns(self, cte_name: str, select_node: exp.Select) -> List[str]:
+        """
+        Resolve columns from a CTE definition within the same statement.
+
+        Args:
+            cte_name: Name of the CTE to resolve
+            select_node: The SELECT node that references the CTE
+
+        Returns:
+            List of column names from the CTE, empty if CTE not found
+        """
+        # Walk up the tree to find the WITH clause containing this CTE
+        parent = select_node
+        while parent:
+            if hasattr(parent, "args") and parent.args.get("with"):
+                with_clause = parent.args["with"]
+                for cte in with_clause.expressions:
+                    if isinstance(cte, exp.CTE) and cte.alias == cte_name:
+                        # Found the CTE - extract its columns
+                        cte_select = cte.this
+                        if isinstance(cte_select, exp.Select):
+                            return self._extract_cte_select_columns(cte_select)
+            parent = parent.parent if hasattr(parent, "parent") else None
+
+        return []
+
+    def _extract_cte_select_columns(self, cte_select: exp.Select) -> List[str]:
+        """
+        Extract column names from a CTE's SELECT statement.
+
+        This handles SELECT * within the CTE by resolving against file schema.
+
+        Args:
+            cte_select: The SELECT expression within the CTE
+
+        Returns:
+            List of column names
+        """
+        columns: List[str] = []
+
+        for projection in cte_select.expressions:
+            if isinstance(projection, exp.Alias):
+                columns.append(projection.alias)
+            elif isinstance(projection, exp.Column):
+                columns.append(projection.name)
+            elif isinstance(projection, exp.Star):
+                # Resolve SELECT * in CTE from file schema
+                star_columns = self._resolve_star_columns(cte_select)
+                columns.extend(star_columns)
+            else:
+                col_sql = projection.sql(dialect=self.dialect)
+                columns.append(col_sql)
+
+        return columns
```
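Condensed, the schema-extraction path added above amounts to: for each `CREATE ... AS SELECT`, record the projected names under the target name. A self-contained sketch of that flow (illustrative only; it skips the UNION, subquery, and CTE branches the real methods handle):

```python
from sqlglot import exp, parse_one

stmt = parse_one("CREATE VIEW v AS SELECT a, b + 1 AS c FROM t", dialect="spark")

file_schema = {}
if isinstance(stmt, exp.Create) and stmt.kind in ("VIEW", "TABLE"):
    target = stmt.this
    if isinstance(target, exp.Schema):  # CREATE TABLE t (col, ...) AS SELECT
        target = target.this
    select_node = stmt.expression
    if isinstance(select_node, exp.Select):
        # alias_or_name covers both plain columns and aliased expressions
        columns = [p.alias_or_name for p in select_node.expressions]
        file_schema[target.name] = {col: "UNKNOWN" for col in columns}

print(file_schema)  # {'v': {'a': 'UNKNOWN', 'c': 'UNKNOWN'}}
```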
sql_glider-0.1.4/tests/fixtures/original_queries/test_view_window_cte.sql (new file)

```diff
@@ -0,0 +1,27 @@
+CREATE TEMPORARY VIEW first_view AS (
+    SELECT
+        a,
+        b,
+        c
+    FROM source_table
+);
+
+CREATE TEMPORARY VIEW second_view AS
+WITH first_view_cte AS (
+    SELECT
+        *,
+        row_number() OVER (
+            PARTITION BY a ORDER BY b DESC
+        ) AS row_num
+    FROM first_view
+)
+SELECT * FROM first_view_cte
+WHERE c = 1;
+
+INSERT OVERWRITE output_table
+SELECT
+    a,
+    b,
+    c,
+    row_num
+FROM second_view;
```
{sql_glider-0.1.3 → sql_glider-0.1.4}/tests/sqlglider/graph/test_builder.py

```diff
@@ -428,3 +428,153 @@ class TestGraphBuilderInsertWithUnion:
         assert "db.source_a.last" in upstream_ids
         assert "db.source_b.first" in upstream_ids
         assert "db.source_b.last" in upstream_ids
+
+
+class TestGraphBuilderCreateViewWithCTEAndWindowFunction:
+    """Tests for CREATE VIEW statements with CTEs and window functions."""
+
+    def test_create_view_with_cte_and_row_number(self, tmp_path):
+        """CREATE VIEW with CTE and ROW_NUMBER() OVER (PARTITION BY ...) should work."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW my_view AS
+        WITH ranked_orders AS (
+            SELECT
+                customer_id,
+                order_date,
+                amount,
+                ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) as rn
+            FROM orders
+        )
+        SELECT customer_id, order_date, amount
+        FROM ranked_orders
+        WHERE rn = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Should have nodes created successfully
+        assert graph.metadata.total_nodes > 0
+        assert graph.metadata.total_edges > 0
+
+        # Check that output columns are qualified with the view name
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "my_view.customer_id" in node_ids
+        assert "my_view.order_date" in node_ids
+        assert "my_view.amount" in node_ids
+
+        # Source columns from orders table should exist
+        assert "orders.customer_id" in node_ids
+        assert "orders.order_date" in node_ids
+        assert "orders.amount" in node_ids
+
+    def test_create_view_with_cte_row_number_lineage_tracing(self, tmp_path):
+        """Test that lineage correctly traces through CTE with window function."""
+        from sqlglider.graph.query import GraphQuerier
+
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW latest_orders AS
+        WITH ranked AS (
+            SELECT
+                o.customer_id,
+                o.order_date,
+                o.total_amount,
+                ROW_NUMBER() OVER (PARTITION BY o.customer_id ORDER BY o.order_date DESC) as rn
+            FROM sales.orders o
+        )
+        SELECT customer_id, order_date, total_amount
+        FROM ranked
+        WHERE rn = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Query upstream from output columns
+        querier = GraphQuerier(graph)
+
+        # customer_id should trace back to sales.orders.customer_id
+        upstream_customer = querier.find_upstream("latest_orders.customer_id")
+        upstream_ids = {n.identifier for n in upstream_customer.related_columns}
+        assert "sales.orders.customer_id" in upstream_ids
+
+        # total_amount should trace back to sales.orders.total_amount
+        upstream_amount = querier.find_upstream("latest_orders.total_amount")
+        upstream_ids = {n.identifier for n in upstream_amount.related_columns}
+        assert "sales.orders.total_amount" in upstream_ids
+
+    def test_create_view_multiple_window_functions(self, tmp_path):
+        """Test CREATE VIEW with multiple window functions."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW customer_rankings AS
+        WITH metrics AS (
+            SELECT
+                customer_id,
+                total_spend,
+                ROW_NUMBER() OVER (ORDER BY total_spend DESC) as spend_rank,
+                RANK() OVER (PARTITION BY region ORDER BY total_spend DESC) as region_rank,
+                LAG(total_spend) OVER (PARTITION BY customer_id ORDER BY order_date) as prev_spend
+            FROM customer_orders
+        )
+        SELECT customer_id, total_spend, spend_rank, region_rank
+        FROM metrics
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        # Should process successfully with multiple window functions
+        assert graph.metadata.total_nodes > 0
+
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "customer_rankings.customer_id" in node_ids
+        assert "customer_rankings.total_spend" in node_ids
+        assert "customer_rankings.spend_rank" in node_ids
+        assert "customer_rankings.region_rank" in node_ids
+
+    def test_create_view_nested_ctes_with_window(self, tmp_path):
+        """Test CREATE VIEW with nested CTEs and window functions."""
+        sql_file = tmp_path / "query.sql"
+        sql_file.write_text("""
+        CREATE VIEW final_report AS
+        WITH base_data AS (
+            SELECT customer_id, product_id, quantity, sale_date
+            FROM raw_sales
+        ),
+        ranked_sales AS (
+            SELECT
+                customer_id,
+                product_id,
+                quantity,
+                ROW_NUMBER() OVER (
+                    PARTITION BY customer_id, product_id
+                    ORDER BY sale_date DESC
+                ) as sale_rank
+            FROM base_data
+        )
+        SELECT customer_id, product_id, quantity
+        FROM ranked_sales
+        WHERE sale_rank = 1
+        """)
+
+        builder = GraphBuilder(dialect="spark")
+        builder.add_file(sql_file)
+        graph = builder.build()
+
+        assert graph.metadata.total_nodes > 0
+
+        node_ids = {node.identifier for node in graph.nodes}
+        assert "final_report.customer_id" in node_ids
+        assert "final_report.product_id" in node_ids
+        assert "final_report.quantity" in node_ids
+
+        # Source should trace to raw_sales
+        assert "raw_sales.customer_id" in node_ids
+        assert "raw_sales.product_id" in node_ids
+        assert "raw_sales.quantity" in node_ids
```