PyPI - sql-glider - Versions diffs - 0.1.2__py3-none-any.whl - Mend

sql-glider 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

sql_glider-0.1.2.dist-info/METADATA +721 -0
sql_glider-0.1.2.dist-info/RECORD +26 -0
sql_glider-0.1.2.dist-info/WHEEL +4 -0
sql_glider-0.1.2.dist-info/entry_points.txt +6 -0
sql_glider-0.1.2.dist-info/licenses/LICENSE +201 -0
sqlglider/__init__.py +3 -0
sqlglider/_version.py +34 -0
sqlglider/cli.py +1137 -0
sqlglider/global_models.py +17 -0
sqlglider/graph/__init__.py +42 -0
sqlglider/graph/builder.py +310 -0
sqlglider/graph/merge.py +136 -0
sqlglider/graph/models.py +289 -0
sqlglider/graph/query.py +287 -0
sqlglider/graph/serialization.py +107 -0
sqlglider/lineage/__init__.py +10 -0
sqlglider/lineage/analyzer.py +1183 -0
sqlglider/lineage/formatters.py +335 -0
sqlglider/templating/__init__.py +51 -0
sqlglider/templating/base.py +103 -0
sqlglider/templating/jinja.py +163 -0
sqlglider/templating/registry.py +124 -0
sqlglider/templating/variables.py +295 -0
sqlglider/utils/__init__.py +11 -0
sqlglider/utils/config.py +130 -0
sqlglider/utils/file_utils.py +38 -0

sqlglider/graph/models.py ADDED Viewed

@@ -0,0 +1,289 @@
+"""Pydantic models for graph-based lineage representation."""
+import csv
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import List, Optional
+from pydantic import BaseModel, Field
+from sqlglider.global_models import NodeFormat
+class GraphNode(BaseModel):
+    """Represents a node in the lineage graph (a column)."""
+    identifier: str = Field(
+        ..., description="Unique node identifier (fully-qualified column name)"
+    )
+    file_path: str = Field(
+        ..., description="Source SQL file path where first encountered"
+    )
+    query_index: int = Field(..., description="Index of query within the file")
+    # Structured fields for flexible querying (always populated from identifier)
+    schema_name: Optional[str] = Field(None, description="Schema name (if present)")
+    table: Optional[str] = Field(None, description="Table name")
+    column: Optional[str] = Field(None, description="Column name")
+    @classmethod
+    def from_identifier(
+        cls,
+        identifier: str,
+        file_path: str,
+        query_index: int,
+    ) -> "GraphNode":
+        """
+        Create a GraphNode from a column identifier.
+        Parses the identifier into schema, table, and column components.
+        Args:
+            identifier: Fully-qualified column name (e.g., "schema.table.column" or "table.column")
+            file_path: Source SQL file path
+            query_index: Query index within the file
+        Returns:
+            GraphNode with parsed components
+        """
+        parts = identifier.split(".")
+        if len(parts) >= 3:
+            schema_name = parts[0]
+            table = parts[1]
+            column = ".".join(parts[2:])  # Handle columns with dots
+        elif len(parts) == 2:
+            schema_name = None
+            table = parts[0]
+            column = parts[1]
+        else:
+            schema_name = None
+            table = None
+            column = identifier
+        return cls(
+            identifier=identifier,
+            file_path=file_path,
+            query_index=query_index,
+            schema_name=schema_name,
+            table=table,
+            column=column,
+        )
+class GraphEdge(BaseModel):
+    """Represents an edge in the lineage graph (contributes_to relationship)."""
+    source_node: str = Field(
+        ..., description="Source node identifier (contributes from)"
+    )
+    target_node: str = Field(..., description="Target node identifier (contributes to)")
+    file_path: str = Field(
+        ..., description="Source SQL file where relationship is defined"
+    )
+    query_index: int = Field(..., description="Index of query within the file")
+class ManifestEntry(BaseModel):
+    """Represents a single entry in a manifest file."""
+    file_path: str = Field(..., description="Path to SQL file")
+    dialect: Optional[str] = Field(
+        None, description="SQL dialect (optional, uses default if empty)"
+    )
+class Manifest(BaseModel):
+    """Represents a manifest file with SQL file paths and optional dialects."""
+    entries: List[ManifestEntry] = Field(default_factory=list)
+    @classmethod
+    def from_csv(cls, csv_path: Path) -> "Manifest":
+        """
+        Load manifest from CSV file.
+        Expected CSV format:
+        ```
+        file_path,dialect
+        queries/orders.sql,spark
+        queries/customers.sql,postgres
+        queries/legacy.sql,
+        ```
+        Args:
+            csv_path: Path to manifest CSV file
+        Returns:
+            Manifest with loaded entries
+        Raises:
+            FileNotFoundError: If CSV file doesn't exist
+            ValueError: If CSV is missing required 'file_path' column
+        """
+        if not csv_path.exists():
+            raise FileNotFoundError(f"Manifest file not found: {csv_path}")
+        entries = []
+        with open(csv_path, newline="", encoding="utf-8") as f:
+            reader = csv.DictReader(f)
+            # Validate required column
+            if reader.fieldnames is None or "file_path" not in reader.fieldnames:
+                raise ValueError("Manifest CSV must have a 'file_path' column")
+            for row in reader:
+                file_path = row["file_path"].strip()
+                if not file_path:
+                    continue  # Skip empty rows
+                dialect = row.get("dialect", "").strip() or None
+                entries.append(ManifestEntry(file_path=file_path, dialect=dialect))
+        return cls(entries=entries)
+class LineagePath(BaseModel):
+    """A single lineage path from a node to the queried column."""
+    nodes: List[str] = Field(
+        ..., description="Ordered list of node identifiers in the path"
+    )
+    @property
+    def hops(self) -> int:
+        """Number of hops in the path (edges traversed)."""
+        return len(self.nodes) - 1 if len(self.nodes) > 1 else 0
+    def to_arrow_string(self) -> str:
+        """Format path as arrow-separated string for display."""
+        return " -> ".join(self.nodes)
+class LineageNode(BaseModel):
+    """
+    A node in lineage query results with additional context.
+    Extends GraphNode fields with query-specific information like hop distance
+    and the output column being queried.
+    """
+    # Fields from GraphNode
+    identifier: str = Field(
+        ..., description="Unique node identifier (fully-qualified column name)"
+    )
+    file_path: str = Field(
+        ..., description="Source SQL file path where first encountered"
+    )
+    query_index: int = Field(..., description="Index of query within the file")
+    schema_name: Optional[str] = Field(None, description="Schema name (if present)")
+    table: Optional[str] = Field(None, description="Table name")
+    column: Optional[str] = Field(None, description="Column name")
+    # Query result fields
+    hops: int = Field(..., description="Number of hops from the queried column")
+    output_column: str = Field(..., description="The column that was queried")
+    # Path tracking and root/leaf detection fields
+    is_root: bool = Field(
+        default=False, description="True if node has no upstream dependencies"
+    )
+    is_leaf: bool = Field(
+        default=False, description="True if node has no downstream dependencies"
+    )
+    paths: List[LineagePath] = Field(
+        default_factory=list,
+        description="All paths from this node to the queried column",
+    )
+    @classmethod
+    def from_graph_node(
+        cls,
+        node: "GraphNode",
+        hops: int,
+        output_column: str,
+        is_root: bool = False,
+        is_leaf: bool = False,
+        paths: Optional[List[LineagePath]] = None,
+    ) -> "LineageNode":
+        """
+        Create a LineageNode from a GraphNode with additional context.
+        Args:
+            node: The underlying GraphNode
+            hops: Number of hops from the query column
+            output_column: The column that was queried
+            is_root: True if node has no upstream dependencies
+            is_leaf: True if node has no downstream dependencies
+            paths: List of all paths from this node to the queried column
+        Returns:
+            LineageNode with all GraphNode fields plus query context
+        """
+        return cls(
+            identifier=node.identifier,
+            file_path=node.file_path,
+            query_index=node.query_index,
+            schema_name=node.schema_name,
+            table=node.table,
+            column=node.column,
+            hops=hops,
+            output_column=output_column,
+            is_root=is_root,
+            is_leaf=is_leaf,
+            paths=paths or [],
+        )
+class GraphMetadata(BaseModel):
+    """Metadata about the lineage graph."""
+    node_format: NodeFormat = Field(
+        default=NodeFormat.QUALIFIED,
+        description="Format of node identifiers in serialized output",
+    )
+    default_dialect: str = Field(
+        default="spark", description="Default SQL dialect used"
+    )
+    created_at: str = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
+        description="ISO 8601 timestamp of graph creation",
+    )
+    source_files: List[str] = Field(
+        default_factory=list,
+        description="List of source SQL files included in the graph",
+    )
+    total_nodes: int = Field(
+        default=0, description="Total number of nodes in the graph"
+    )
+    total_edges: int = Field(
+        default=0, description="Total number of edges in the graph"
+    )
+class LineageGraph(BaseModel):
+    """Serializable representation of the complete lineage graph."""
+    metadata: GraphMetadata = Field(default_factory=GraphMetadata)
+    nodes: List[GraphNode] = Field(
+        default_factory=list, description="All nodes in the graph"
+    )
+    edges: List[GraphEdge] = Field(
+        default_factory=list, description="All edges in the graph"
+    )
+    def get_node_by_identifier(self, identifier: str) -> Optional[GraphNode]:
+        """
+        Find a node by its identifier (case-insensitive).
+        Args:
+            identifier: Node identifier to find
+        Returns:
+            GraphNode if found, None otherwise
+        """
+        identifier_lower = identifier.lower()
+        for node in self.nodes:
+            if node.identifier.lower() == identifier_lower:
+                return node
+        return None

sqlglider/graph/query.py ADDED Viewed

@@ -0,0 +1,287 @@
+"""Graph query functionality for upstream/downstream analysis."""
+from pathlib import Path
+from typing import List, Optional
+import rustworkx as rx
+from sqlglider.graph.models import GraphNode, LineageGraph, LineageNode, LineagePath
+from sqlglider.graph.serialization import load_graph, to_rustworkx
+class LineageQueryResult:
+    """Result of a lineage query."""
+    def __init__(
+        self,
+        query_column: str,
+        direction: str,  # "upstream" or "downstream"
+        related_columns: List[LineageNode],
+    ):
+        """
+        Initialize query result.
+        Args:
+            query_column: The column that was queried
+            direction: Query direction ("upstream" or "downstream")
+            related_columns: List of related LineageNode objects with hop info
+        """
+        self.query_column = query_column
+        self.direction = direction
+        self.related_columns = related_columns
+    def __len__(self) -> int:
+        """Return number of related columns."""
+        return len(self.related_columns)
+    def __iter__(self):
+        """Iterate over related columns."""
+        return iter(self.related_columns)
+class GraphQuerier:
+    """Query lineage graphs for upstream/downstream dependencies."""
+    def __init__(self, graph: LineageGraph):
+        """
+        Initialize the querier with a graph.
+        Args:
+            graph: LineageGraph to query
+        """
+        self.graph = graph
+        self.rx_graph, self.node_map = to_rustworkx(graph)
+        self._reverse_map = {v: k for k, v in self.node_map.items()}
+        # Create reversed graph for upstream queries (lazy initialization)
+        self._rx_graph_reversed: Optional[rx.PyDiGraph] = None
+    @property
+    def rx_graph_reversed(self) -> rx.PyDiGraph:
+        """Get reversed graph for upstream traversal (created lazily)."""
+        if self._rx_graph_reversed is None:
+            self._rx_graph_reversed = self.rx_graph.copy()
+            self._rx_graph_reversed.reverse()
+        return self._rx_graph_reversed
+    def _is_root(self, node_idx: int) -> bool:
+        """Check if node is a root (no incoming edges in original graph)."""
+        return self.rx_graph.in_degree(node_idx) == 0
+    def _is_leaf(self, node_idx: int) -> bool:
+        """Check if node is a leaf (no outgoing edges in original graph)."""
+        return self.rx_graph.out_degree(node_idx) == 0
+    def _find_all_paths(
+        self,
+        from_idx: int,
+        to_idx: int,
+        use_reversed: bool = False,
+    ) -> List[List[int]]:
+        """
+        Find all simple paths between two nodes.
+        Args:
+            from_idx: Starting node index
+            to_idx: Target node index
+            use_reversed: If True, use reversed graph for upstream queries
+        Returns:
+            List of paths, where each path is a list of node indices
+        """
+        graph = self.rx_graph_reversed if use_reversed else self.rx_graph
+        return rx.all_simple_paths(graph, from_idx, to_idx)
+    def _convert_path_to_identifiers(
+        self,
+        path: List[int],
+        reverse: bool = False,
+    ) -> LineagePath:
+        """
+        Convert a path of node indices to a LineagePath with identifiers.
+        Args:
+            path: List of node indices
+            reverse: If True, reverse the path order (for upstream queries)
+        Returns:
+            LineagePath with node identifiers
+        """
+        identifiers = [self._reverse_map[idx] for idx in path]
+        if reverse:
+            identifiers = list(reversed(identifiers))
+        return LineagePath(nodes=identifiers)
+    @classmethod
+    def from_file(cls, graph_path: Path) -> "GraphQuerier":
+        """
+        Create a querier from a graph file.
+        Args:
+            graph_path: Path to graph JSON file
+        Returns:
+            GraphQuerier instance
+        Raises:
+            FileNotFoundError: If file doesn't exist
+        """
+        graph = load_graph(graph_path)
+        return cls(graph)
+    def find_upstream(self, column: str) -> LineageQueryResult:
+        """
+        Find all upstream (source) columns for a given column.
+        Uses dijkstra_shortest_path_lengths on a reversed graph to find all
+        nodes that have a path leading to the specified column, with hop counts,
+        root/leaf detection, and full path information.
+        Args:
+            column: Column identifier to analyze
+        Returns:
+            LineageQueryResult with upstream columns including:
+            - hop distances (shortest path)
+            - is_root/is_leaf flags
+            - all paths to the queried column
+        Raises:
+            ValueError: If column not found in graph
+        """
+        # Case-insensitive lookup
+        matched_column = self._find_column(column)
+        if matched_column is None:
+            raise ValueError(f"Column '{column}' not found in graph")
+        target_idx = self.node_map[matched_column]
+        # Use dijkstra on reversed graph to get distances to all ancestors
+        # Each edge has weight 1.0 for hop counting
+        distances = rx.dijkstra_shortest_path_lengths(
+            self.rx_graph_reversed,
+            target_idx,
+            edge_cost_fn=lambda _: 1.0,
+        )
+        # Build LineageNode for each reachable node
+        upstream_columns = []
+        for idx, hops in distances.items():
+            node_data = self.rx_graph[idx]
+            # Find all paths from this node to target
+            # On reversed graph: from target to this node, then reverse the paths
+            raw_paths = self._find_all_paths(target_idx, idx, use_reversed=True)
+            paths = [
+                self._convert_path_to_identifiers(p, reverse=True) for p in raw_paths
+            ]
+            upstream_columns.append(
+                LineageNode.from_graph_node(
+                    GraphNode(**node_data),
+                    hops=int(hops),
+                    output_column=matched_column,
+                    is_root=self._is_root(idx),
+                    is_leaf=self._is_leaf(idx),
+                    paths=paths,
+                )
+            )
+        # Sort by identifier for consistent output
+        upstream_columns.sort(key=lambda n: n.identifier.lower())
+        return LineageQueryResult(
+            query_column=matched_column,
+            direction="upstream",
+            related_columns=upstream_columns,
+        )
+    def find_downstream(self, column: str) -> LineageQueryResult:
+        """
+        Find all downstream (affected) columns for a given column.
+        Uses dijkstra_shortest_path_lengths to find all nodes that have a path
+        from the specified column, with hop counts, root/leaf detection, and
+        full path information.
+        Args:
+            column: Column identifier to analyze
+        Returns:
+            LineageQueryResult with downstream columns including:
+            - hop distances (shortest path)
+            - is_root/is_leaf flags
+            - all paths from the queried column
+        Raises:
+            ValueError: If column not found in graph
+        """
+        # Case-insensitive lookup
+        matched_column = self._find_column(column)
+        if matched_column is None:
+            raise ValueError(f"Column '{column}' not found in graph")
+        source_idx = self.node_map[matched_column]
+        # Use dijkstra on original graph to get distances to all descendants
+        # Each edge has weight 1.0 for hop counting
+        distances = rx.dijkstra_shortest_path_lengths(
+            self.rx_graph,
+            source_idx,
+            edge_cost_fn=lambda _: 1.0,
+        )
+        # Build LineageNode for each reachable node
+        downstream_columns = []
+        for idx, hops in distances.items():
+            node_data = self.rx_graph[idx]
+            # Find all paths from source to this node
+            raw_paths = self._find_all_paths(source_idx, idx, use_reversed=False)
+            paths = [
+                self._convert_path_to_identifiers(p, reverse=False) for p in raw_paths
+            ]
+            downstream_columns.append(
+                LineageNode.from_graph_node(
+                    GraphNode(**node_data),
+                    hops=int(hops),
+                    output_column=matched_column,
+                    is_root=self._is_root(idx),
+                    is_leaf=self._is_leaf(idx),
+                    paths=paths,
+                )
+            )
+        # Sort by identifier for consistent output
+        downstream_columns.sort(key=lambda n: n.identifier.lower())
+        return LineageQueryResult(
+            query_column=matched_column,
+            direction="downstream",
+            related_columns=downstream_columns,
+        )
+    def _find_column(self, column: str) -> Optional[str]:
+        """
+        Find column with case-insensitive matching.
+        Args:
+            column: Column identifier to find
+        Returns:
+            Matched column identifier or None
+        """
+        column_lower = column.lower()
+        for identifier in self.node_map.keys():
+            if identifier.lower() == column_lower:
+                return identifier
+        return None
+    def list_columns(self) -> List[str]:
+        """
+        List all column identifiers in the graph.
+        Returns:
+            Sorted list of column identifiers
+        """
+        return sorted(self.node_map.keys(), key=str.lower)

sqlglider/graph/serialization.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Serialization and deserialization for lineage graphs."""
+from pathlib import Path
+from typing import Dict, Tuple
+import rustworkx as rx
+from sqlglider.graph.models import (
+    GraphEdge,
+    GraphMetadata,
+    GraphNode,
+    LineageGraph,
+)
+def save_graph(graph: LineageGraph, output_path: Path) -> None:
+    """
+    Save a LineageGraph to a JSON file.
+    Args:
+        graph: LineageGraph to save
+        output_path: Output file path
+    """
+    output_path.write_text(
+        graph.model_dump_json(indent=2),
+        encoding="utf-8",
+    )
+def load_graph(input_path: Path) -> LineageGraph:
+    """
+    Load a LineageGraph from a JSON file.
+    Args:
+        input_path: Input file path
+    Returns:
+        Loaded LineageGraph
+    Raises:
+        FileNotFoundError: If input file doesn't exist
+        ValueError: If file content is invalid JSON or doesn't match schema
+    """
+    if not input_path.exists():
+        raise FileNotFoundError(f"Graph file not found: {input_path}")
+    content = input_path.read_text(encoding="utf-8")
+    return LineageGraph.model_validate_json(content)
+def to_rustworkx(graph: LineageGraph) -> Tuple[rx.PyDiGraph, Dict[str, int]]:
+    """
+    Convert a LineageGraph to a rustworkx PyDiGraph.
+    Args:
+        graph: LineageGraph to convert
+    Returns:
+        Tuple of (PyDiGraph, node_identifier_to_index_map)
+    """
+    rx_graph: rx.PyDiGraph = rx.PyDiGraph()
+    node_map: Dict[str, int] = {}
+    # Add nodes
+    for node in graph.nodes:
+        idx = rx_graph.add_node(node.model_dump())
+        node_map[node.identifier] = idx
+    # Add edges
+    for edge in graph.edges:
+        source_idx = node_map.get(edge.source_node)
+        target_idx = node_map.get(edge.target_node)
+        if source_idx is not None and target_idx is not None:
+            rx_graph.add_edge(source_idx, target_idx, edge.model_dump())
+    return rx_graph, node_map
+def from_rustworkx(
+    rx_graph: rx.PyDiGraph,
+    metadata: GraphMetadata,
+) -> LineageGraph:
+    """
+    Convert a rustworkx PyDiGraph to a LineageGraph.
+    Args:
+        rx_graph: rustworkx directed graph
+        metadata: Graph metadata to include
+    Returns:
+        LineageGraph with nodes and edges from the rustworkx graph
+    """
+    nodes = [GraphNode(**rx_graph[idx]) for idx in rx_graph.node_indices()]
+    edges = [
+        GraphEdge(**rx_graph.get_edge_data_by_index(idx))
+        for idx in rx_graph.edge_indices()
+    ]
+    # Update metadata counts
+    metadata.total_nodes = len(nodes)
+    metadata.total_edges = len(edges)
+    return LineageGraph(
+        metadata=metadata,
+        nodes=nodes,
+        edges=edges,
+    )

sqlglider/lineage/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Lineage analysis module for SQL Glider."""
+from sqlglider.lineage.analyzer import (
+    LineageAnalyzer,
+    LineageItem,
+    QueryLineageResult,
+    QueryMetadata,
+)
+__all__ = ["LineageAnalyzer", "LineageItem", "QueryLineageResult", "QueryMetadata"]