sql-glider 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,349 @@
1
+ """Graph builder for constructing lineage graphs from SQL files."""
2
+
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from typing import Callable, Dict, List, Optional, Set
6
+
7
+ import rustworkx as rx
8
+ from rich.console import Console
9
+ from rich.progress import BarColumn, Progress, TaskProgressColumn, TextColumn
10
+
11
+ from sqlglider.global_models import AnalysisLevel, NodeFormat
12
+ from sqlglider.graph.models import (
13
+ GraphEdge,
14
+ GraphMetadata,
15
+ GraphNode,
16
+ LineageGraph,
17
+ Manifest,
18
+ )
19
+ from sqlglider.lineage.analyzer import LineageAnalyzer
20
+ from sqlglider.utils.file_utils import read_sql_file
21
+
22
# Console bound to stderr so warnings/progress never pollute stdout,
# which may carry the tool's machine-readable output.
console = Console(stderr=True)

# Type alias for SQL preprocessor functions.
# Signature: (sql_text, file_path) -> transformed sql_text, e.g. a Jinja2 render.
SqlPreprocessor = Callable[[str, Path], str]
26
+
27
+
28
class GraphBuilder:
    """Build lineage graphs from SQL files using rustworkx.

    Identifiers are lowercased before node/edge insertion, so deduplication
    is case-insensitive. Files whose SQL cannot be analyzed at all are
    recorded (see ``skipped_files``) and reported with a warning instead of
    aborting the whole build.
    """

    def __init__(
        self,
        node_format: NodeFormat = NodeFormat.QUALIFIED,
        dialect: str = "spark",
        sql_preprocessor: Optional[SqlPreprocessor] = None,
    ) -> None:
        """
        Initialize the graph builder.

        Args:
            node_format: Format for node identifiers (QUALIFIED or STRUCTURED)
            dialect: Default SQL dialect (used when not specified per-file)
            sql_preprocessor: Optional function to preprocess SQL before analysis.
                Takes (sql: str, file_path: Path) and returns processed SQL.
                Useful for templating (e.g., Jinja2 rendering).
        """
        self.node_format = node_format
        self.dialect = dialect
        self.sql_preprocessor = sql_preprocessor
        self.graph: rx.PyDiGraph = rx.PyDiGraph()
        self._node_index_map: Dict[str, int] = {}  # identifier -> rustworkx node index
        # Resolved paths of files that were analyzed (added even if they yield no edges).
        self._source_files: Set[str] = set()
        self._edge_set: Set[tuple] = set()  # (source, target) for dedup
        self._skipped_files: List[tuple[str, str]] = []  # (file_path, reason)

    def add_file(
        self,
        file_path: Path,
        dialect: Optional[str] = None,
    ) -> "GraphBuilder":
        """
        Add lineage from a single SQL file to the graph.

        Args:
            file_path: Path to SQL file
            dialect: SQL dialect (uses builder default if not specified)

        Returns:
            self for method chaining

        Raises:
            FileNotFoundError: If file doesn't exist
            ParseError: If SQL cannot be parsed
        """
        file_dialect = dialect or self.dialect
        file_path_str = str(file_path.resolve())

        try:
            sql_content = read_sql_file(file_path)

            # Apply SQL preprocessor if configured (e.g., for templating)
            if self.sql_preprocessor:
                sql_content = self.sql_preprocessor(sql_content, file_path)

            analyzer = LineageAnalyzer(sql_content, dialect=file_dialect)
            results = analyzer.analyze_queries(level=AnalysisLevel.COLUMN)

            # Print warnings for any skipped queries within the file
            for skipped in analyzer.skipped_queries:
                console.print(
                    f"[yellow]Warning:[/yellow] Skipping query {skipped.query_index} "
                    f"in {file_path.name} ({skipped.statement_type}): {skipped.reason}"
                )

            self._source_files.add(file_path_str)

            for result in results:
                query_index = result.metadata.query_index

                for item in result.lineage_items:
                    if not item.source_name:  # Skip empty sources
                        continue

                    # Add/get nodes (lowercased inside _ensure_node)
                    source_node_idx = self._ensure_node(
                        item.source_name,
                        file_path_str,
                        query_index,
                    )
                    target_node_idx = self._ensure_node(
                        item.output_name,
                        file_path_str,
                        query_index,
                    )

                    # Add edge (source contributes_to target) - deduplicate
                    # globally across all files added to this builder.
                    edge_key = (item.source_name.lower(), item.output_name.lower())
                    if edge_key not in self._edge_set:
                        edge = GraphEdge(
                            source_node=item.source_name.lower(),
                            target_node=item.output_name.lower(),
                            file_path=file_path_str,
                            query_index=query_index,
                        )
                        self.graph.add_edge(
                            source_node_idx, target_node_idx, edge.model_dump()
                        )
                        self._edge_set.add(edge_key)

        except ValueError as e:
            # Skip files that fail completely (all statements unsupported);
            # the reason is kept for the end-of-build summary.
            error_msg = str(e)
            self._skipped_files.append((file_path_str, error_msg))
            console.print(
                f"[yellow]Warning:[/yellow] Skipping {file_path.name}: {error_msg}"
            )

        return self

    def add_directory(
        self,
        dir_path: Path,
        recursive: bool = False,
        glob_pattern: str = "*.sql",
        dialect: Optional[str] = None,
    ) -> "GraphBuilder":
        """
        Add lineage from all SQL files in a directory.

        Files are processed in sorted order so builds are deterministic.

        Args:
            dir_path: Path to directory
            recursive: Whether to search recursively
            glob_pattern: Glob pattern for SQL files
            dialect: SQL dialect (uses builder default if not specified)

        Returns:
            self for method chaining

        Raises:
            ValueError: If path is not a directory
        """
        if not dir_path.is_dir():
            raise ValueError(f"Not a directory: {dir_path}")

        if recursive:
            pattern = f"**/{glob_pattern}"
        else:
            pattern = glob_pattern

        # is_file() filters out directories that happen to match the glob.
        sql_files = [f for f in sorted(dir_path.glob(pattern)) if f.is_file()]
        return self.add_files(sql_files, dialect)

    def add_manifest(
        self,
        manifest_path: Path,
        dialect: Optional[str] = None,
    ) -> "GraphBuilder":
        """
        Add lineage from files specified in a manifest CSV.

        Args:
            manifest_path: Path to manifest CSV file
            dialect: Default SQL dialect (overridden by manifest entries)

        Returns:
            self for method chaining

        Raises:
            FileNotFoundError: If manifest or referenced files don't exist
            ValueError: If manifest format is invalid
        """
        manifest = Manifest.from_csv(manifest_path)
        base_dir = manifest_path.parent

        # Collect files with their dialects
        files_with_dialects: List[tuple[Path, str]] = []
        for entry in manifest.entries:
            # Resolve file path relative to manifest location
            file_path = Path(entry.file_path)
            if not file_path.is_absolute():
                file_path = (base_dir / entry.file_path).resolve()

            # Use entry dialect, then CLI dialect, then builder default
            entry_dialect = entry.dialect or dialect or self.dialect
            files_with_dialects.append((file_path, entry_dialect))

        # Process with progress (same rendering as add_files, but each file
        # carries its own dialect resolved above).
        if files_with_dialects:
            total = len(files_with_dialects)
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                console=console,
                transient=False,
            ) as progress:
                task = progress.add_task("Parsing", total=total)
                for i, (file_path, file_dialect) in enumerate(
                    files_with_dialects, start=1
                ):
                    console.print(f"Parsing file {i}/{total}: {file_path.name}")
                    self.add_file(file_path, file_dialect)
                    progress.advance(task)

        return self

    def add_files(
        self,
        file_paths: List[Path],
        dialect: Optional[str] = None,
        show_progress: bool = True,
    ) -> "GraphBuilder":
        """
        Add lineage from multiple SQL files.

        Args:
            file_paths: List of paths to SQL files
            dialect: SQL dialect (uses builder default if not specified)
            show_progress: Whether to print progress messages

        Returns:
            self for method chaining
        """
        if not file_paths:
            return self

        if show_progress:
            total = len(file_paths)
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TaskProgressColumn(),
                console=console,
                transient=False,
            ) as progress:
                task = progress.add_task("Parsing", total=total)
                for i, file_path in enumerate(file_paths, start=1):
                    console.print(f"Parsing file {i}/{total}: {file_path.name}")
                    self.add_file(file_path, dialect)
                    progress.advance(task)
        else:
            for file_path in file_paths:
                self.add_file(file_path, dialect)
        return self

    def _ensure_node(
        self,
        identifier: str,
        file_path: str,
        query_index: int,
    ) -> int:
        """
        Ensure a node exists in the graph, creating it if necessary.

        First occurrence wins: the file_path/query_index of subsequent
        sightings of the same identifier are not recorded.

        Args:
            identifier: Node identifier (e.g., "table.column")
            file_path: Source file path
            query_index: Query index within file

        Returns:
            rustworkx node index
        """
        key = identifier.lower()
        if key in self._node_index_map:
            return self._node_index_map[key]

        node = GraphNode.from_identifier(
            identifier=key,
            file_path=file_path,
            query_index=query_index,
        )

        node_idx = self.graph.add_node(node.model_dump())
        self._node_index_map[key] = node_idx
        return node_idx

    def build(self) -> LineageGraph:
        """
        Build and return the final LineageGraph.

        Returns:
            LineageGraph with metadata, nodes, and edges
        """
        # Rehydrate model objects from the dict payloads stored in rustworkx.
        nodes = []
        for idx in self.graph.node_indices():
            node_data = self.graph[idx]
            nodes.append(GraphNode(**node_data))

        edges = []
        for edge_idx in self.graph.edge_indices():
            edge_data = self.graph.get_edge_data_by_index(edge_idx)
            edges.append(GraphEdge(**edge_data))

        metadata = GraphMetadata(
            node_format=self.node_format,
            default_dialect=self.dialect,
            created_at=datetime.now(timezone.utc).isoformat(),
            source_files=sorted(self._source_files),
            total_nodes=len(nodes),
            total_edges=len(edges),
        )

        # Print summary of skipped files if any
        if self._skipped_files:
            console.print(
                f"\n[yellow]Summary:[/yellow] Skipped {len(self._skipped_files)} "
                f"file(s) that could not be analyzed for lineage."
            )

        return LineageGraph(
            metadata=metadata,
            nodes=nodes,
            edges=edges,
        )

    @property
    def rustworkx_graph(self) -> rx.PyDiGraph:
        """Get the underlying rustworkx graph for direct operations."""
        return self.graph

    @property
    def node_index_map(self) -> Dict[str, int]:
        """Get mapping from node identifiers to rustworkx indices (a copy)."""
        return self._node_index_map.copy()

    @property
    def skipped_files(self) -> List[tuple[str, str]]:
        """Get list of (file_path, reason) pairs skipped during building (a copy)."""
        return self._skipped_files.copy()
@@ -0,0 +1,136 @@
1
+ """Graph merging functionality."""
2
+
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+ from typing import Dict, List, Set
6
+
7
+ import rustworkx as rx
8
+
9
+ from sqlglider.global_models import NodeFormat
10
+ from sqlglider.graph.models import (
11
+ GraphEdge,
12
+ GraphMetadata,
13
+ GraphNode,
14
+ LineageGraph,
15
+ )
16
+ from sqlglider.graph.serialization import load_graph
17
+
18
+
19
class GraphMerger:
    """Combine several lineage graphs into one deduplicated graph."""

    def __init__(self):
        """Set up empty merge state."""
        self.merged_graph: rx.PyDiGraph = rx.PyDiGraph()
        self._node_map: Dict[str, int] = {}  # identifier -> merged-graph index
        self._source_files: Set[str] = set()
        self._edge_set: Set[tuple] = set()  # (source, target) pairs already merged

    def add_graph(self, graph: LineageGraph) -> "GraphMerger":
        """
        Add a graph to be merged.

        Nodes are deduplicated by identifier (first occurrence wins).
        Edges are deduplicated by (source_node, target_node) pair; an edge
        whose endpoints were never registered as nodes is silently dropped.

        Args:
            graph: LineageGraph to add

        Returns:
            self for method chaining
        """
        self._source_files.update(graph.metadata.source_files)

        # Register nodes, keeping the first payload seen per identifier.
        for candidate in graph.nodes:
            if candidate.identifier in self._node_map:
                continue
            self._node_map[candidate.identifier] = self.merged_graph.add_node(
                candidate.model_dump()
            )

        # Register edges once per (source, target) pair.
        for link in graph.edges:
            pair = (link.source_node, link.target_node)
            if pair in self._edge_set:
                continue
            src_idx = self._node_map.get(link.source_node)
            dst_idx = self._node_map.get(link.target_node)
            if src_idx is None or dst_idx is None:
                continue  # dangling edge: endpoint missing from node set
            self.merged_graph.add_edge(src_idx, dst_idx, link.model_dump())
            self._edge_set.add(pair)

        return self

    def add_file(self, graph_path: Path) -> "GraphMerger":
        """
        Add a graph from a JSON file.

        Args:
            graph_path: Path to graph JSON file

        Returns:
            self for method chaining

        Raises:
            FileNotFoundError: If file doesn't exist
            ValueError: If file is not valid graph JSON
        """
        return self.add_graph(load_graph(graph_path))

    def add_files(self, graph_paths: List[Path]) -> "GraphMerger":
        """
        Add multiple graphs from JSON files.

        Args:
            graph_paths: List of paths to graph JSON files

        Returns:
            self for method chaining
        """
        for graph_path in graph_paths:
            self.add_file(graph_path)
        return self

    def merge(self) -> LineageGraph:
        """
        Build the merged graph.

        Returns:
            Merged LineageGraph with combined nodes and edges
        """
        # Rehydrate models from the dict payloads held by rustworkx.
        nodes = [
            GraphNode(**self.merged_graph[node_idx])
            for node_idx in self.merged_graph.node_indices()
        ]
        edges = [
            GraphEdge(**self.merged_graph.get_edge_data_by_index(edge_idx))
            for edge_idx in self.merged_graph.edge_indices()
        ]

        metadata = GraphMetadata(
            node_format=NodeFormat.QUALIFIED,  # Merged graphs use qualified format
            default_dialect="spark",
            created_at=datetime.now(timezone.utc).isoformat(),
            source_files=sorted(self._source_files),
            total_nodes=len(nodes),
            total_edges=len(edges),
        )

        return LineageGraph(metadata=metadata, nodes=nodes, edges=edges)
121
+
122
+
123
def merge_graphs(graph_paths: List[Path]) -> LineageGraph:
    """
    Convenience function to merge multiple graph files.

    Delegates to GraphMerger.add_files so the path-iteration logic lives in
    exactly one place instead of being duplicated here.

    Args:
        graph_paths: List of paths to graph JSON files

    Returns:
        Merged LineageGraph

    Raises:
        FileNotFoundError: If any referenced file doesn't exist
        ValueError: If any file is not valid graph JSON
    """
    return GraphMerger().add_files(graph_paths).merge()