PyPI - sql-code-graph - Versions diffs - 0.2.1__py3-none-any.whl - Mend

sql-code-graph 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

sql_code_graph-0.2.1.dist-info/METADATA +171 -0
sql_code_graph-0.2.1.dist-info/RECORD +55 -0
sql_code_graph-0.2.1.dist-info/WHEEL +4 -0
sql_code_graph-0.2.1.dist-info/entry_points.txt +2 -0
sqlcg/__init__.py +5 -0
sqlcg/__main__.py +6 -0
sqlcg/cli/__init__.py +1 -0
sqlcg/cli/commands/__init__.py +1 -0
sqlcg/cli/commands/analyze.py +93 -0
sqlcg/cli/commands/db.py +83 -0
sqlcg/cli/commands/find.py +63 -0
sqlcg/cli/commands/gain.py +169 -0
sqlcg/cli/commands/git.py +73 -0
sqlcg/cli/commands/index.py +92 -0
sqlcg/cli/commands/install.py +60 -0
sqlcg/cli/commands/mcp.py +54 -0
sqlcg/cli/commands/report.py +135 -0
sqlcg/cli/commands/watch.py +57 -0
sqlcg/cli/main.py +40 -0
sqlcg/core/__init__.py +8 -0
sqlcg/core/config.py +104 -0
sqlcg/core/graph_db.py +179 -0
sqlcg/core/jobs.py +105 -0
sqlcg/core/kuzu_backend.py +269 -0
sqlcg/core/neo4j_backend.py +195 -0
sqlcg/core/queries.py +82 -0
sqlcg/core/schema.cypher +104 -0
sqlcg/core/schema.py +48 -0
sqlcg/indexer/__init__.py +1 -0
sqlcg/indexer/dbt_adapter.py +23 -0
sqlcg/indexer/indexer.py +317 -0
sqlcg/indexer/walker.py +55 -0
sqlcg/indexer/watcher.py +195 -0
sqlcg/lineage/__init__.py +1 -0
sqlcg/lineage/aggregator.py +58 -0
sqlcg/lineage/schema_resolver.py +198 -0
sqlcg/metrics/__init__.py +5 -0
sqlcg/metrics/store.py +273 -0
sqlcg/parsers/__init__.py +30 -0
sqlcg/parsers/ansi_parser.py +215 -0
sqlcg/parsers/base.py +414 -0
sqlcg/parsers/bigquery_parser.py +77 -0
sqlcg/parsers/postgres_parser.py +27 -0
sqlcg/parsers/registry.py +46 -0
sqlcg/parsers/snowflake_parser.py +148 -0
sqlcg/parsers/tsql_parser.py +27 -0
sqlcg/server/__init__.py +1 -0
sqlcg/server/exceptions.py +20 -0
sqlcg/server/models.py +83 -0
sqlcg/server/server.py +57 -0
sqlcg/server/tools.py +663 -0
sqlcg/utils/__init__.py +6 -0
sqlcg/utils/hashing.py +18 -0
sqlcg/utils/ignore.py +36 -0
sqlcg/utils/logging.py +29 -0

sqlcg/lineage/schema_resolver.py ADDED Viewed

@@ -0,0 +1,198 @@
+"""SchemaResolver for managing table schema information and resolving table/view references.
+Thread-safety: a Lock guards all cache mutations. The lock is a plain
+threading.Lock and is NOT re-entrant, so code that holds it must never call
+another method that acquires it. Do not share a SchemaResolver instance across concurrent
+jobs — construct one per re-index job instead (see jobs.py).
+"""
+import copy
+import threading
+from pathlib import Path
+from typing import Any
+from sqlcg.utils.logging import getLogger
+logger = getLogger(__name__)
+class SchemaResolver:
+    """Manages table schema information for SQL parsing and column lineage.
+    Attributes:
+        dialect: SQL dialect (e.g., "snowflake", "bigquery", None for ANSI)
+        _tables: Internal dict of (catalog, db, table) -> [col_names]
+        _view_bodies: Mapping of view names to ParsedFile objects
+        _lock: threading.Lock protecting mutations and cache
+        _cache: Manual cache dict for as_dict() results
+    """
+    def __init__(self, dialect: str | None = None):
+        """Create an empty resolver with no tables or views registered.
+        Args:
+            dialect: Optional SQL dialect name, stored as-is on the instance.
+        """
+        # Guards every mutation of the state created below.
+        self._lock = threading.Lock()
+        # (catalog, db, table) -> column names
+        self._tables: dict[tuple[str | None, str | None, str], list[str]] = {}
+        # view name -> ParsedFile
+        self._view_bodies: dict[str, Any] = {}
+        # Memoised as_dict() result; None means it has to be rebuilt.
+        self._cache: dict | None = None
+        self.dialect = dialect
+    def add_create_table(self, ast: Any) -> None:
+        """Register the table declared by a CREATE statement.
+        Nodes that are not ``exp.Create``, have no target, or yield an empty
+        table name are ignored.
+        Args:
+            ast: sqlglot AST node (exp.Create)
+        """
+        import sqlglot.expressions as exp
+        if not isinstance(ast, exp.Create):
+            return
+        target = ast.this
+        if not target:
+            return
+        # A Schema node wraps the table reference; unwrap it to reach the table.
+        table_node = target.this if isinstance(target, exp.Schema) else target
+        catalog, db, table_name = self._extract_table_parts(table_node)
+        if not table_name:
+            return
+        # Every ColumnDef found anywhere under the statement contributes a column.
+        columns = [node.name for node in ast.walk() if isinstance(node, exp.ColumnDef)]
+        with self._lock:
+            self._tables[(catalog, db, table_name)] = columns
+            self._cache = None  # the memoised as_dict() result is now stale
+    def add_view_sources(self, sources: dict[str, Any]) -> None:
+        """Merge view definitions into the resolver.
+        Entries for view names that are already registered are overwritten.
+        Args:
+            sources: Mapping of view names to ParsedFile objects
+        """
+        with self._lock:
+            self._view_bodies |= sources
+            # Drop the memoised as_dict() result, as the other mutators do.
+            self._cache = None
+    def add_dbt_manifest(self, manifest_path: str | Path) -> None:
+        """Load and register relation schemas from a dbt manifest.
+        Reads ``manifest.json``, collects the documented column names of every
+        relation-like node and registers them under (database, schema, name).
+        A missing, unreadable or malformed manifest is logged as a WARNING and
+        leaves the resolver unchanged; malformed individual nodes are skipped.
+        Args:
+            manifest_path: Path to dbt manifest.json
+        """
+        import json
+        # dbt tags relation-producing nodes as model/seed/snapshot; "table" and
+        # "view" are materializations rather than resource types, but are still
+        # accepted so inputs that relied on the old filter keep working.
+        relation_types = {"model", "seed", "snapshot", "table", "view"}
+        manifest_path = Path(manifest_path)
+        if not manifest_path.exists():
+            logger.warning("dbt manifest not found: %s", manifest_path)
+            return
+        try:
+            with open(manifest_path, encoding="utf-8") as f:
+                manifest = json.load(f)
+        except (OSError, ValueError) as exc:
+            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+            logger.warning("Failed to load dbt manifest %s: %s", manifest_path, exc)
+            return
+        nodes = manifest.get("nodes", {}) if isinstance(manifest, dict) else None
+        if not isinstance(nodes, dict):
+            logger.warning("dbt manifest %s has no usable 'nodes' mapping", manifest_path)
+            return
+        # Collect first and publish under the lock in one step, so a bad node can
+        # never leave the table map half-updated next to a stale cache.
+        loaded: dict[tuple[str | None, str | None, str], list[str]] = {}
+        for node_data in nodes.values():
+            if not isinstance(node_data, dict):
+                continue
+            if node_data.get("resource_type") not in relation_types:
+                continue
+            table_name = node_data.get("name")
+            if not table_name:
+                continue
+            # "columns" may be absent or null when the model is undocumented.
+            columns = node_data.get("columns")
+            col_names = list(columns) if isinstance(columns, dict) else []
+            # Empty database/schema strings are normalised to None.
+            key = (node_data.get("database") or None, node_data.get("schema") or None, table_name)
+            loaded[key] = col_names
+        with self._lock:
+            self._tables.update(loaded)
+            self._cache = None  # Invalidate cache
+    def add_information_schema(self, csv_path: str | Path) -> None:
+        """Placeholder for loading schemas from an information_schema CSV export.
+        Args:
+            csv_path: Path to CSV file (currently unused)
+        Raises:
+            NotImplementedError: Always; the feature is deferred to v2.
+        """
+        message = "--schema-from-info-schema is not yet implemented (v2)"
+        raise NotImplementedError(message)
+    def as_dict(self) -> dict:
+        """Return the schema as a nested dict: {catalog: {db: {table: [cols]}}}.
+        The nested dict is built on first use and memoised until the next
+        mutation resets the cache.
+        Returns:
+            A deep copy of the memoised dict, so the caller may mutate the
+            result without touching the resolver's internal state.
+        """
+        with self._lock:
+            cached = self._cache
+            if cached is None:
+                cached = self._cache = self._build_dict()
+            return copy.deepcopy(cached)
+    def _build_dict(self) -> dict:
+        """Fold the flat table map into a nested dict (caller must hold self._lock).
+        A falsy catalog or db contributes no nesting level, so a table without
+        either sits directly at the top level.
+        Returns:
+            Nested dictionary structure
+        """
+        nested: dict = {}
+        for (catalog, database, table), columns in self._tables.items():
+            level = nested
+            if catalog:
+                level = level.setdefault(catalog, {})
+            if database:
+                level = level.setdefault(database, {})
+            level[table] = columns
+        return nested
+    @staticmethod
+    def _extract_table_parts(table_expr: Any) -> tuple[str | None, str | None, str]:
+        """Split a table expression into its catalog, db and table name.
+        Args:
+            table_expr: sqlglot table expression
+        Returns:
+            Tuple of (catalog, db, table_name). Only ``exp.Table`` nodes can
+            supply catalog and db; every other node yields None for both.
+        """
+        import sqlglot.expressions as exp
+        if isinstance(table_expr, exp.Table):
+            # catalog / db / name are read straight off the Table node.
+            return (table_expr.catalog, table_expr.db, table_expr.name)
+        if isinstance(table_expr, exp.Identifier):
+            return (None, None, table_expr.name)
+        # Any other node: fall back to its ``name`` attribute, or "" if absent.
+        fallback_name = getattr(table_expr, "name", "")
+        return (None, None, fallback_name)

sqlcg/metrics/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Metrics collection and reporting for SQL Code Graph."""
+from sqlcg.metrics.store import MetricsStore
+__all__ = ["MetricsStore"]

sqlcg/metrics/store.py ADDED Viewed

@@ -0,0 +1,273 @@
+"""SQLite-based metrics storage for SQL Code Graph.
+Importable without KùzuDB. All writes are wrapped in try/except with
+WARNING-level logging on failure. Opt-out via SQLCG_METRICS=0.
+"""
+import logging
+import os
+import sqlite3
+from datetime import UTC, datetime
+from pathlib import Path
+logger = logging.getLogger(__name__)
+class MetricsStore:
+    """SQLite metrics store with append-only design.
+    All write operations are best-effort: failures are logged as WARNING
+    and do not raise. This ensures that metrics collection never crashes
+    the application.
+    Use as a context manager to ensure proper cleanup.
+    """
+    def __init__(self, db_path: Path | str) -> None:
+        """Initialize metrics store.
+        Metrics are disabled when the SQLCG_METRICS environment variable is
+        "0". They are also disabled, with a WARNING logged, when the database
+        directory cannot be created or the database cannot be opened, so that
+        setting up metrics never crashes the application. While disabled, no
+        connection is held.
+        Args:
+            db_path: Path to SQLite database file.
+        """
+        self.db_path = Path(db_path)
+        self._conn: sqlite3.Connection | None = None
+        # Check if metrics are disabled
+        self._enabled = os.environ.get("SQLCG_METRICS", "").strip().lower() != "0"
+        if not self._enabled:
+            return
+        try:
+            # Ensure parent directory exists
+            self.db_path.parent.mkdir(parents=True, exist_ok=True)
+            self._conn = sqlite3.connect(
+                str(self.db_path),
+                check_same_thread=True,
+                timeout=5.0,
+            )
+        except (OSError, sqlite3.Error) as e:
+            # Best-effort contract: degrade to a disabled store instead of raising.
+            logger.warning("Failed to open metrics database %s: %s", self.db_path, e)
+            self._enabled = False
+            return
+        # Enable WAL mode for better concurrency
+        try:
+            self._conn.execute("PRAGMA journal_mode=WAL")
+        except sqlite3.Error as e:
+            # WAL is only an optimisation; the store still works without it.
+            logger.debug("Could not enable WAL journal mode: %s", e)
+    def __enter__(self) -> "MetricsStore":
+        """Enter the ``with`` block, yielding this store itself."""
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Close the connection when the ``with`` block ends.
+        Returns None, so an exception raised inside the block is not suppressed.
+        """
+        self.close()
+    def close(self) -> None:
+        """Close the database connection, if one is open.
+        Safe to call more than once; SQLite errors raised while closing are
+        ignored and the connection reference is dropped either way.
+        """
+        conn = self._conn
+        if conn is None:
+            return
+        try:
+            conn.close()
+        except sqlite3.Error:
+            pass  # Silently ignore close errors
+        finally:
+            self._conn = None
+    def init_schema(self) -> None:
+        """Create the metrics tables if they are missing (safe to call repeatedly).
+        Tables:
+        - tool_calls: MCP tool invocations with timing
+        - index_runs: Indexer.index_repo() invocations with parse stats
+        - feedback: User feedback labels (TP/FP) for queries
+        Does nothing when metrics are disabled or no connection is open. A
+        SQLite failure is logged as a WARNING instead of being raised.
+        """
+        if self._conn is None or not self._enabled:
+            return
+        # One CREATE TABLE IF NOT EXISTS statement per table, run in this order.
+        ddl_statements = (
+            """
+                CREATE TABLE IF NOT EXISTS tool_calls (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    timestamp TEXT NOT NULL,
+                    tool_name TEXT NOT NULL,
+                    duration_ms REAL NOT NULL,
+                    success INTEGER NOT NULL
+                )
+                """,
+            """
+                CREATE TABLE IF NOT EXISTS index_runs (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    timestamp TEXT NOT NULL,
+                    repo_path TEXT NOT NULL,
+                    files_parsed INTEGER NOT NULL,
+                    parse_errors INTEGER NOT NULL,
+                    tables_found INTEGER NOT NULL,
+                    lineage_edges INTEGER NOT NULL,
+                    duration_ms REAL NOT NULL
+                )
+                """,
+            """
+                CREATE TABLE IF NOT EXISTS feedback (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    timestamp TEXT NOT NULL,
+                    tool_name TEXT NOT NULL,
+                    query TEXT NOT NULL,
+                    label TEXT NOT NULL,
+                    note TEXT
+                )
+                """,
+        )
+        try:
+            for ddl in ddl_statements:
+                self._conn.execute(ddl)
+            self._conn.commit()
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to initialize metrics schema: {e}")
+    def record_tool_call(
+        self,
+        tool_name: str,
+        duration_ms: float,
+        success: bool = True,
+    ) -> None:
+        """Append one row to tool_calls describing a tool invocation.
+        The row is stamped with the current UTC time in ISO format. Does nothing
+        when metrics are disabled; SQLite failures are logged as a WARNING.
+        Args:
+            tool_name: Name of the MCP tool.
+            duration_ms: Execution time in milliseconds.
+            success: Whether the call succeeded (stored as 0/1).
+        """
+        if self._conn is None or not self._enabled:
+            return
+        insert_sql = """
+                INSERT INTO tool_calls (timestamp, tool_name, duration_ms, success)
+                VALUES (?, ?, ?, ?)
+                """
+        try:
+            row = (datetime.now(UTC).isoformat(), tool_name, duration_ms, int(success))
+            self._conn.execute(insert_sql, row)
+            self._conn.commit()
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to record tool call: {e}")
+    def record_index_run(
+        self,
+        repo_path: str,
+        files_parsed: int,
+        parse_errors: int,
+        tables_found: int,
+        lineage_edges: int,
+        duration_ms: float,
+    ) -> None:
+        """Append one row to index_runs with the statistics of an index run.
+        The row is stamped with the current UTC time in ISO format. Does nothing
+        when metrics are disabled; SQLite failures are logged as a WARNING.
+        Args:
+            repo_path: Path to the indexed repository.
+            files_parsed: Number of SQL files parsed.
+            parse_errors: Number of files with parse errors.
+            tables_found: Number of table/view nodes created.
+            lineage_edges: Number of lineage edges created.
+            duration_ms: Total duration in milliseconds.
+        """
+        if self._conn is None or not self._enabled:
+            return
+        insert_sql = """
+                INSERT INTO index_runs
+                (timestamp, repo_path, files_parsed, parse_errors, tables_found,
+                 lineage_edges, duration_ms)
+                VALUES (?, ?, ?, ?, ?, ?, ?)
+                """
+        try:
+            # Values are listed in the same order as the column list above.
+            stats = (repo_path, files_parsed, parse_errors, tables_found, lineage_edges, duration_ms)
+            self._conn.execute(insert_sql, (datetime.now(UTC).isoformat(), *stats))
+            self._conn.commit()
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to record index run: {e}")
+    def record_feedback(
+        self,
+        tool_name: str,
+        query: str,
+        label: str,
+        note: str = "",
+    ) -> None:
+        """Append one row to feedback with a user's verdict on a tool result.
+        The row is stamped with the current UTC time in ISO format. Does nothing
+        when metrics are disabled; SQLite failures are logged as a WARNING.
+        Args:
+            tool_name: Name of the tool being evaluated.
+            query: The query or pattern that was evaluated.
+            label: Feedback label: "TP" or "FP" (stored as given, not validated here).
+            note: Optional user note; cut to 500 characters, stored as NULL when empty.
+        """
+        if self._conn is None or not self._enabled:
+            return
+        insert_sql = """
+                INSERT INTO feedback (timestamp, tool_name, query, label, note)
+                VALUES (?, ?, ?, ?, ?)
+                """
+        try:
+            recorded_at = datetime.now(UTC).isoformat()
+            # An empty note becomes NULL; a long one keeps its first 500 characters.
+            stored_note = None if not note else note[:500]
+            self._conn.execute(insert_sql, (recorded_at, tool_name, query, label, stored_note))
+            self._conn.commit()
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to record feedback: {e}")
+    def execute_query(self, query: str, params: tuple | None = None) -> list[tuple]:
+        """Run a query against the metrics database and fetch every row.
+        The statement is passed to SQLite as given; it is intended for SELECT
+        queries, and nothing is committed here.
+        Args:
+            query: SQL SELECT query.
+            params: Optional tuple of query parameters for placeholders.
+        Returns:
+            List of result tuples; an empty list when metrics are disabled or
+            SQLite reports an error (which is logged as a WARNING).
+        """
+        if self._conn is None or not self._enabled:
+            return []
+        # A missing or empty params tuple means the query runs without bindings.
+        call_args = (query, params) if params else (query,)
+        try:
+            return self._conn.execute(*call_args).fetchall()
+        except sqlite3.Error as e:
+            logger.warning(f"Failed to execute query: {e}")
+            return []
+    def table_exists(self, table_name: str) -> bool:
+        """Report whether a table with the given name is listed in sqlite_master.
+        Args:
+            table_name: Name of the table.
+        Returns:
+            True if the table exists; False if it does not, if metrics are
+            disabled, or if SQLite reports an error.
+        """
+        if self._conn is None or not self._enabled:
+            return False
+        lookup_sql = "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?"
+        try:
+            match_row = self._conn.execute(lookup_sql, (table_name,)).fetchone()
+        except sqlite3.Error:
+            return False
+        return match_row is not None

sqlcg/parsers/__init__.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Parser module for SQL lineage extraction."""
+# Import all dialect parsers to register them
+from sqlcg.parsers.ansi_parser import AnsiParser  # noqa: F401
+from sqlcg.parsers.base import (
+    ColumnRef,
+    LineageEdge,
+    ParsedFile,
+    QueryNode,
+    SqlParser,
+    TableRef,
+)
+from sqlcg.parsers.bigquery_parser import BigQueryParser  # noqa: F401
+from sqlcg.parsers.postgres_parser import PostgresParser  # noqa: F401
+from sqlcg.parsers.snowflake_parser import SnowflakeParser  # noqa: F401
+from sqlcg.parsers.tsql_parser import TsqlParser  # noqa: F401
+# Names re-exported by this package: first those imported from
+# sqlcg.parsers.base, then the dialect parser classes imported above.
+__all__ = [
+    "TableRef",
+    "ColumnRef",
+    "LineageEdge",
+    "QueryNode",
+    "ParsedFile",
+    "SqlParser",
+    "AnsiParser",
+    "BigQueryParser",
+    "PostgresParser",
+    "SnowflakeParser",
+    "TsqlParser",
+]