sqlprism 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ """SQLMesh model renderer.
2
+
3
+ Runs an inline Python script via `uv run python` in the sqlmesh project's
4
+ own virtualenv. The script uses sqlmesh's Python API to load the project,
5
+ create a local DuckDB gateway (no remote connections needed), render all
6
+ models, and output JSON to stdout.
7
+
8
+ This avoids needing sqlmesh as a dependency of this project — it uses
9
+ whatever sqlmesh version the project already has installed.
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import shlex
15
+ import subprocess
16
+ import textwrap
17
+ from pathlib import Path
18
+
19
+ from sqlprism.languages.sql import SqlParser
20
+ from sqlprism.languages.utils import build_env, enrich_nodes, find_venv_dir
21
+ from sqlprism.types import ParseResult
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Inline script executed inside the sqlmesh project's own venv, invoked as:
#   <sqlmesh_command> -c <script> <project_path> <dialect> <gateway> <variables-json>
# It configures an in-memory DuckDB gateway (so no remote warehouse
# connection is needed), renders every model in the project, and writes a
# single JSON object to stdout:
#   {"rendered": {model_name: sql, ...},
#    "errors":   [{"model": ..., "error": ...}, ...]}
# Per-model render failures are collected in "errors" rather than aborting
# the whole run.
_RENDER_SCRIPT = textwrap.dedent("""\
import json
import sys
import os

# Positional arguments passed by SqlMeshRenderer._run_render_script.
project_path = sys.argv[1]
dialect = sys.argv[2]
gateway = sys.argv[3]
variables = json.loads(sys.argv[4])

from sqlmesh import Context
from sqlmesh.core.config import (
    Config, DuckDBConnectionConfig, GatewayConfig, ModelDefaultsConfig,
)

# Build a minimal local config: DuckDB-only gateway under the requested name.
config = Config(
    model_defaults=ModelDefaultsConfig(dialect=dialect),
    gateways={gateway: GatewayConfig(connection=DuckDBConnectionConfig())},
    default_gateway=gateway,
    variables=variables,
)

context = Context(paths=[project_path], config=config)

rendered = {}
errors = []
for model_name in context.models:
    try:
        query = context.render(model_name)
        sql = query.sql(dialect=dialect)
        if sql:
            rendered[model_name] = sql
    except Exception as e:
        # Best-effort: record the failure and keep rendering other models.
        errors.append({"model": model_name, "error": str(e)})

json.dump({"rendered": rendered, "errors": errors}, sys.stdout)
""")
63
+
64
+
65
class SqlMeshRenderer:
    """Renders sqlmesh models into ``ParseResult`` objects via subprocess.

    Runs an inline Python script inside the sqlmesh project's own virtualenv
    to load the project, render every model to SQL, and output JSON to stdout.
    The rendered SQL is then parsed by ``SqlParser``. This avoids requiring
    sqlmesh as a direct dependency of the indexer.
    """

    def __init__(self, sql_parser: SqlParser | None = None):
        """Initialise the renderer.

        Args:
            sql_parser: ``SqlParser`` instance to use for parsing rendered SQL.
                Creates a default instance if not provided.
        """
        self.sql_parser = sql_parser or SqlParser()

    def render_project(
        self,
        project_path: str | Path,
        env_file: str | Path | None = None,
        variables: dict[str, str | int] | None = None,
        gateway: str = "local",
        dialect: str = "athena",
        sqlmesh_command: str = "uv run python",
        venv_dir: str | Path | None = None,
        schema_catalog: dict | None = None,
    ) -> dict[str, ParseResult]:
        """Render all models in a sqlmesh project.

        Args:
            project_path: Path to the sqlmesh project directory (containing config.yaml)
            env_file: Path to .env file to source before loading context
            variables: Extra sqlmesh variables (e.g. {"GRACE_PERIOD": 7})
            gateway: Gateway name to use (default "local" — uses duckdb, no remote deps)
            dialect: SQL dialect for rendering output
            sqlmesh_command: Command to run python in the sqlmesh venv (default: "uv run python")
            venv_dir: Directory to run from (where .venv lives). Auto-detects if not set.
            schema_catalog: Optional schema catalog passed through to
                ``SqlParser.parse`` (as its ``schema`` argument) for
                column-level resolution.

        Returns:
            Dict mapping model name -> ParseResult

        Raises:
            RuntimeError: If the render subprocess exits non-zero or emits
                output that is not valid JSON.
            ValueError: If ``sqlmesh_command`` fails the allowlist check.
        """
        project_path = Path(project_path).resolve()

        # Determine where to run uv from (where .venv lives)
        cwd = Path(venv_dir).resolve() if venv_dir else find_venv_dir(project_path)

        env = build_env(env_file)

        # Run the render script in the project's venv
        models, errors = self._run_render_script(
            project_path=project_path,
            cwd=cwd,
            env=env,
            variables=variables or {},
            gateway=gateway,
            dialect=dialect,
            sqlmesh_command=sqlmesh_command,
        )

        # Per-model render failures are non-fatal: log each and keep going.
        for err in errors:
            logger.warning(
                "sqlmesh render error for model %s: %s",
                err.get("model", "<unknown>"),
                err.get("error", "<no message>"),
            )

        results: dict[str, ParseResult] = {}
        for model_name, rendered_sql in models.items():
            # sqlmesh model names look like '"db"."schema"."model"'; turn
            # them into a pseudo file path so SqlParser gets a .sql "file".
            clean_name = model_name.strip('"').replace('"."', "/")
            result = self.sql_parser.parse(clean_name + ".sql", rendered_sql, schema=schema_catalog)
            enrich_nodes(result, "sqlmesh_model", model_name)

            results[model_name] = result

        return results

    def _run_render_script(
        self,
        project_path: Path,
        cwd: Path,
        env: dict[str, str],
        variables: dict[str, str | int],
        gateway: str,
        dialect: str,
        sqlmesh_command: str,
    ) -> tuple[dict[str, str], list[dict]]:
        """Run the inline render script via subprocess.

        Returns:
            Tuple of ``({model_name: sql}, errors)`` decoded from the
            script's JSON output.

        Raises:
            RuntimeError: On non-zero exit, or when stdout is not valid JSON
                (e.g. something in the venv printed warnings to stdout).
            ValueError: If ``sqlmesh_command`` fails the allowlist check.
        """
        _validate_command(sqlmesh_command, allowed_keywords={"python", "sqlmesh", "uv"})
        cmd = shlex.split(sqlmesh_command) + [
            "-c",
            _RENDER_SCRIPT,
            str(project_path),
            dialect,
            gateway,
            json.dumps(variables),
        ]

        result = subprocess.run(
            cmd,
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=600,  # 10 min timeout for large projects
        )

        if result.returncode != 0:
            raise RuntimeError(f"sqlmesh render failed (exit {result.returncode}):\n{result.stderr}")

        try:
            output = json.loads(result.stdout)
        except json.JSONDecodeError as exc:
            # Anything else printed to stdout (warnings, deprecation notices,
            # stray prints in the project's config) corrupts the JSON payload.
            # Surface both streams so the failure is debuggable.
            raise RuntimeError(
                "sqlmesh render produced invalid JSON on stdout:\n"
                f"stdout: {result.stdout[:2000]}\n"
                f"stderr: {result.stderr[:2000]}"
            ) from exc
        return output.get("rendered", {}), output.get("errors", [])
181
+
182
+
183
+ def _validate_command(command: str, allowed_keywords: set[str]) -> None:
184
+ """Validate a subprocess command against an allowlist.
185
+
186
+ The first token of the command must contain one of the allowed keywords.
187
+ Rejects shell metacharacters that could enable command injection.
188
+ """
189
+ # Reject shell metacharacters
190
+ dangerous_chars = set(";|&`$(){}!")
191
+ if dangerous_chars & set(command):
192
+ raise ValueError(f"Command contains disallowed shell characters: {command!r}")
193
+
194
+ parts = shlex.split(command)
195
+ if not parts:
196
+ raise ValueError("Empty command")
197
+
198
+ # The base command (first token) must exactly match an allowed keyword
199
+ base = parts[0].rsplit("/", 1)[-1] # strip path prefix
200
+ if base not in allowed_keywords:
201
+ raise ValueError(
202
+ f"Command {parts[0]!r} not in allowlist. Base command must be one of: {', '.join(sorted(allowed_keywords))}"
203
+ )
@@ -0,0 +1,73 @@
1
+ """Shared utilities for renderer modules (sqlmesh, dbt)."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from sqlprism.types import NodeResult, ParseResult
7
+
8
+
9
+ def find_venv_dir(project_path: Path) -> Path:
10
+ """Find the directory containing .venv for uv run.
11
+
12
+ Checks project_path first, then walks up to 3 parent levels.
13
+ Falls back to project_path if no .venv found.
14
+ """
15
+ if (project_path / ".venv").exists():
16
+ return project_path
17
+ current = project_path.parent
18
+ for _ in range(3):
19
+ if (current / ".venv").exists():
20
+ return current
21
+ current = current.parent
22
+ return project_path
23
+
24
+
25
def parse_dotenv(env_path: Path) -> dict[str, str]:
    """Parse a .env file into a dict.

    Handles comments, blank lines, optional shell-style ``export `` prefixes,
    and properly strips matching quotes (single or double) from values.
    Lines without ``=`` or with an empty key are skipped.

    Args:
        env_path: Path to an existing, readable .env file.

    Returns:
        Mapping of variable name to (unquoted) value. On duplicate keys,
        the last occurrence wins.
    """
    result: dict[str, str] = {}
    for line in env_path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # Accept "export KEY=value" lines, common in .env files meant to be
        # source-able by a shell.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if not key:
            # A line like "=value" has no variable name; ignore it.
            continue
        value = value.strip()
        # Strip matching quotes only (not mismatched ones)
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        result[key] = value
    return result
46
+
47
+
48
+ def build_env(env_file: str | Path | None = None) -> dict[str, str]:
49
+ """Build subprocess environment, optionally loading a .env file."""
50
+ env = os.environ.copy()
51
+ if env_file:
52
+ env_path = Path(env_file).resolve()
53
+ if env_path.exists():
54
+ env.update(parse_dotenv(env_path))
55
+ return env
56
+
57
+
58
def enrich_nodes(result: ParseResult, metadata_key: str, metadata_value: str) -> None:
    """Stamp ``metadata_key: metadata_value`` onto every node (mutates in place).

    ``NodeResult`` is frozen, so each node is rebuilt with the extended
    metadata dict and ``result.nodes`` is replaced wholesale.
    """
    result.nodes = [
        NodeResult(
            kind=node.kind,
            name=node.name,
            line_start=node.line_start,
            line_end=node.line_end,
            metadata={**(node.metadata or {}), metadata_key: metadata_value},
        )
        for node in result.nodes
    ]
sqlprism/types.py ADDED
@@ -0,0 +1,190 @@
1
+ """Shared data types for the SQL indexer.
2
+
3
+ These dataclasses define the contract between parsers and the indexer orchestrator.
4
+ Every language parser returns a ParseResult. The orchestrator consumes ParseResults
5
+ and writes to DuckDB. Parsers never touch the database. The orchestrator never
6
+ does language-specific parsing.
7
+ """
8
+
9
+ from dataclasses import dataclass, field
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class NodeResult:
14
+ """A nameable entity found in a file.
15
+
16
+ Nodes are the universal unit of the knowledge graph. A node is anything
17
+ a parser identifies as structurally meaningful: a table, view, CTE,
18
+ function, class, module, API endpoint, Terraform resource, etc.
19
+
20
+ The ``kind`` field is parser-defined and unconstrained -- each language
21
+ emits whatever kinds are meaningful for it.
22
+
23
+ Attributes:
24
+ kind: Entity type (e.g. ``"table"``, ``"view"``, ``"cte"``).
25
+ name: Unqualified entity name (e.g. ``"orders"``).
26
+ line_start: First line in the source file, or ``None`` if unknown.
27
+ line_end: Last line in the source file, or ``None`` if unknown.
28
+ metadata: Arbitrary parser-supplied metadata (schema, dialect, filters, etc.).
29
+ """
30
+
31
+ kind: str
32
+ name: str
33
+ line_start: int | None = None
34
+ line_end: int | None = None
35
+ metadata: dict | None = None
36
+
37
+
38
+ @dataclass(frozen=True)
39
+ class EdgeResult:
40
+ """A relationship between two entities.
41
+
42
+ Edges reference nodes by ``(name, kind)`` pairs, not database IDs. The
43
+ indexer orchestrator resolves these to node IDs during insertion. This
44
+ means parsers don't need to know about the database and parse order
45
+ doesn't matter.
46
+
47
+ The target may be in another file or even another repo. If unresolved at
48
+ insert time, the orchestrator creates a phantom node.
49
+
50
+ Attributes:
51
+ source_name: Name of the source node.
52
+ source_kind: Kind of the source node (e.g. ``"query"``).
53
+ target_name: Name of the target node.
54
+ target_kind: Kind of the target node (e.g. ``"table"``).
55
+ relationship: Edge label (e.g. ``"references"``, ``"defines"``,
56
+ ``"inserts_into"``, ``"cte_references"``).
57
+ context: Human-readable context (e.g. ``"FROM clause"``, ``"JOIN clause"``).
58
+ metadata: Arbitrary edge metadata (source_schema, target_schema, etc.).
59
+ """
60
+
61
+ source_name: str
62
+ source_kind: str
63
+ target_name: str
64
+ target_kind: str
65
+ relationship: str
66
+ context: str | None = None
67
+ metadata: dict | None = None
68
+
69
+
70
+ @dataclass(frozen=True)
71
+ class ColumnUsageResult:
72
+ """SQL-specific: column-level lineage from sqlglot.
73
+
74
+ Records which columns are used where and how. Only the SQL parser
75
+ populates these -- all other parsers return an empty list.
76
+
77
+ This data is stored in a separate table from edges because column
78
+ usage is high-volume with its own query patterns (flat scans, not
79
+ graph traversals).
80
+
81
+ Attributes:
82
+ node_name: Name of the query/CTE/view that uses this column.
83
+ node_kind: Kind of the owning node (e.g. ``"query"``, ``"cte"``).
84
+ table_name: Source table the column belongs to.
85
+ column_name: Column name (``"*"`` for ``SELECT *``).
86
+ usage_type: How the column is used. One of ``"select"``,
87
+ ``"where"``, ``"join_on"``, ``"group_by"``, ``"order_by"``,
88
+ ``"having"``, ``"insert"``, ``"update"``, ``"partition_by"``,
89
+ ``"window_order"``, ``"qualify"``.
90
+ alias: Output alias if the column is aliased (``AS name``).
91
+ transform: Wrapping expression, e.g. ``"CAST(a.updated AS DATETIME)"``.
92
+ """
93
+
94
+ node_name: str
95
+ node_kind: str
96
+ table_name: str
97
+ column_name: str
98
+ # 'select', 'where', 'join_on', 'group_by', 'order_by', 'having', 'insert', 'update'
99
+ usage_type: str
100
+ alias: str | None = None
101
+ transform: str | None = None # wrapping expression e.g. "CAST(a.updated AS DATETIME)"
102
+
103
+
104
+ @dataclass(frozen=True)
105
+ class LineageHop:
106
+ """One hop in a column lineage chain.
107
+
108
+ Attributes:
109
+ column: Column name at this hop.
110
+ table: Table, CTE, or subquery name at this hop.
111
+ expression: Transform applied at this hop (e.g. ``"CAST(amount AS DECIMAL)"``),
112
+ or ``None`` if the column passes through unchanged.
113
+ """
114
+
115
+ column: str
116
+ table: str # table, CTE, or subquery name
117
+ expression: str | None = None # transform at this hop, e.g. "CAST(amount AS DECIMAL)"
118
+
119
+
120
@dataclass(frozen=True)
class ColumnLineageResult:
    """End-to-end lineage for one output column.

    Traces an output column back through CTEs and subqueries to its source
    table column(s), recording each intermediate hop and any transform
    applied along the way.

    Attributes:
        output_column: Column name in the final output.
        output_node: The query, table, or view that produces this column.
        chain: Ordered hops from the output back to the source.
    """

    output_column: str
    output_node: str
    chain: list[LineageHop] = field(default_factory=list)
136
+
137
+
138
+ def parse_repo_config(
139
+ cfg: str | dict,
140
+ global_dialect: str | None = None,
141
+ ) -> tuple[str, str | None, dict[str, str] | None]:
142
+ """Parse a repo config value into (path, dialect, dialect_overrides).
143
+
144
+ Supports both simple string paths and full config dicts::
145
+
146
+ "my-repo": "/path/to/repo"
147
+ "my-repo": {"path": "/path", "dialect": "starrocks",
148
+ "dialect_overrides": {"athena/": "athena"}}
149
+ """
150
+ if isinstance(cfg, str):
151
+ return cfg, global_dialect, None
152
+ return (
153
+ cfg["path"],
154
+ cfg.get("dialect", global_dialect),
155
+ cfg.get("dialect_overrides"),
156
+ )
157
+
158
+
159
@dataclass
class ParseResult:
    """The complete output of parsing one file.

    This is the whole parser contract: a parser receives a file path plus
    its content and hands back one of these. ID assignment, edge
    resolution, and all database writes happen downstream in the
    orchestrator.

    Mutation contract:
        Deliberately **mutable** (no ``frozen=True``). Renderers and
        post-processing steps edit ``nodes``, ``edges``, and the other
        lists in-place — appending synthetic nodes, deduplicating edges,
        rewriting names during normalisation. A ParseResult is owned by a
        single file-processing pipeline and never shared across threads,
        so allocating a fresh instance for every transform would add
        complexity with no practical benefit.

    Attributes:
        language: Parser language identifier, e.g. ``"sql"``.
        nodes: Entities discovered in the file.
        edges: Relationships between entities.
        column_usage: Column-level usage records (SQL only).
        column_lineage: End-to-end column lineage chains (SQL only).
        errors: Non-fatal parse errors encountered during processing.
    """

    language: str
    nodes: list[NodeResult] = field(default_factory=list)
    edges: list[EdgeResult] = field(default_factory=list)
    column_usage: list[ColumnUsageResult] = field(default_factory=list)
    column_lineage: list[ColumnLineageResult] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)