sqlprism 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ """SQLMesh model renderer.
2
+
3
+ Runs an inline Python script via `uv run python` in the sqlmesh project's
4
+ own virtualenv. The script uses sqlmesh's Python API to load the project,
5
+ create a local DuckDB gateway (no remote connections needed), render all
6
+ models, and output JSON to stdout.
7
+
8
+ This avoids needing sqlmesh as a dependency of this project — it uses
9
+ whatever sqlmesh version the project already has installed.
10
+ """
11
+
12
+ import json
13
+ import logging
14
+ import shlex
15
+ import subprocess
16
+ import textwrap
17
+ from pathlib import Path
18
+
19
+ from sqlprism.languages.sql import SqlParser
20
+ from sqlprism.languages.utils import build_env, enrich_nodes, find_venv_dir
21
+ from sqlprism.types import ParseResult
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Inline script executed inside the sqlmesh project's own venv, invoked as:
#   <sqlmesh_command> -c <script> <project_path> <dialect> <gateway> <variables-json>
# It configures an in-memory DuckDB gateway (so no remote warehouse
# connection is needed), renders every model in the project, and writes a
# single JSON object to stdout:
#   {"rendered": {model_name: sql, ...},
#    "errors":   [{"model": ..., "error": ...}, ...]}
# Per-model render failures are collected in "errors" rather than aborting
# the whole run.
_RENDER_SCRIPT = textwrap.dedent("""\
import json
import sys
import os

# Positional arguments passed by SqlMeshRenderer._run_render_script.
project_path = sys.argv[1]
dialect = sys.argv[2]
gateway = sys.argv[3]
variables = json.loads(sys.argv[4])

from sqlmesh import Context
from sqlmesh.core.config import (
    Config, DuckDBConnectionConfig, GatewayConfig, ModelDefaultsConfig,
)

# Build a minimal local config: DuckDB-only gateway under the requested name.
config = Config(
    model_defaults=ModelDefaultsConfig(dialect=dialect),
    gateways={gateway: GatewayConfig(connection=DuckDBConnectionConfig())},
    default_gateway=gateway,
    variables=variables,
)

context = Context(paths=[project_path], config=config)

rendered = {}
errors = []
for model_name in context.models:
    try:
        query = context.render(model_name)
        sql = query.sql(dialect=dialect)
        if sql:
            rendered[model_name] = sql
    except Exception as e:
        # Best-effort: record the failure and keep rendering other models.
        errors.append({"model": model_name, "error": str(e)})

json.dump({"rendered": rendered, "errors": errors}, sys.stdout)
""")
63
+
64
+
65
class SqlMeshRenderer:
    """Renders sqlmesh models into ``ParseResult`` objects via subprocess.

    Runs an inline Python script inside the sqlmesh project's own virtualenv
    to load the project, render every model to SQL, and output JSON to stdout.
    The rendered SQL is then parsed by ``SqlParser``. This avoids requiring
    sqlmesh as a direct dependency of the indexer.
    """

    def __init__(self, sql_parser: SqlParser | None = None):
        """Initialise the renderer.

        Args:
            sql_parser: ``SqlParser`` instance to use for parsing rendered SQL.
                Creates a default instance if not provided.
        """
        self.sql_parser = sql_parser or SqlParser()

    def render_project(
        self,
        project_path: str | Path,
        env_file: str | Path | None = None,
        variables: dict[str, str | int] | None = None,
        gateway: str = "local",
        dialect: str = "athena",
        sqlmesh_command: str = "uv run python",
        venv_dir: str | Path | None = None,
        schema_catalog: dict | None = None,
    ) -> dict[str, ParseResult]:
        """Render all models in a sqlmesh project.

        Args:
            project_path: Path to the sqlmesh project directory (containing config.yaml)
            env_file: Path to .env file to source before loading context
            variables: Extra sqlmesh variables (e.g. {"GRACE_PERIOD": 7})
            gateway: Gateway name to use (default "local" — uses duckdb, no remote deps)
            dialect: SQL dialect for rendering output
            sqlmesh_command: Command to run python in the sqlmesh venv (default: "uv run python")
            venv_dir: Directory to run from (where .venv lives). Auto-detects if not set.
            schema_catalog: Optional schema catalog passed through to
                ``SqlParser.parse`` (as its ``schema`` argument) for
                column-level resolution.

        Returns:
            Dict mapping model name -> ParseResult

        Raises:
            RuntimeError: If the render subprocess exits non-zero or emits
                output that is not valid JSON.
            ValueError: If ``sqlmesh_command`` fails the allowlist check.
        """
        project_path = Path(project_path).resolve()

        # Determine where to run uv from (where .venv lives)
        cwd = Path(venv_dir).resolve() if venv_dir else find_venv_dir(project_path)

        env = build_env(env_file)

        # Run the render script in the project's venv
        models, errors = self._run_render_script(
            project_path=project_path,
            cwd=cwd,
            env=env,
            variables=variables or {},
            gateway=gateway,
            dialect=dialect,
            sqlmesh_command=sqlmesh_command,
        )

        # Per-model render failures are non-fatal: log each and keep going.
        for err in errors:
            logger.warning(
                "sqlmesh render error for model %s: %s",
                err.get("model", "<unknown>"),
                err.get("error", "<no message>"),
            )

        results: dict[str, ParseResult] = {}
        for model_name, rendered_sql in models.items():
            # sqlmesh model names look like '"db"."schema"."model"'; turn
            # them into a pseudo file path so SqlParser gets a .sql "file".
            clean_name = model_name.strip('"').replace('"."', "/")
            result = self.sql_parser.parse(clean_name + ".sql", rendered_sql, schema=schema_catalog)
            enrich_nodes(result, "sqlmesh_model", model_name)

            results[model_name] = result

        return results

    def _run_render_script(
        self,
        project_path: Path,
        cwd: Path,
        env: dict[str, str],
        variables: dict[str, str | int],
        gateway: str,
        dialect: str,
        sqlmesh_command: str,
    ) -> tuple[dict[str, str], list[dict]]:
        """Run the inline render script via subprocess.

        Returns:
            Tuple of ``({model_name: sql}, errors)`` decoded from the
            script's JSON output.

        Raises:
            RuntimeError: On non-zero exit, or when stdout is not valid JSON
                (e.g. something in the venv printed warnings to stdout).
            ValueError: If ``sqlmesh_command`` fails the allowlist check.
        """
        _validate_command(sqlmesh_command, allowed_keywords={"python", "sqlmesh", "uv"})
        cmd = shlex.split(sqlmesh_command) + [
            "-c",
            _RENDER_SCRIPT,
            str(project_path),
            dialect,
            gateway,
            json.dumps(variables),
        ]

        result = subprocess.run(
            cmd,
            cwd=cwd,
            env=env,
            capture_output=True,
            text=True,
            timeout=600,  # 10 min timeout for large projects
        )

        if result.returncode != 0:
            raise RuntimeError(f"sqlmesh render failed (exit {result.returncode}):\n{result.stderr}")

        try:
            output = json.loads(result.stdout)
        except json.JSONDecodeError as exc:
            # Anything else printed to stdout (warnings, deprecation notices,
            # stray prints in the project's config) corrupts the JSON payload.
            # Surface both streams so the failure is debuggable.
            raise RuntimeError(
                "sqlmesh render produced invalid JSON on stdout:\n"
                f"stdout: {result.stdout[:2000]}\n"
                f"stderr: {result.stderr[:2000]}"
            ) from exc
        return output.get("rendered", {}), output.get("errors", [])
181
+
182
+
183
+ def _validate_command(command: str, allowed_keywords: set[str]) -> None:
184
+ """Validate a subprocess command against an allowlist.
185
+
186
+ The first token of the command must contain one of the allowed keywords.
187
+ Rejects shell metacharacters that could enable command injection.
188
+ """
189
+ # Reject shell metacharacters
190
+ dangerous_chars = set(";|&`$(){}!")
191
+ if dangerous_chars & set(command):
192
+ raise ValueError(f"Command contains disallowed shell characters: {command!r}")
193
+
194
+ parts = shlex.split(command)
195
+ if not parts:
196
+ raise ValueError("Empty command")
197
+
198
+ # The base command (first token) must exactly match an allowed keyword
199
+ base = parts[0].rsplit("/", 1)[-1] # strip path prefix
200
+ if base not in allowed_keywords:
201
+ raise ValueError(
202
+ f"Command {parts[0]!r} not in allowlist. Base command must be one of: {', '.join(sorted(allowed_keywords))}"
203
+ )
@@ -0,0 +1,73 @@
1
+ """Shared utilities for renderer modules (sqlmesh, dbt)."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from sqlprism.types import NodeResult, ParseResult
7
+
8
+
9
+ def find_venv_dir(project_path: Path) -> Path:
10
+ """Find the directory containing .venv for uv run.
11
+
12
+ Checks project_path first, then walks up to 3 parent levels.
13
+ Falls back to project_path if no .venv found.
14
+ """
15
+ if (project_path / ".venv").exists():
16
+ return project_path
17
+ current = project_path.parent
18
+ for _ in range(3):
19
+ if (current / ".venv").exists():
20
+ return current
21
+ current = current.parent
22
+ return project_path
23
+
24
+
25
def parse_dotenv(env_path: Path) -> dict[str, str]:
    """Parse a .env file into a dict.

    Handles comments, blank lines, optional shell-style ``export `` prefixes,
    and properly strips matching quotes (single or double) from values.
    Lines without ``=`` or with an empty key are skipped.

    Args:
        env_path: Path to an existing, readable .env file.

    Returns:
        Mapping of variable name to (unquoted) value. On duplicate keys,
        the last occurrence wins.
    """
    result: dict[str, str] = {}
    for line in env_path.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # Accept "export KEY=value" lines, common in .env files meant to be
        # source-able by a shell.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        if "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if not key:
            # A line like "=value" has no variable name; ignore it.
            continue
        value = value.strip()
        # Strip matching quotes only (not mismatched ones)
        if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
            value = value[1:-1]
        result[key] = value
    return result
46
+
47
+
48
+ def build_env(env_file: str | Path | None = None) -> dict[str, str]:
49
+ """Build subprocess environment, optionally loading a .env file."""
50
+ env = os.environ.copy()
51
+ if env_file:
52
+ env_path = Path(env_file).resolve()
53
+ if env_path.exists():
54
+ env.update(parse_dotenv(env_path))
55
+ return env
56
+
57
+
58
def enrich_nodes(result: ParseResult, metadata_key: str, metadata_value: str) -> None:
    """Stamp ``metadata_key: metadata_value`` onto every node (mutates in place).

    ``NodeResult`` is frozen, so each node is rebuilt with the extended
    metadata dict and ``result.nodes`` is replaced wholesale.
    """
    result.nodes = [
        NodeResult(
            kind=node.kind,
            name=node.name,
            line_start=node.line_start,
            line_end=node.line_end,
            metadata={**(node.metadata or {}), metadata_key: metadata_value},
        )
        for node in result.nodes
    ]
sqlprism/types.py ADDED
@@ -0,0 +1,190 @@
1
+ """Shared data types for the SQL indexer.
2
+
3
+ These dataclasses define the contract between parsers and the indexer orchestrator.
4
+ Every language parser returns a ParseResult. The orchestrator consumes ParseResults
5
+ and writes to DuckDB. Parsers never touch the database. The orchestrator never
6
+ does language-specific parsing.
7
+ """
8
+
9
+ from dataclasses import dataclass, field
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class NodeResult:
14
+ """A nameable entity found in a file.
15
+
16
+ Nodes are the universal unit of the knowledge graph. A node is anything
17
+ a parser identifies as structurally meaningful: a table, view, CTE,
18
+ function, class, module, API endpoint, Terraform resource, etc.
19
+
20
+ The ``kind`` field is parser-defined and unconstrained -- each language
21
+ emits whatever kinds are meaningful for it.
22
+
23
+ Attributes:
24
+ kind: Entity type (e.g. ``"table"``, ``"view"``, ``"cte"``).
25
+ name: Unqualified entity name (e.g. ``"orders"``).
26
+ line_start: First line in the source file, or ``None`` if unknown.
27
+ line_end: Last line in the source file, or ``None`` if unknown.
28
+ metadata: Arbitrary parser-supplied metadata (schema, dialect, filters, etc.).
29
+ """
30
+
31
+ kind: str
32
+ name: str
33
+ line_start: int | None = None
34
+ line_end: int | None = None
35
+ metadata: dict | None = None
36
+
37
+
38
+ @dataclass(frozen=True)
39
+ class EdgeResult:
40
+ """A relationship between two entities.
41
+
42
+ Edges reference nodes by ``(name, kind)`` pairs, not database IDs. The
43
+ indexer orchestrator resolves these to node IDs during insertion. This
44
+ means parsers don't need to know about the database and parse order
45
+ doesn't matter.
46
+
47
+ The target may be in another file or even another repo. If unresolved at
48
+ insert time, the orchestrator creates a phantom node.
49
+
50
+ Attributes:
51
+ source_name: Name of the source node.
52
+ source_kind: Kind of the source node (e.g. ``"query"``).
53
+ target_name: Name of the target node.
54
+ target_kind: Kind of the target node (e.g. ``"table"``).
55
+ relationship: Edge label (e.g. ``"references"``, ``"defines"``,
56
+ ``"inserts_into"``, ``"cte_references"``).
57
+ context: Human-readable context (e.g. ``"FROM clause"``, ``"JOIN clause"``).
58
+ metadata: Arbitrary edge metadata (source_schema, target_schema, etc.).
59
+ """
60
+
61
+ source_name: str
62
+ source_kind: str
63
+ target_name: str
64
+ target_kind: str
65
+ relationship: str
66
+ context: str | None = None
67
+ metadata: dict | None = None
68
+
69
+
70
+ @dataclass(frozen=True)
71
+ class ColumnUsageResult:
72
+ """SQL-specific: column-level lineage from sqlglot.
73
+
74
+ Records which columns are used where and how. Only the SQL parser
75
+ populates these -- all other parsers return an empty list.
76
+
77
+ This data is stored in a separate table from edges because column
78
+ usage is high-volume with its own query patterns (flat scans, not
79
+ graph traversals).
80
+
81
+ Attributes:
82
+ node_name: Name of the query/CTE/view that uses this column.
83
+ node_kind: Kind of the owning node (e.g. ``"query"``, ``"cte"``).
84
+ table_name: Source table the column belongs to.
85
+ column_name: Column name (``"*"`` for ``SELECT *``).
86
+ usage_type: How the column is used. One of ``"select"``,
87
+ ``"where"``, ``"join_on"``, ``"group_by"``, ``"order_by"``,
88
+ ``"having"``, ``"insert"``, ``"update"``, ``"partition_by"``,
89
+ ``"window_order"``, ``"qualify"``.
90
+ alias: Output alias if the column is aliased (``AS name``).
91
+ transform: Wrapping expression, e.g. ``"CAST(a.updated AS DATETIME)"``.
92
+ """
93
+
94
+ node_name: str
95
+ node_kind: str
96
+ table_name: str
97
+ column_name: str
98
+ # 'select', 'where', 'join_on', 'group_by', 'order_by', 'having', 'insert', 'update'
99
+ usage_type: str
100
+ alias: str | None = None
101
+ transform: str | None = None # wrapping expression e.g. "CAST(a.updated AS DATETIME)"
102
+
103
+
104
+ @dataclass(frozen=True)
105
+ class LineageHop:
106
+ """One hop in a column lineage chain.
107
+
108
+ Attributes:
109
+ column: Column name at this hop.
110
+ table: Table, CTE, or subquery name at this hop.
111
+ expression: Transform applied at this hop (e.g. ``"CAST(amount AS DECIMAL)"``),
112
+ or ``None`` if the column passes through unchanged.
113
+ """
114
+
115
+ column: str
116
+ table: str # table, CTE, or subquery name
117
+ expression: str | None = None # transform at this hop, e.g. "CAST(amount AS DECIMAL)"
118
+
119
+
120
@dataclass(frozen=True)
class ColumnLineageResult:
    """End-to-end lineage for one output column.

    Traces an output column back through CTEs and subqueries to its source
    table column(s), recording each intermediate hop and any transform
    applied along the way.

    Attributes:
        output_column: Column name in the final output.
        output_node: The query, table, or view that produces this column.
        chain: Ordered hops from the output back to the source.
    """

    output_column: str
    output_node: str
    chain: list[LineageHop] = field(default_factory=list)
136
+
137
+
138
+ def parse_repo_config(
139
+ cfg: str | dict,
140
+ global_dialect: str | None = None,
141
+ ) -> tuple[str, str | None, dict[str, str] | None]:
142
+ """Parse a repo config value into (path, dialect, dialect_overrides).
143
+
144
+ Supports both simple string paths and full config dicts::
145
+
146
+ "my-repo": "/path/to/repo"
147
+ "my-repo": {"path": "/path", "dialect": "starrocks",
148
+ "dialect_overrides": {"athena/": "athena"}}
149
+ """
150
+ if isinstance(cfg, str):
151
+ return cfg, global_dialect, None
152
+ return (
153
+ cfg["path"],
154
+ cfg.get("dialect", global_dialect),
155
+ cfg.get("dialect_overrides"),
156
+ )
157
+
158
+
159
@dataclass
class ParseResult:
    """The complete output of parsing one file.

    This is the whole parser contract: a parser receives a file path plus
    its content and hands back one of these. ID assignment, edge
    resolution, and all database writes happen downstream in the
    orchestrator.

    Mutation contract:
        Deliberately **mutable** (no ``frozen=True``). Renderers and
        post-processing steps edit ``nodes``, ``edges``, and the other
        lists in-place — appending synthetic nodes, deduplicating edges,
        rewriting names during normalisation. A ParseResult is owned by a
        single file-processing pipeline and never shared across threads,
        so allocating a fresh instance for every transform would add
        complexity with no practical benefit.

    Attributes:
        language: Parser language identifier, e.g. ``"sql"``.
        nodes: Entities discovered in the file.
        edges: Relationships between entities.
        column_usage: Column-level usage records (SQL only).
        column_lineage: End-to-end column lineage chains (SQL only).
        errors: Non-fatal parse errors encountered during processing.
    """

    language: str
    nodes: list[NodeResult] = field(default_factory=list)
    edges: list[EdgeResult] = field(default_factory=list)
    column_usage: list[ColumnUsageResult] = field(default_factory=list)
    column_lineage: list[ColumnLineageResult] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)