sqlprism 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlprism/__init__.py +1 -0
- sqlprism/cli.py +625 -0
- sqlprism/core/__init__.py +0 -0
- sqlprism/core/graph.py +1547 -0
- sqlprism/core/indexer.py +677 -0
- sqlprism/core/mcp_tools.py +982 -0
- sqlprism/languages/__init__.py +28 -0
- sqlprism/languages/dbt.py +199 -0
- sqlprism/languages/sql.py +1031 -0
- sqlprism/languages/sqlmesh.py +203 -0
- sqlprism/languages/utils.py +73 -0
- sqlprism/types.py +190 -0
- sqlprism-1.0.0.dist-info/METADATA +429 -0
- sqlprism-1.0.0.dist-info/RECORD +17 -0
- sqlprism-1.0.0.dist-info/WHEEL +4 -0
- sqlprism-1.0.0.dist-info/entry_points.txt +2 -0
- sqlprism-1.0.0.dist-info/licenses/LICENSE +190 -0
sqlprism/core/indexer.py
ADDED
|
@@ -0,0 +1,677 @@
|
|
|
1
|
+
"""Indexer orchestrator.
|
|
2
|
+
|
|
3
|
+
The only component that connects parsers to storage. It:
|
|
4
|
+
1. Scans repos for SQL files
|
|
5
|
+
2. Checksums files and compares against stored checksums
|
|
6
|
+
3. Determines the dialect per file (repo default or path-based override)
|
|
7
|
+
4. Calls SqlParser with the appropriate dialect
|
|
8
|
+
5. Resolves edge references (name/kind → node IDs)
|
|
9
|
+
6. Inserts results into DuckDB via GraphDB
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import fnmatch
|
|
13
|
+
import hashlib
|
|
14
|
+
import logging
|
|
15
|
+
import subprocess
|
|
16
|
+
from collections import OrderedDict
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from sqlprism.core.graph import GraphDB
|
|
20
|
+
from sqlprism.languages import SQL_EXTENSIONS, is_sql_file
|
|
21
|
+
from sqlprism.languages.sql import SqlParser
|
|
22
|
+
from sqlprism.types import ParseResult
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Indexer:
|
|
28
|
+
"""Orchestrates parsing and indexing across repos.
|
|
29
|
+
|
|
30
|
+
Connects language parsers to the ``GraphDB`` storage layer. Handles
|
|
31
|
+
file scanning, checksum diffing, dialect resolution, and batch
|
|
32
|
+
insertion of parse results. Supports plain SQL repos, sqlmesh
|
|
33
|
+
projects, and dbt projects.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(self, graph: GraphDB):
    """Create an indexer bound to a graph store.

    Args:
        graph: The ``GraphDB`` instance that receives all parsed data.
    """
    # Renderers are built lazily on first use.
    self._dbt_renderer = None
    self._sqlmesh_renderer = None
    # One SqlParser per dialect, created on demand by get_parser().
    self._parser_cache: dict[str | None, SqlParser] = {}
    # Stat-based pre-filter cache: abs_path -> (mtime, size, checksum).
    # Lets _scan_files reuse a checksum without re-reading the file bytes
    # when both mtime and size are unchanged.
    self._file_stat_cache: OrderedDict[str, tuple[float, int, str]] = OrderedDict()
    self.graph = graph
|
|
49
|
+
|
|
50
|
+
def get_parser(self, dialect: str | None = None) -> SqlParser:
    """Return the cached ``SqlParser`` for *dialect*, building it on first use."""
    try:
        return self._parser_cache[dialect]
    except KeyError:
        # First request for this dialect: build once and remember it.
        parser = SqlParser(dialect=dialect)
        self._parser_cache[dialect] = parser
        return parser
|
|
55
|
+
|
|
56
|
+
def get_sqlmesh_renderer(self, dialect: str | None = None):
    """Return a ``SqlMeshRenderer`` whose parser matches *dialect*.

    The renderer is cached on the instance. It is rebuilt when none exists
    yet, or when a truthy *dialect* differs from the dialect of the cached
    renderer's parser. A falsy *dialect* reuses whatever is cached.
    """
    from sqlprism.languages.sqlmesh import SqlMeshRenderer

    current = self._sqlmesh_renderer
    stale = current is not None and dialect and current.sql_parser.dialect != dialect
    if current is None or stale:
        self._sqlmesh_renderer = SqlMeshRenderer(sql_parser=self.get_parser(dialect))
    return self._sqlmesh_renderer
|
|
63
|
+
|
|
64
|
+
@property
def dbt_renderer(self):
    """Lazily built ``DbtRenderer`` using the default-dialect parser."""
    if self._dbt_renderer is not None:
        return self._dbt_renderer

    # Imported here so the dbt module is only loaded when it is needed.
    from sqlprism.languages.dbt import DbtRenderer

    self._dbt_renderer = DbtRenderer(sql_parser=self.get_parser())
    return self._dbt_renderer
|
|
71
|
+
|
|
72
|
+
def reindex_repo(
    self,
    name: str,
    path: str | Path,
    dialect: str | None = None,
    dialect_overrides: dict[str, str] | None = None,
) -> dict:
    """Reindex one repo by scanning its SQL files.

    File checksums from the filesystem are compared with the checksums
    stored in the index to classify files as added, changed, or deleted.
    Only added and changed files are parsed.

    Args:
        name: Repo name in the index.
        path: Path to the repo root (resolved to an absolute path).
        dialect: Default SQL dialect (e.g. ``"starrocks"``, ``"athena"``).
        dialect_overrides: Per-path dialect overrides as
            ``{glob_pattern: dialect}``, e.g.
            ``{"athena/": "athena", "starrocks/**": "starrocks"}``.

    Returns:
        Stats dict with keys ``files_scanned``, ``files_added``,
        ``files_changed``, ``files_removed``, ``nodes_added``,
        ``edges_added``, ``column_usage_added``, ``lineage_chains``,
        ``column_usage_dropped``, ``parse_errors``, and
        ``phantoms_cleaned``.
    """
    root = Path(path).resolve()
    repo_id = self.graph.upsert_repo(name, str(root))

    # What the index believes vs. what is on disk right now.
    known = self.graph.get_file_checksums(repo_id)
    on_disk = self._scan_files(root)

    added = [p for p in on_disk if p not in known]
    changed = [p for p, digest in on_disk.items() if p in known and known[p] != digest]
    deleted = [p for p in known if p not in on_disk]

    stats = {
        "files_scanned": len(on_disk),
        "files_added": len(added),
        "files_changed": len(changed),
        "files_removed": len(deleted),
        "nodes_added": 0,
        "edges_added": 0,
        "column_usage_added": 0,
        "lineage_chains": 0,
        "column_usage_dropped": 0,
        "parse_errors": [],
    }

    # Files that vanished from disk are removed in one transaction.
    if deleted:
        with self.graph.write_transaction():
            for stale_path in deleted:
                self.graph.delete_file_data(repo_id, stale_path)

    # Schema catalog from the existing index, used for SELECT * expansion.
    schema_catalog = self.graph.get_table_columns(repo_id) or None

    previously_indexed = set(changed)
    for rel_path in [*changed, *added]:
        parser = self.get_parser(_resolve_dialect(rel_path, dialect, dialect_overrides))

        source_file = root / rel_path
        try:
            sql_text = source_file.read_text(errors="replace")
        except OSError:  # includes PermissionError
            logger.warning("Cannot read file %s — skipping", source_file)
            stats["parse_errors"].append(f"{rel_path}: unreadable (OS/permission error)")
            continue

        parsed = parser.parse(rel_path, sql_text, schema=schema_catalog)
        if parsed.errors:
            stats["parse_errors"].extend(f"{rel_path}: {err}" for err in parsed.errors)

        # Delete-old + insert-new share one transaction so a crash never
        # leaves the file "deleted but not yet reinserted".
        with self.graph.write_transaction():
            if rel_path in previously_indexed:
                self.graph.delete_file_data(repo_id, rel_path)
            file_id = self.graph.insert_file(repo_id, rel_path, "sql", on_disk[rel_path])
            self._insert_parse_result(parsed, file_id, repo_id, stats)

    # Phantom nodes that now have real counterparts are cleaned up.
    stats["phantoms_cleaned"] = self.graph.cleanup_phantoms()

    commit, branch = self._get_git_info(root)
    self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

    self.graph.clear_snippet_cache()
    return stats
|
|
182
|
+
|
|
183
|
+
def reindex_sqlmesh(
    self,
    repo_name: str,
    project_path: str | Path,
    env_file: str | Path | None = None,
    variables: dict[str, str | int] | None = None,
    dialect: str = "athena",
    sqlmesh_command: str = "uv run python",
    venv_dir: str | Path | None = None,
) -> dict:
    """Index a sqlmesh project from its rendered models.

    The project is rendered through the renderer returned by
    ``get_sqlmesh_renderer``; each rendered model's parse result is then
    written to the graph under a synthetic ``<model>.sql`` file path.

    Args:
        repo_name: Repo name in the index.
        project_path: Path to the sqlmesh project directory
            (containing ``config.yaml``).
        env_file: Optional ``.env`` file to source before rendering.
        variables: Extra sqlmesh variables (e.g. ``{"GRACE_PERIOD": 7}``).
        dialect: SQL dialect for rendering (default ``"athena"``).
        sqlmesh_command: Command to invoke Python in the sqlmesh venv.
        venv_dir: Directory containing ``.venv``. Auto-detected if not set.

    Returns:
        Stats dict with keys ``models_rendered``, ``nodes_added``,
        ``edges_added``, ``column_usage_added``, and ``lineage_chains``.
    """
    root = Path(project_path).resolve()
    repo_id = self.graph.upsert_repo(repo_name, str(root))

    # Schema catalog from the existing index, used for SELECT * expansion.
    schema_catalog = self.graph.get_table_columns(repo_id) or None

    renderer = self.get_sqlmesh_renderer(dialect)
    rendered = renderer.render_project(
        project_path=root,
        env_file=env_file,
        variables=variables,
        dialect=dialect,
        sqlmesh_command=sqlmesh_command,
        venv_dir=venv_dir,
        schema_catalog=schema_catalog,
    )

    stats = {
        "models_rendered": len(rendered),
        "nodes_added": 0,
        "edges_added": 0,
        "column_usage_added": 0,
        "lineage_chains": 0,
    }

    for model_name, parsed in rendered.items():
        # '"db"."schema"."model"' becomes 'db/schema/model.sql'.
        file_path = model_name.strip('"').replace('"."', "/") + ".sql"

        # Per-model delete + insert is atomic.
        with self.graph.write_transaction():
            self.graph.delete_file_data(repo_id, file_path)
            file_id = self.graph.insert_file(
                repo_id, file_path, "sql", _checksum_parse_result(parsed)
            )
            self._insert_parse_result(parsed, file_id, repo_id, stats)

    commit, branch = self._get_git_info(root)
    self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

    self.graph.clear_snippet_cache()
    return stats
|
|
252
|
+
|
|
253
|
+
def reindex_dbt(
    self,
    repo_name: str,
    project_path: str | Path,
    profiles_dir: str | Path | None = None,
    env_file: str | Path | None = None,
    target: str | None = None,
    dbt_command: str = "uv run dbt",
    venv_dir: str | Path | None = None,
    dialect: str | None = None,
) -> dict:
    """Index a dbt project from its compiled models.

    The project is rendered through ``self.dbt_renderer``; each returned
    parse result is written to the graph under its model path.

    Args:
        repo_name: Repo name in the index.
        project_path: Path to the dbt project directory
            (containing ``dbt_project.yml``).
        profiles_dir: Path to the directory containing ``profiles.yml``.
        env_file: Optional ``.env`` file to source before compilation.
        target: dbt target name override.
        dbt_command: Command to invoke dbt (e.g. ``"uv run dbt"``).
        venv_dir: Directory to run from (where ``.venv`` lives).
        dialect: SQL dialect for parsing compiled output.

    Returns:
        Stats dict with keys ``models_compiled``, ``nodes_added``,
        ``edges_added``, ``column_usage_added``, and ``lineage_chains``.
    """
    root = Path(project_path).resolve()
    repo_id = self.graph.upsert_repo(repo_name, str(root))

    # Schema catalog from the existing index, used for SELECT * expansion.
    schema_catalog = self.graph.get_table_columns(repo_id) or None

    renderer = self.dbt_renderer
    rendered = renderer.render_project(
        project_path=root,
        profiles_dir=profiles_dir,
        env_file=env_file,
        target=target,
        dbt_command=dbt_command,
        venv_dir=venv_dir,
        dialect=dialect,
        schema_catalog=schema_catalog,
    )

    stats = {
        "models_compiled": len(rendered),
        "nodes_added": 0,
        "edges_added": 0,
        "column_usage_added": 0,
        "lineage_chains": 0,
    }

    for model_path, parsed in rendered.items():
        # Per-model delete + insert is atomic.
        with self.graph.write_transaction():
            self.graph.delete_file_data(repo_id, model_path)
            file_id = self.graph.insert_file(
                repo_id, model_path, "sql", _checksum_parse_result(parsed)
            )
            self._insert_parse_result(parsed, file_id, repo_id, stats)

    commit, branch = self._get_git_info(root)
    self.graph.update_repo_metadata(repo_id, commit=commit, branch=branch)

    self.graph.clear_snippet_cache()
    return stats
|
|
322
|
+
|
|
323
|
+
def _insert_parse_result(
    self,
    result: ParseResult,
    file_id: int,
    repo_id: int,
    stats: dict,
) -> None:
    """Write the nodes, edges, column usage and lineage of one ParseResult.

    Shared by reindex_repo, reindex_sqlmesh, and reindex_dbt. Each section
    is sent to the graph as a single batch, and the counters in ``stats``
    are updated in place.

    Args:
        result: Parse output for a single file or model.
        file_id: ID of the file row the data belongs to.
        repo_id: ID of the repo, used when resolving names via the graph.
        stats: Mutable counters dict (``nodes_added``, ``edges_added``,
            ``column_usage_added``, ``lineage_chains``, and optionally
            ``column_usage_dropped``).
    """
    import json

    def _as_json(meta):
        # Empty or missing metadata is stored as NULL rather than "{}".
        return json.dumps(meta) if meta else None

    def _meta_value(meta, field):
        return meta.get(field) if meta else None

    # ── Nodes ──
    # The map key includes the schema so staging.orders and
    # production.orders do not collide.
    node_id_map: dict[tuple[str, str, str | None], int] = {}
    if result.nodes:
        schemas = [_meta_value(node.metadata, "schema") for node in result.nodes]
        node_rows = [
            (
                file_id,
                node.kind,
                node.name,
                result.language,
                node.line_start,
                node.line_end,
                _as_json(node.metadata),
                schema,
            )
            for node, schema in zip(result.nodes, schemas)
        ]
        new_ids = self.graph.insert_nodes_batch(node_rows)
        for node, schema, nid in zip(result.nodes, schemas, new_ids):
            node_id_map[(node.name, node.kind, schema)] = nid
        stats["nodes_added"] += len(result.nodes)

    # ── Edges ──
    if result.edges:
        edge_rows = []
        for edge in result.edges:
            meta = edge.metadata
            source_id = self._resolve_edge_endpoint(
                edge.source_name,
                edge.source_kind,
                node_id_map,
                repo_id,
                schema=_meta_value(meta, "source_schema"),
            )
            target_id = self._resolve_edge_endpoint(
                edge.target_name,
                edge.target_kind,
                node_id_map,
                repo_id,
                schema=_meta_value(meta, "target_schema"),
            )
            edge_rows.append(
                (source_id, target_id, edge.relationship, edge.context, _as_json(meta))
            )
        self.graph.insert_edges_batch(edge_rows)
        stats["edges_added"] += len(edge_rows)

    # ── Column usage ──
    if result.column_usage:
        cu_rows = []
        for cu in result.column_usage:
            # Lookup order: schema-less local node, then any local node
            # with the same (name, kind), then the graph itself.
            owner_id = node_id_map.get((cu.node_name, cu.node_kind, None))
            if not owner_id:
                owner_id = next(
                    (
                        nid
                        for (node_name, node_kind, _schema), nid in node_id_map.items()
                        if node_name == cu.node_name and node_kind == cu.node_kind
                    ),
                    owner_id,
                )
            if not owner_id:
                owner_id = self.graph.resolve_node(cu.node_name, cu.node_kind, repo_id)

            if not owner_id:
                # No owning node anywhere: count and report the dropped row.
                stats["column_usage_dropped"] = stats.get("column_usage_dropped", 0) + 1
                logger.warning(
                    "Dropped column_usage: node %s/%s not found (table=%s col=%s)",
                    cu.node_name,
                    cu.node_kind,
                    cu.table_name,
                    cu.column_name,
                )
                continue

            cu_rows.append(
                (
                    owner_id,
                    cu.table_name,
                    cu.column_name,
                    cu.usage_type,
                    file_id,
                    cu.alias,
                    cu.transform,
                )
            )
        if cu_rows:
            self.graph.insert_column_usage_batch(cu_rows)
            stats["column_usage_added"] += len(cu_rows)

    # ── Column lineage ──
    if result.column_lineage:
        lineage_rows = []
        # chain_index per (output_node, output_column) disambiguates
        # multiple lineage paths that end in the same output column.
        chain_counters: dict[tuple[str, str], int] = {}
        for cl in result.column_lineage:
            counter_key = (cl.output_node, cl.output_column)
            chain_idx = chain_counters.get(counter_key, 0)
            chain_counters[counter_key] = chain_idx + 1
            lineage_rows.extend(
                (
                    file_id,
                    cl.output_node,
                    cl.output_column,
                    chain_idx,
                    hop_idx,
                    hop.column,
                    hop.table,
                    hop.expression,
                )
                for hop_idx, hop in enumerate(cl.chain)
            )
            # Counted once per chain, not once per hop.
            stats["lineage_chains"] += 1
        if lineage_rows:
            self.graph.insert_column_lineage_batch(lineage_rows)
|
|
454
|
+
|
|
455
|
+
def _resolve_edge_endpoint(
|
|
456
|
+
self,
|
|
457
|
+
name: str,
|
|
458
|
+
kind: str,
|
|
459
|
+
local_map: dict[tuple[str, str, str | None], int],
|
|
460
|
+
repo_id: int,
|
|
461
|
+
schema: str | None = None,
|
|
462
|
+
) -> int:
|
|
463
|
+
"""Resolve an edge endpoint to a node_id."""
|
|
464
|
+
# Try with schema first, then without
|
|
465
|
+
node_id = local_map.get((name, kind, schema))
|
|
466
|
+
if node_id:
|
|
467
|
+
return node_id
|
|
468
|
+
if schema:
|
|
469
|
+
node_id = local_map.get((name, kind, None))
|
|
470
|
+
if node_id:
|
|
471
|
+
return node_id
|
|
472
|
+
|
|
473
|
+
node_id = self.graph.resolve_node(name, kind, repo_id, schema=schema)
|
|
474
|
+
if node_id:
|
|
475
|
+
return node_id
|
|
476
|
+
|
|
477
|
+
return self.graph.get_or_create_phantom(name, kind, "sql")
|
|
478
|
+
|
|
479
|
+
def _scan_files(self, repo_path: Path) -> dict[str, str]:
    """Walk *repo_path* and return ``{relative_path: sha256}`` for SQL files.

    Files whose relative path has a component starting with ``.`` or named
    like a common non-source directory are ignored. mtime + size act as a
    pre-filter: when both equal the values cached from an earlier scan, the
    cached checksum is reused and the file is not read.
    """
    skipped_dirs = ("node_modules", "__pycache__", "venv", ".venv", "target", "build")
    checksums: dict[str, str] = {}

    for candidate in repo_path.rglob("*"):
        if not (candidate.is_file() and candidate.suffix in SQL_EXTENSIONS):
            continue

        relative = candidate.relative_to(repo_path)
        if any(part.startswith(".") or part in skipped_dirs for part in relative.parts):
            continue

        try:
            info = candidate.stat()
        except OSError:
            logger.warning("Cannot stat file %s — skipping", candidate)
            continue

        cache_key = str(candidate)
        entry = self._file_stat_cache.get(cache_key)
        if entry is not None and entry[0] == info.st_mtime and entry[1] == info.st_size:
            digest = entry[2]
        else:
            try:
                payload = candidate.read_bytes()
            except OSError:
                logger.warning("Cannot read file %s — skipping", candidate)
                continue
            digest = hashlib.sha256(payload).hexdigest()
            self._file_stat_cache[cache_key] = (info.st_mtime, info.st_size, digest)
            # The cache is capped at 10,000 entries; the oldest is dropped.
            if len(self._file_stat_cache) > 10_000:
                self._file_stat_cache.popitem(last=False)

        checksums[str(relative)] = digest

    return checksums
|
|
530
|
+
|
|
531
|
+
def _get_git_info(self, repo_path: Path) -> tuple[str | None, str | None]:
|
|
532
|
+
"""Get current git commit hash and branch name."""
|
|
533
|
+
try:
|
|
534
|
+
commit = subprocess.run(
|
|
535
|
+
["git", "rev-parse", "HEAD"],
|
|
536
|
+
cwd=repo_path,
|
|
537
|
+
capture_output=True,
|
|
538
|
+
text=True,
|
|
539
|
+
timeout=5,
|
|
540
|
+
)
|
|
541
|
+
branch = subprocess.run(
|
|
542
|
+
["git", "rev-parse", "--abbrev-ref", "HEAD"],
|
|
543
|
+
cwd=repo_path,
|
|
544
|
+
capture_output=True,
|
|
545
|
+
text=True,
|
|
546
|
+
timeout=5,
|
|
547
|
+
)
|
|
548
|
+
return (
|
|
549
|
+
commit.stdout.strip() if commit.returncode == 0 else None,
|
|
550
|
+
branch.stdout.strip() if branch.returncode == 0 else None,
|
|
551
|
+
)
|
|
552
|
+
except (subprocess.TimeoutExpired, FileNotFoundError):
|
|
553
|
+
return None, None
|
|
554
|
+
|
|
555
|
+
def parse_file(
    self,
    file_path: str,
    content: str,
    dialect: str | None = None,
    schema: dict | None = None,
) -> ParseResult:
    """Parse one SQL file in memory; nothing is written to the database.

    Args:
        file_path: File path (used for naming nodes, not read from disk).
        content: Raw SQL content.
        dialect: Optional SQL dialect override.
        schema: Optional schema catalog for ``SELECT *`` expansion.

    Returns:
        A ``ParseResult`` with extracted nodes, edges, column usage,
        and lineage. Non-SQL files yield an empty result.
    """
    if is_sql_file(file_path):
        parser = self.get_parser(dialect)
        return parser.parse(file_path, content, schema=schema)
    return ParseResult(language="sql")
|
|
577
|
+
|
|
578
|
+
def parse_file_at_commit(
    self,
    repo_path: Path,
    file_path: str,
    commit: str,
    dialect: str | None = None,
) -> ParseResult | None:
    """Parse a file as it existed at a specific git commit.

    The content is read with ``git show <commit>:<file_path>`` and parsed
    without writing to the database. Used by pr_impact analysis.

    Args:
        repo_path: Absolute path to the git repo root.
        file_path: Relative file path within the repo.
        commit: Git commit hash or ref to read from.
        dialect: Optional SQL dialect override.

    Returns:
        A ``ParseResult``, or ``None`` if the file is not a SQL file, the
        commit looks like a command-line option, the file does not exist
        at that commit, or git could not be run.
    """
    if not is_sql_file(file_path):
        return None
    # A ref starting with "-" would be parsed by git as an option, not a
    # revision; refuse it instead of passing it through.
    if commit.startswith("-"):
        return None

    try:
        shown = subprocess.run(
            ["git", "show", f"{commit}:{file_path}"],
            cwd=repo_path,
            capture_output=True,
            text=True,
            # Undecodable bytes are replaced rather than raising
            # UnicodeDecodeError, matching read_text(errors="replace")
            # in reindex_repo.
            errors="replace",
            timeout=10,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing git binary and an unusable repo_path.
        return None

    if shown.returncode != 0:
        return None
    return self.get_parser(dialect).parse(file_path, shown.stdout)
|
|
616
|
+
|
|
617
|
+
def get_changed_files(self, repo_path: Path, base_commit: str) -> list[str]:
    """Get SQL files changed between a base commit and HEAD.

    Args:
        repo_path: Absolute path to the git repo root.
        base_commit: Git commit hash or ref to diff against HEAD.

    Returns:
        List of relative file paths for changed SQL files. Returns an
        empty list on git errors or timeouts, when *repo_path* is not
        usable, or when *base_commit* looks like a command-line option.
    """
    # A ref starting with "-" would be parsed by git as an option.
    if base_commit.startswith("-"):
        return []

    try:
        diff = subprocess.run(
            # -z: NUL-separated output with paths left unquoted, so names
            # containing non-ASCII or special characters come through as-is.
            ["git", "diff", "--name-only", "-z", f"{base_commit}..HEAD"],
            cwd=repo_path,
            capture_output=True,
            text=True,
            errors="replace",
            timeout=10,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing git binary and an unusable repo_path.
        return []

    if diff.returncode != 0:
        return []
    return [changed for changed in diff.stdout.split("\0") if changed and is_sql_file(changed)]
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def _resolve_dialect(
|
|
644
|
+
file_path: str,
|
|
645
|
+
default_dialect: str | None,
|
|
646
|
+
overrides: dict[str, str] | None,
|
|
647
|
+
) -> str | None:
|
|
648
|
+
"""Determine the SQL dialect for a file path.
|
|
649
|
+
|
|
650
|
+
Checks overrides first (glob patterns), falls back to default.
|
|
651
|
+
"""
|
|
652
|
+
if overrides:
|
|
653
|
+
for pattern, dialect in overrides.items():
|
|
654
|
+
# Support both "dir/" prefix matching and full glob
|
|
655
|
+
if file_path.startswith(pattern) or fnmatch.fnmatch(file_path, pattern):
|
|
656
|
+
return dialect
|
|
657
|
+
return default_dialect
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def _checksum_parse_result(result: ParseResult) -> str:
    """Return a SHA-256 hex digest of a ParseResult's structural content.

    Used for rendered models (sqlmesh/dbt), where the raw SQL text is not
    available to hash. The digest covers, in order, one line per node,
    edge, column-usage entry and column-lineage chain.
    """

    def _lineage_line(cl) -> str:
        hops = "|".join(f"{hop.table}.{hop.column}:{hop.expression}" for hop in cl.chain)
        return f"CL:{cl.output_node}:{cl.output_column}:{hops}"

    lines = [
        *(f"N:{node.kind}:{node.name}" for node in result.nodes),
        *(
            f"E:{edge.source_name}:{edge.target_name}:{edge.relationship}"
            for edge in result.edges
        ),
        *(
            f"CU:{cu.node_name}:{cu.table_name}:{cu.column_name}:{cu.usage_type}"
            for cu in result.column_usage
        ),
        *(_lineage_line(cl) for cl in result.column_lineage),
    ]
    digest = hashlib.sha256()
    digest.update("\n".join(lines).encode())
    return digest.hexdigest()
|