sqlprism 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlprism/__init__.py +1 -0
- sqlprism/cli.py +625 -0
- sqlprism/core/__init__.py +0 -0
- sqlprism/core/graph.py +1547 -0
- sqlprism/core/indexer.py +677 -0
- sqlprism/core/mcp_tools.py +982 -0
- sqlprism/languages/__init__.py +28 -0
- sqlprism/languages/dbt.py +199 -0
- sqlprism/languages/sql.py +1031 -0
- sqlprism/languages/sqlmesh.py +203 -0
- sqlprism/languages/utils.py +73 -0
- sqlprism/types.py +190 -0
- sqlprism-1.0.0.dist-info/METADATA +429 -0
- sqlprism-1.0.0.dist-info/RECORD +17 -0
- sqlprism-1.0.0.dist-info/WHEEL +4 -0
- sqlprism-1.0.0.dist-info/entry_points.txt +2 -0
- sqlprism-1.0.0.dist-info/licenses/LICENSE +190 -0
sqlprism/core/graph.py
ADDED
|
@@ -0,0 +1,1547 @@
|
|
|
1
|
+
"""DuckDB graph storage layer.
|
|
2
|
+
|
|
3
|
+
This module owns the database. It initialises the schema, handles inserts,
|
|
4
|
+
resolves edges (name/kind pairs → node IDs), manages phantom nodes, and
|
|
5
|
+
provides the query methods that MCP tools call.
|
|
6
|
+
|
|
7
|
+
No other module touches DuckDB directly.
|
|
8
|
+
|
|
9
|
+
Thread-safety model (read/write separation):
|
|
10
|
+
DuckDB provides MVCC, so concurrent reads are safe without locking.
|
|
11
|
+
Only write operations need serialisation via ``_write_lock``.
|
|
12
|
+
|
|
13
|
+
- **Read path** (``_execute_read``): creates a fresh cursor, executes,
|
|
14
|
+
and returns that cursor so the caller can fetch the results. No lock
|
|
15
|
+
needed -- safe for concurrent access from MCP query handlers while a
|
|
16
|
+
reindex is in progress.
|
|
17
|
+
|
|
18
|
+
- **Write path** (``_execute_write``): uses ``self.conn.execute()``
|
|
19
|
+
directly. Caller must hold ``_write_lock``.
|
|
20
|
+
|
|
21
|
+
- **Transactions** (``write_transaction``): acquires ``_write_lock``
|
|
22
|
+
for the full ``BEGIN .. COMMIT`` scope.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import threading
|
|
27
|
+
import warnings
|
|
28
|
+
from contextlib import contextmanager
|
|
29
|
+
from functools import lru_cache
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
|
|
32
|
+
import duckdb
|
|
33
|
+
|
|
34
|
+
_MAX_FILE_SIZE = 1 * 1024 * 1024 # 1 MB – skip snippets for oversized files
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@lru_cache(maxsize=128)
|
|
38
|
+
def _read_file_lines(path: str) -> tuple[str, ...] | None:
|
|
39
|
+
"""Read and cache file lines for snippet extraction.
|
|
40
|
+
|
|
41
|
+
Returns a tuple of lines (hashable for lru_cache), or None on error.
|
|
42
|
+
Files exceeding *_MAX_FILE_SIZE* bytes are skipped to keep memory bounded.
|
|
43
|
+
"""
|
|
44
|
+
try:
|
|
45
|
+
p = Path(path)
|
|
46
|
+
if p.stat().st_size > _MAX_FILE_SIZE:
|
|
47
|
+
return None
|
|
48
|
+
return tuple(p.read_text(errors="replace").splitlines())
|
|
49
|
+
except Exception:
|
|
50
|
+
return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
SCHEMA_SQL = """
|
|
54
|
+
CREATE SEQUENCE IF NOT EXISTS seq_repo_id START 1;
|
|
55
|
+
CREATE SEQUENCE IF NOT EXISTS seq_file_id START 1;
|
|
56
|
+
CREATE SEQUENCE IF NOT EXISTS seq_node_id START 1;
|
|
57
|
+
CREATE SEQUENCE IF NOT EXISTS seq_edge_id START 1;
|
|
58
|
+
CREATE SEQUENCE IF NOT EXISTS seq_usage_id START 1;
|
|
59
|
+
CREATE SEQUENCE IF NOT EXISTS seq_lineage_id START 1;
|
|
60
|
+
|
|
61
|
+
CREATE TABLE IF NOT EXISTS repos (
|
|
62
|
+
repo_id INTEGER PRIMARY KEY DEFAULT nextval('seq_repo_id'),
|
|
63
|
+
name TEXT NOT NULL UNIQUE,
|
|
64
|
+
path TEXT NOT NULL,
|
|
65
|
+
last_commit TEXT,
|
|
66
|
+
last_branch TEXT,
|
|
67
|
+
indexed_at TIMESTAMP
|
|
68
|
+
);
|
|
69
|
+
|
|
70
|
+
CREATE TABLE IF NOT EXISTS files (
|
|
71
|
+
file_id INTEGER PRIMARY KEY DEFAULT nextval('seq_file_id'),
|
|
72
|
+
repo_id INTEGER NOT NULL, -- logical FK to repos(repo_id)
|
|
73
|
+
path TEXT NOT NULL,
|
|
74
|
+
language TEXT NOT NULL,
|
|
75
|
+
checksum TEXT NOT NULL,
|
|
76
|
+
indexed_at TIMESTAMP DEFAULT now(),
|
|
77
|
+
UNIQUE(repo_id, path)
|
|
78
|
+
);
|
|
79
|
+
|
|
80
|
+
CREATE TABLE IF NOT EXISTS nodes (
|
|
81
|
+
node_id INTEGER PRIMARY KEY DEFAULT nextval('seq_node_id'),
|
|
82
|
+
file_id INTEGER, -- NULL for phantom nodes; logical FK to files(file_id)
|
|
83
|
+
kind TEXT NOT NULL,
|
|
84
|
+
name TEXT NOT NULL,
|
|
85
|
+
schema TEXT,
|
|
86
|
+
language TEXT NOT NULL,
|
|
87
|
+
line_start INTEGER,
|
|
88
|
+
line_end INTEGER,
|
|
89
|
+
metadata JSON,
|
|
90
|
+
UNIQUE(file_id, kind, name, schema)
|
|
91
|
+
);
|
|
92
|
+
|
|
93
|
+
CREATE TABLE IF NOT EXISTS edges (
|
|
94
|
+
edge_id INTEGER PRIMARY KEY DEFAULT nextval('seq_edge_id'),
|
|
95
|
+
source_id INTEGER NOT NULL, -- logical FK to nodes(node_id)
|
|
96
|
+
target_id INTEGER NOT NULL, -- logical FK to nodes(node_id)
|
|
97
|
+
relationship TEXT NOT NULL,
|
|
98
|
+
context TEXT,
|
|
99
|
+
metadata JSON
|
|
100
|
+
);
|
|
101
|
+
|
|
102
|
+
CREATE TABLE IF NOT EXISTS column_usage (
|
|
103
|
+
usage_id INTEGER PRIMARY KEY DEFAULT nextval('seq_usage_id'),
|
|
104
|
+
node_id INTEGER NOT NULL, -- logical FK to nodes(node_id)
|
|
105
|
+
table_name TEXT NOT NULL,
|
|
106
|
+
column_name TEXT NOT NULL,
|
|
107
|
+
usage_type TEXT NOT NULL,
|
|
108
|
+
alias TEXT,
|
|
109
|
+
transform TEXT,
|
|
110
|
+
file_id INTEGER NOT NULL -- logical FK to files(file_id)
|
|
111
|
+
);
|
|
112
|
+
|
|
113
|
+
CREATE TABLE IF NOT EXISTS column_lineage (
|
|
114
|
+
lineage_id INTEGER PRIMARY KEY DEFAULT nextval('seq_lineage_id'),
|
|
115
|
+
file_id INTEGER NOT NULL, -- logical FK to files(file_id)
|
|
116
|
+
output_node TEXT NOT NULL,
|
|
117
|
+
output_column TEXT NOT NULL,
|
|
118
|
+
chain_index INTEGER NOT NULL DEFAULT 0,
|
|
119
|
+
hop_index INTEGER NOT NULL,
|
|
120
|
+
hop_column TEXT NOT NULL,
|
|
121
|
+
hop_table TEXT NOT NULL,
|
|
122
|
+
hop_expression TEXT
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
"""
|
|
126
|
+
|
|
127
|
+
INDEX_SQL = """
|
|
128
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
|
|
129
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
|
|
130
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file_id);
|
|
131
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_kind_name ON nodes(kind, name);
|
|
132
|
+
CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_id);
|
|
133
|
+
CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_id);
|
|
134
|
+
CREATE INDEX IF NOT EXISTS idx_edges_relationship ON edges(relationship);
|
|
135
|
+
CREATE INDEX IF NOT EXISTS idx_col_table ON column_usage(table_name);
|
|
136
|
+
CREATE INDEX IF NOT EXISTS idx_col_column ON column_usage(column_name);
|
|
137
|
+
CREATE INDEX IF NOT EXISTS idx_col_table_column ON column_usage(table_name, column_name);
|
|
138
|
+
CREATE INDEX IF NOT EXISTS idx_col_usage_type ON column_usage(usage_type);
|
|
139
|
+
CREATE INDEX IF NOT EXISTS idx_lineage_output ON column_lineage(output_node, output_column);
|
|
140
|
+
CREATE INDEX IF NOT EXISTS idx_lineage_hop ON column_lineage(hop_table, hop_column);
|
|
141
|
+
CREATE INDEX IF NOT EXISTS idx_lineage_file ON column_lineage(file_id);
|
|
142
|
+
CREATE INDEX IF NOT EXISTS idx_nodes_schema ON nodes(schema);
|
|
143
|
+
"""
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class GraphDB:
|
|
147
|
+
"""DuckDB-backed knowledge graph storage.
|
|
148
|
+
|
|
149
|
+
The sole storage layer for the SQL indexer. Manages a DuckDB database
|
|
150
|
+
containing repos, files, nodes, edges, column usage, and column lineage
|
|
151
|
+
tables. Provides insert/upsert methods for the indexer and query methods
|
|
152
|
+
consumed by the MCP tool layer.
|
|
153
|
+
|
|
154
|
+
Thread-safety (read/write separation):
|
|
155
|
+
Write operations are serialised through ``_write_lock`` (a
|
|
156
|
+
``threading.RLock``). Read operations use a fresh cursor and
|
|
157
|
+
require no lock -- DuckDB MVCC ensures snapshot isolation.
|
|
158
|
+
|
|
159
|
+
The ``write_transaction()`` context manager holds the lock for the
|
|
160
|
+
full ``BEGIN .. COMMIT`` scope so no other thread can interleave
|
|
161
|
+
write statements. ``asyncio.to_thread()`` callers are safe because
|
|
162
|
+
reads are lock-free and writes acquire the lock internally.
|
|
163
|
+
"""
|
|
164
|
+
|
|
165
|
+
def __init__(self, db_path: str | Path | None = None):
    """Open the DuckDB database and make sure the schema exists.

    Args:
        db_path: Location of the DuckDB file. Any falsy value (``None``
            or an empty string) selects an in-memory database, which is
            what the tests use.
    """
    if db_path:
        self.db_path = str(db_path)
    else:
        self.db_path = ":memory:"
    self.conn = duckdb.connect(self.db_path)
    # Re-entrant so that a method already holding the lock can call
    # another locking method on the same thread.
    self._write_lock = threading.RLock()
    # Per-thread state: ``in_transaction`` is set to True only by the
    # thread that is inside write_transaction(). _execute_read consults
    # it to choose between the main connection (sees uncommitted writes)
    # and a fresh cursor.
    self._tlocal = threading.local()
    self._init_schema()
|
|
180
|
+
|
|
181
|
+
def _init_schema(self) -> None:
    """Run the schema DDL, then the index DDL, under the write lock.

    Both scripts use ``IF NOT EXISTS`` throughout, so running this
    against an already-initialised database is harmless.
    """
    with self._write_lock:
        for ddl_script in (SCHEMA_SQL, INDEX_SQL):
            self._execute_write(ddl_script)
|
|
186
|
+
|
|
187
|
+
def _execute_read(self, sql: str, params=None):
    """Run a read-only statement and return the executed cursor.

    Outside a write transaction a fresh cursor is created for the
    statement. Inside one (``self._tlocal.in_transaction`` is true for
    the calling thread) the main connection is used instead, so the read
    observes the transaction's own uncommitted writes.

    Args:
        sql: The SQL text to execute.
        params: Optional positional parameters; a falsy value means the
            statement is executed without parameters.

    Returns:
        The object returned by ``execute()``; callers fetch from it.
        On success a fresh cursor is handed back still open.

    Raises:
        Exception: Whatever ``execute()`` raises. A fresh cursor is
            closed before the error propagates.
    """
    inside_txn = getattr(self._tlocal, "in_transaction", False)
    if inside_txn:
        main = self.conn
        return main.execute(sql, params) if params else main.execute(sql)

    reader = self.conn.cursor()
    try:
        return reader.execute(sql, params) if params else reader.execute(sql)
    except Exception:
        # Nobody will receive this cursor, so release it here.
        reader.close()
        raise
|
|
211
|
+
|
|
212
|
+
def _execute_write(self, sql: str, params=None):
    """Run a statement on the main connection and return its result.

    The main connection is used (never a separate cursor) so that the
    statement takes part in whatever transaction is currently open.
    This method takes no lock itself: the caller is expected to hold
    ``_write_lock`` already, directly or through ``write_transaction()``.

    Args:
        sql: The SQL text to execute.
        params: Optional positional parameters; a falsy value means the
            statement is executed without parameters.

    Returns:
        Whatever ``self.conn.execute()`` returns.
    """
    call_args = (sql, params) if params else (sql,)
    return self.conn.execute(*call_args)
|
|
222
|
+
|
|
223
|
+
def close(self) -> None:
    """Close the DuckDB connection once no writer holds the lock."""
    self._write_lock.acquire()
    try:
        self.conn.close()
    finally:
        self._write_lock.release()
|
|
227
|
+
|
|
228
|
+
def __enter__(self) -> "GraphDB":
    """Enter a ``with`` block; the database object itself is the target."""
    return self
|
|
230
|
+
|
|
231
|
+
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave a ``with`` block by closing the connection.

    Returns ``None``, so an exception raised inside the block is not
    suppressed.
    """
    self.close()
|
|
233
|
+
|
|
234
|
+
@staticmethod
def clear_snippet_cache() -> None:
    """Drop every cached file read by ``_read_file_lines``.

    Call this after a reindex: the cache is keyed by path only, so
    without a reset snippets could be cut from outdated file content.
    """
    _read_file_lines.cache_clear()
|
|
241
|
+
|
|
242
|
+
@contextmanager
def write_transaction(self):
    """Run the enclosed block as one serialised write transaction.

    Acquires ``_write_lock``, issues ``BEGIN TRANSACTION``, yields to the
    caller, then issues ``COMMIT`` on success or ``ROLLBACK`` on failure.
    The lock is held for the whole scope, so no other thread can
    interleave write statements.

    Re-entrant: when the calling thread is already inside a transaction
    the block simply joins it (no nested ``BEGIN``); the outermost
    context decides between commit and rollback.

    Raises:
        BaseException: Anything raised by the enclosed block (or by
            ``COMMIT``) is re-raised after the rollback attempt.
    """
    if getattr(self._tlocal, "in_transaction", False):
        yield
        return
    with self._write_lock:
        self.conn.execute("BEGIN TRANSACTION")
        self._tlocal.in_transaction = True
        try:
            yield
            self.conn.execute("COMMIT")
        except BaseException:
            # BaseException, not Exception: KeyboardInterrupt, SystemExit
            # and GeneratorExit must also roll back. Otherwise the
            # connection stays inside an open transaction while the flag
            # below is reset, and the next BEGIN would be issued on top
            # of it.
            try:
                self.conn.execute("ROLLBACK")
            except Exception:
                # The transaction may already be gone (for example when
                # COMMIT itself failed). The error that got us here is
                # the one the caller needs to see, so it is re-raised
                # below instead of the rollback failure.
                pass
            raise
        finally:
            self._tlocal.in_transaction = False
|
|
267
|
+
|
|
268
|
+
@contextmanager
def transaction(self):
    """Backward-compatible alias for :meth:`write_transaction`.

    Emits a ``DeprecationWarning`` on entry and then behaves exactly
    like :meth:`write_transaction`.

    .. deprecated:: 0.6
        Use :meth:`write_transaction` instead.
    """
    # stacklevel=3: frame 1 is this generator, frame 2 is contextlib's
    # __enter__ that drives it, frame 3 is the caller's ``with`` statement.
    # That last frame is where the warning should point.
    warnings.warn(
        "transaction() is deprecated, use write_transaction() instead",
        DeprecationWarning,
        stacklevel=3,
    )
    with self.write_transaction():
        yield
|
|
282
|
+
|
|
283
|
+
# ── Repo management ──
|
|
284
|
+
|
|
285
|
+
def upsert_repo(self, name: str, path: str) -> int:
    """Return the id of the repo called *name*, creating it if needed.

    When the repo already exists but was recorded under a different
    path (it has been moved), the stored path is refreshed.

    Args:
        name: Unique repo name used as the identifier across the index.
        path: Absolute filesystem path to the repo root.

    Returns:
        The ``repo_id`` of the existing or newly inserted row.
    """
    with self._write_lock:
        found = self._execute_write("SELECT repo_id, path FROM repos WHERE name = ?", [name]).fetchone()
        if not found:
            inserted = self._execute_write(
                "INSERT INTO repos (name, path) VALUES (?, ?) RETURNING repo_id",
                [name, str(path)],
            ).fetchone()
            return inserted[0]

        repo_id = found[0]
        stored_path = found[1]
        if stored_path != str(path):
            self._execute_write(
                "UPDATE repos SET path = ? WHERE repo_id = ?",
                [str(path), repo_id],
            )
        return repo_id
|
|
311
|
+
|
|
312
|
+
def update_repo_metadata(self, repo_id: int, commit: str | None = None, branch: str | None = None) -> None:
|
|
313
|
+
"""Update the last indexed commit/branch for a repo."""
|
|
314
|
+
with self._write_lock:
|
|
315
|
+
self._execute_write(
|
|
316
|
+
"UPDATE repos SET last_commit = ?, last_branch = ?, indexed_at = now() WHERE repo_id = ?",
|
|
317
|
+
[commit, branch, repo_id],
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
def delete_repo(self, repo_id: int) -> None:
    """Remove a repo together with everything indexed for it.

    The cascade is done by hand inside a single write transaction:
    column lineage, column usage, edges, nodes and files are removed
    first, then the repo row itself.

    Args:
        repo_id: ID of the repo to delete.
    """
    with self.write_transaction():
        self._delete_repo_impl(repo_id)
|
|
332
|
+
|
|
333
|
+
def _delete_repo_impl(self, repo_id: int) -> None:
    """Issue the cascade of DELETEs for one repo.

    Takes no lock and opens no transaction itself; the caller must
    already hold ``_write_lock`` (via ``write_transaction``).

    Args:
        repo_id: ID of the repo whose rows are removed.
    """
    # Children first, parents last. Everything is located through the
    # ``files`` rows of the repo, so ``files`` has to outlive all tables
    # that are filtered by it, and ``nodes`` has to outlive ``edges``.
    cascade = (
        # Column lineage recorded for the repo's files.
        "DELETE FROM column_lineage WHERE file_id IN (SELECT file_id FROM files WHERE repo_id = ?)",
        # Column usage recorded for the repo's files.
        "DELETE FROM column_usage WHERE file_id IN (SELECT file_id FROM files WHERE repo_id = ?)",
        # Edges leaving the repo's nodes ...
        "DELETE FROM edges WHERE source_id IN "
        "(SELECT node_id FROM nodes WHERE file_id IN "
        "(SELECT file_id FROM files WHERE repo_id = ?))",
        # ... and edges arriving at them.
        "DELETE FROM edges WHERE target_id IN "
        "(SELECT node_id FROM nodes WHERE file_id IN "
        "(SELECT file_id FROM files WHERE repo_id = ?))",
        "DELETE FROM nodes WHERE file_id IN (SELECT file_id FROM files WHERE repo_id = ?)",
        "DELETE FROM files WHERE repo_id = ?",
        "DELETE FROM repos WHERE repo_id = ?",
    )
    for statement in cascade:
        self._execute_write(statement, [repo_id])
|
|
367
|
+
|
|
368
|
+
# ── File management ──
|
|
369
|
+
|
|
370
|
+
def get_file_checksums(self, repo_id: int) -> dict[str, str]:
    """Map every indexed file path of a repo to its stored checksum.

    Args:
        repo_id: The repo whose files are listed.

    Returns:
        ``{path: checksum}``; empty when the repo has no files.
    """
    cursor = self._execute_read("SELECT path, checksum FROM files WHERE repo_id = ?", [repo_id])
    # Each row is a (path, checksum) pair, which dict() consumes directly.
    return dict(cursor.fetchall())
|
|
374
|
+
|
|
375
|
+
def delete_file_data(self, repo_id: int, path: str) -> None:
    """Remove everything indexed for one file, keeping cross-file edges.

    Lineage, column usage, edges, nodes and the file record are deleted.
    A node that is still the target of an edge coming from a node outside
    this file is not deleted; it is turned into a phantom
    (``file_id = NULL``) so that the edge survives an incremental
    reindex. ``cleanup_phantoms()`` can later merge such phantoms with
    the re-inserted real nodes.

    Runs inside ``write_transaction()``, which joins an already open
    transaction of the calling thread instead of starting a new one.

    Args:
        repo_id: Repo that owns the file.
        path: File path as stored in the ``files`` table.
    """
    with self.write_transaction():
        self._delete_file_data_impl(repo_id, path)
|
|
387
|
+
|
|
388
|
+
def _delete_file_data_impl(self, repo_id: int, path: str) -> None:
    """Delete one file's rows, converting still-referenced nodes to phantoms.

    Takes no lock and opens no transaction itself; the caller must
    already hold ``_write_lock`` (via ``write_transaction``). Does
    nothing when the file is not in the index.

    Args:
        repo_id: Repo that owns the file.
        path: File path as stored in the ``files`` table.
    """
    lookup = self._execute_write(
        "SELECT file_id FROM files WHERE repo_id = ? AND path = ?",
        [repo_id, path],
    ).fetchone()
    if not lookup:
        return
    file_id = lookup[0]

    self._execute_write("DELETE FROM column_lineage WHERE file_id = ?", [file_id])
    self._execute_write("DELETE FROM column_usage WHERE file_id = ?", [file_id])

    # Outbound edges (source node lives in this file) go first. After
    # this, any edge still pointing at one of the file's nodes must come
    # from a node outside the file.
    self._execute_write(
        "DELETE FROM edges WHERE source_id IN (SELECT node_id FROM nodes WHERE file_id = ?)",
        [file_id],
    )

    # Nodes of this file that are targeted from elsewhere (another file,
    # or a phantom source) must survive so those edges keep a target.
    referenced = self._execute_write(
        "SELECT DISTINCT n.node_id FROM nodes n "
        "JOIN edges e ON e.target_id = n.node_id "
        "JOIN nodes src ON e.source_id = src.node_id "
        "WHERE n.file_id = ? AND (src.file_id IS NULL OR src.file_id != ?)",
        [file_id, file_id],
    ).fetchall()

    if referenced:
        # Detach them from the file: they become phantoms and are
        # therefore skipped by the file_id-based deletes below.
        phantom_ids = [record[0] for record in referenced]
        placeholders = ",".join("?" * len(phantom_ids))
        self._execute_write(
            f"UPDATE nodes SET file_id = NULL, line_start = NULL, line_end = NULL "
            f"WHERE node_id IN ({placeholders})",
            phantom_ids,
        )

    # Inbound edges of the nodes that are still attached to the file.
    self._execute_write(
        "DELETE FROM edges WHERE target_id IN (SELECT node_id FROM nodes WHERE file_id = ?)",
        [file_id],
    )

    # Finally the remaining nodes and the file record itself.
    self._execute_write("DELETE FROM nodes WHERE file_id = ?", [file_id])
    self._execute_write("DELETE FROM files WHERE file_id = ?", [file_id])
|
|
437
|
+
|
|
438
|
+
def insert_file(self, repo_id: int, path: str, language: str, checksum: str) -> int:
    """Add a row to ``files`` and return its id.

    Args:
        repo_id: Owning repo.
        path: Relative file path within the repo.
        language: Language identifier (e.g. ``"sql"``).
        checksum: SHA-256 hex digest of the file content.

    Returns:
        The newly assigned ``file_id``.
    """
    statement = "INSERT INTO files (repo_id, path, language, checksum) VALUES (?, ?, ?, ?) RETURNING file_id"
    with self._write_lock:
        created = self._execute_write(statement, [repo_id, path, language, checksum]).fetchone()
    return created[0]
|
|
456
|
+
|
|
457
|
+
# ── Node management ──
|
|
458
|
+
|
|
459
|
+
def insert_node(
|
|
460
|
+
self,
|
|
461
|
+
file_id: int | None,
|
|
462
|
+
kind: str,
|
|
463
|
+
name: str,
|
|
464
|
+
language: str,
|
|
465
|
+
line_start: int | None = None,
|
|
466
|
+
line_end: int | None = None,
|
|
467
|
+
metadata: dict | None = None,
|
|
468
|
+
schema: str | None = None,
|
|
469
|
+
) -> int:
|
|
470
|
+
"""Insert a single node.
|
|
471
|
+
|
|
472
|
+
Args:
|
|
473
|
+
file_id: Owning file, or ``None`` for phantom nodes.
|
|
474
|
+
kind: Node kind (e.g. ``"table"``, ``"view"``, ``"cte"``).
|
|
475
|
+
name: Unqualified entity name.
|
|
476
|
+
language: Language identifier.
|
|
477
|
+
line_start: First source line, if known.
|
|
478
|
+
line_end: Last source line, if known.
|
|
479
|
+
metadata: Arbitrary JSON-serialisable metadata.
|
|
480
|
+
schema: Database schema qualifier (e.g. ``"staging"``).
|
|
481
|
+
|
|
482
|
+
Returns:
|
|
483
|
+
The newly assigned ``node_id``.
|
|
484
|
+
"""
|
|
485
|
+
with self._write_lock:
|
|
486
|
+
result = self._execute_write(
|
|
487
|
+
"INSERT INTO nodes (file_id, kind, name, language, "
|
|
488
|
+
"line_start, line_end, metadata, schema) "
|
|
489
|
+
"VALUES (?, ?, ?, ?, ?, ?, ?, ?) RETURNING node_id",
|
|
490
|
+
[
|
|
491
|
+
file_id,
|
|
492
|
+
kind,
|
|
493
|
+
name,
|
|
494
|
+
language,
|
|
495
|
+
line_start,
|
|
496
|
+
line_end,
|
|
497
|
+
json.dumps(metadata) if metadata else None,
|
|
498
|
+
schema,
|
|
499
|
+
],
|
|
500
|
+
).fetchone()
|
|
501
|
+
return result[0]
|
|
502
|
+
|
|
503
|
+
def resolve_node(
|
|
504
|
+
self,
|
|
505
|
+
name: str,
|
|
506
|
+
kind: str,
|
|
507
|
+
repo_id: int | None = None,
|
|
508
|
+
schema: str | None = None,
|
|
509
|
+
) -> int | None:
|
|
510
|
+
"""Find a node by name and kind.
|
|
511
|
+
|
|
512
|
+
Matches on short name (e.g. ``"orders"``) which covers both
|
|
513
|
+
unqualified references and qualified ones (stored as short name
|
|
514
|
+
plus schema column). Search order: same repo first, then cross-repo.
|
|
515
|
+
|
|
516
|
+
Args:
|
|
517
|
+
name: Unqualified entity name.
|
|
518
|
+
kind: Node kind to match.
|
|
519
|
+
repo_id: Prefer nodes from this repo. Falls back to cross-repo
|
|
520
|
+
search if not found.
|
|
521
|
+
schema: Optional schema qualifier. When provided, only nodes
|
|
522
|
+
with a matching ``schema`` column are returned.
|
|
523
|
+
|
|
524
|
+
Returns:
|
|
525
|
+
The ``node_id`` if found, otherwise ``None``.
|
|
526
|
+
"""
|
|
527
|
+
schema_clause = ""
|
|
528
|
+
schema_params: list = []
|
|
529
|
+
if schema is not None:
|
|
530
|
+
schema_clause = " AND n.schema = ?"
|
|
531
|
+
schema_params = [schema]
|
|
532
|
+
|
|
533
|
+
if repo_id:
|
|
534
|
+
row = self._execute_read(
|
|
535
|
+
"SELECT n.node_id FROM nodes n "
|
|
536
|
+
"JOIN files f ON n.file_id = f.file_id "
|
|
537
|
+
"WHERE n.name = ? AND n.kind = ? AND f.repo_id = ?" + schema_clause + " LIMIT 1",
|
|
538
|
+
[name, kind, repo_id] + schema_params,
|
|
539
|
+
).fetchone()
|
|
540
|
+
if row:
|
|
541
|
+
return row[0]
|
|
542
|
+
|
|
543
|
+
# Cross-repo search (use alias so schema_clause referencing 'n.' works)
|
|
544
|
+
row = self._execute_read(
|
|
545
|
+
"SELECT n.node_id FROM nodes n WHERE n.name = ? AND n.kind = ?" + schema_clause + " LIMIT 1",
|
|
546
|
+
[name, kind] + schema_params,
|
|
547
|
+
).fetchone()
|
|
548
|
+
return row[0] if row else None
|
|
549
|
+
|
|
550
|
+
def get_or_create_phantom(self, name: str, kind: str, language: str) -> int:
    """Return the id of the phantom node for *name*/*kind*, creating it if absent.

    A phantom is a node without an owning file (``file_id IS NULL``).
    The lookup and the insert happen under the write lock, so two
    threads cannot both create the same phantom.

    Args:
        name: Unqualified entity name.
        kind: Node kind.
        language: Language identifier stored on a newly created phantom.

    Returns:
        The ``node_id`` of the existing or new phantom node.
    """
    with self._write_lock:
        existing = self._execute_write(
            "SELECT node_id FROM nodes WHERE name = ? AND kind = ? AND file_id IS NULL LIMIT 1",
            [name, kind],
        ).fetchone()
        if not existing:
            # insert_node takes _write_lock again; the RLock allows that.
            return self.insert_node(file_id=None, kind=kind, name=name, language=language)
        return existing[0]
|
|
561
|
+
|
|
562
|
+
def cleanup_phantoms(self) -> int:
    """Merge phantom nodes into their real counterparts and drop leftovers.

    A phantom node (``file_id IS NULL``) is replaceable when a real node
    with the same name, kind and schema exists. Edges that reference the
    phantom as source or target are repointed to the real node, then the
    phantom is deleted. When several real nodes match one phantom, the
    one with the smallest ``node_id`` is chosen, so every phantom is
    mapped and counted exactly once.

    Leftovers are handled as follows:

    * If at least one phantom was merged, phantoms without any edge are
      deleted afterwards.
    * If nothing could be merged, phantoms without any edge are deleted,
      as are phantoms whose inbound edges all come from other phantoms
      (together with the edges touching them).

    Returns:
        The number of phantom nodes removed.
    """
    # Phantoms that no edge references, neither as source nor as target.
    orphan_sql = (
        "SELECT node_id FROM nodes "
        "WHERE file_id IS NULL "
        "AND node_id NOT IN (SELECT source_id FROM edges) "
        "AND node_id NOT IN (SELECT target_id FROM edges)"
    )

    with self._write_lock:
        # One row per phantom that has a real counterpart. MIN + GROUP BY
        # collapses multiple matching real nodes to a single, repeatable
        # choice; without it the join yields one row per match.
        phantoms = self._execute_write(
            "SELECT p.node_id AS phantom_id, MIN(r.node_id) AS real_id "
            "FROM nodes p "
            "JOIN nodes r ON p.name = r.name AND p.kind = r.kind "
            "AND COALESCE(p.schema, '') = COALESCE(r.schema, '') "
            "WHERE p.file_id IS NULL AND r.file_id IS NOT NULL "
            "GROUP BY p.node_id"
        ).fetchall()

        if not phantoms:
            orphaned = self._execute_write(orphan_sql).fetchall()
            # Stale phantoms: they are targeted by edges, but none of
            # those edges starts at a real node.
            stale = self._execute_write(
                "SELECT p.node_id FROM nodes p "
                "WHERE p.file_id IS NULL "
                "AND p.node_id IN (SELECT target_id FROM edges) "
                "AND NOT EXISTS ("
                " SELECT 1 FROM edges e "
                " JOIN nodes src ON e.source_id = src.node_id "
                " WHERE e.target_id = p.node_id AND src.file_id IS NOT NULL"
                ")"
            ).fetchall()
            to_delete = {row[0] for row in orphaned} | {row[0] for row in stale}
            if not to_delete:
                return 0
            delete_ids = list(to_delete)
            placeholders = ",".join(["?"] * len(delete_ids))
            # Edges first, so no edge is left pointing at a deleted node.
            self._execute_write(
                f"DELETE FROM edges WHERE source_id IN ({placeholders}) OR target_id IN ({placeholders})",
                delete_ids + delete_ids,
            )
            self._execute_write(
                f"DELETE FROM nodes WHERE node_id IN ({placeholders})",
                delete_ids,
            )
            return len(to_delete)

        # Repoint with one UPDATE per direction, driven by an inline
        # mapping table, instead of one UPDATE per phantom. The values are
        # node ids read from the database; int() guarantees that only
        # integer literals are ever formatted into the SQL text.
        mapping_values = ", ".join(
            f"({int(phantom_id)}, {int(real_id)})" for phantom_id, real_id in phantoms
        )
        self._execute_write(
            f"UPDATE edges SET source_id = m.real_id "
            f"FROM (VALUES {mapping_values}) AS m(phantom_id, real_id) "
            f"WHERE edges.source_id = m.phantom_id"
        )
        self._execute_write(
            f"UPDATE edges SET target_id = m.real_id "
            f"FROM (VALUES {mapping_values}) AS m(phantom_id, real_id) "
            f"WHERE edges.target_id = m.phantom_id"
        )

        # The merged phantoms are no longer referenced by any edge.
        phantom_ids = [pair[0] for pair in phantoms]
        placeholders = ",".join(["?"] * len(phantom_ids))
        self._execute_write(
            f"DELETE FROM nodes WHERE node_id IN ({placeholders})",
            phantom_ids,
        )

        # Remaining phantoms with no edges at all.
        orphaned = self._execute_write(orphan_sql).fetchall()
        if orphaned:
            orphan_ids = [row[0] for row in orphaned]
            placeholders = ",".join(["?"] * len(orphan_ids))
            self._execute_write(
                f"DELETE FROM nodes WHERE node_id IN ({placeholders})",
                orphan_ids,
            )

        return len(phantoms) + len(orphaned)
|
|
656
|
+
|
|
657
|
+
# ── Edge management ──
|
|
658
|
+
|
|
659
|
+
def insert_edge(
|
|
660
|
+
self,
|
|
661
|
+
source_id: int,
|
|
662
|
+
target_id: int,
|
|
663
|
+
relationship: str,
|
|
664
|
+
context: str | None = None,
|
|
665
|
+
metadata: dict | None = None,
|
|
666
|
+
) -> int:
|
|
667
|
+
"""Insert an edge. Returns edge_id."""
|
|
668
|
+
with self._write_lock:
|
|
669
|
+
result = self._execute_write(
|
|
670
|
+
"INSERT INTO edges (source_id, target_id, relationship, context, metadata) "
|
|
671
|
+
"VALUES (?, ?, ?, ?, ?) RETURNING edge_id",
|
|
672
|
+
[
|
|
673
|
+
source_id,
|
|
674
|
+
target_id,
|
|
675
|
+
relationship,
|
|
676
|
+
context,
|
|
677
|
+
json.dumps(metadata) if metadata else None,
|
|
678
|
+
],
|
|
679
|
+
).fetchone()
|
|
680
|
+
return result[0]
|
|
681
|
+
|
|
682
|
+
# ── Batch inserts ──
|
|
683
|
+
|
|
684
|
+
def insert_nodes_batch(
    self,
    rows: list[tuple],
) -> list[int]:
    """Insert many nodes using multi-row ``INSERT`` statements.

    The rows are sent in chunks of 200 so that a single statement never
    carries more than 1600 bound parameters.

    Args:
        rows: List of tuples, each containing
            ``(file_id, kind, name, language, line_start, line_end, metadata_json, schema)``.

    Returns:
        The ``node_id`` values returned by the inserts, chunk by chunk;
        an empty list when *rows* is empty.
    """
    if not rows:
        return []

    batch = 200
    row_template = "(?, ?, ?, ?, ?, ?, ?, ?)"
    new_ids = []
    with self._write_lock:
        for start in range(0, len(rows), batch):
            part = rows[start : start + batch]
            values_sql = ", ".join([row_template] * len(part))
            # Flatten the chunk into one positional parameter list.
            bound = []
            for record in part:
                bound.extend(record)
            inserted = self.conn.execute(
                "INSERT INTO nodes (file_id, kind, name, language, "
                "line_start, line_end, metadata, schema) "
                f"VALUES {values_sql} RETURNING node_id",
                bound,
            ).fetchall()
            new_ids.extend(entry[0] for entry in inserted)
    return new_ids
|
|
714
|
+
|
|
715
|
+
def insert_edges_batch(self, rows: list[tuple]) -> None:
    """Insert many edges with a single ``executemany`` call.

    Does nothing when *rows* is empty.

    Args:
        rows: List of tuples, each containing
            ``(source_id, target_id, relationship, context, metadata_json)``.
    """
    if rows:
        insert_sql = "INSERT INTO edges (source_id, target_id, relationship, context, metadata) VALUES (?, ?, ?, ?, ?)"
        with self._write_lock:
            self.conn.executemany(insert_sql, rows)
|
|
729
|
+
|
|
730
|
+
def insert_column_usage_batch(self, rows: list[tuple]) -> None:
    """Insert many column-usage rows with a single ``executemany`` call.

    Does nothing when ``rows`` is empty; otherwise the write happens while
    holding ``_write_lock``.

    Args:
        rows: List of tuples, each containing
            ``(node_id, table_name, column_name, usage_type, file_id, alias, transform)``.
    """
    if rows:
        statement = (
            "INSERT INTO column_usage (node_id, table_name, column_name, "
            "usage_type, file_id, alias, transform) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)"
        )
        with self._write_lock:
            self.conn.executemany(statement, rows)
|
|
746
|
+
|
|
747
|
+
def insert_column_lineage_batch(self, rows: list[tuple]) -> None:
    """Insert many column-lineage hop rows with a single ``executemany`` call.

    Does nothing when ``rows`` is empty; otherwise the write happens while
    holding ``_write_lock``.

    Args:
        rows: List of tuples, each containing
            ``(file_id, output_node, output_column, chain_index,
            hop_index, hop_column, hop_table, hop_expression)``.
    """
    if rows:
        statement = (
            "INSERT INTO column_lineage "
            "(file_id, output_node, output_column, chain_index, "
            "hop_index, hop_column, hop_table, hop_expression) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)"
        )
        with self._write_lock:
            self.conn.executemany(statement, rows)
|
|
765
|
+
|
|
766
|
+
# ── Column usage ──
|
|
767
|
+
|
|
768
|
+
def insert_column_usage(
|
|
769
|
+
self,
|
|
770
|
+
node_id: int,
|
|
771
|
+
table_name: str,
|
|
772
|
+
column_name: str,
|
|
773
|
+
usage_type: str,
|
|
774
|
+
file_id: int,
|
|
775
|
+
alias: str | None = None,
|
|
776
|
+
transform: str | None = None,
|
|
777
|
+
) -> None:
|
|
778
|
+
"""Insert a column usage record."""
|
|
779
|
+
with self._write_lock:
|
|
780
|
+
self._execute_write(
|
|
781
|
+
"INSERT INTO column_usage (node_id, table_name, column_name, "
|
|
782
|
+
"usage_type, file_id, alias, transform) "
|
|
783
|
+
"VALUES (?, ?, ?, ?, ?, ?, ?)",
|
|
784
|
+
[node_id, table_name, column_name, usage_type, file_id, alias, transform],
|
|
785
|
+
)
|
|
786
|
+
|
|
787
|
+
# ── Column lineage ──
|
|
788
|
+
|
|
789
|
+
def insert_column_lineage(
|
|
790
|
+
self,
|
|
791
|
+
file_id: int,
|
|
792
|
+
output_node: str,
|
|
793
|
+
output_column: str,
|
|
794
|
+
hop_index: int,
|
|
795
|
+
hop_column: str,
|
|
796
|
+
hop_table: str,
|
|
797
|
+
hop_expression: str | None = None,
|
|
798
|
+
chain_index: int = 0,
|
|
799
|
+
) -> None:
|
|
800
|
+
"""Insert a single hop in a column lineage chain."""
|
|
801
|
+
with self._write_lock:
|
|
802
|
+
self._execute_write(
|
|
803
|
+
"INSERT INTO column_lineage "
|
|
804
|
+
"(file_id, output_node, output_column, chain_index, "
|
|
805
|
+
"hop_index, hop_column, hop_table, hop_expression) "
|
|
806
|
+
"VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
|
|
807
|
+
[
|
|
808
|
+
file_id,
|
|
809
|
+
output_node,
|
|
810
|
+
output_column,
|
|
811
|
+
chain_index,
|
|
812
|
+
hop_index,
|
|
813
|
+
hop_column,
|
|
814
|
+
hop_table,
|
|
815
|
+
hop_expression,
|
|
816
|
+
],
|
|
817
|
+
)
|
|
818
|
+
|
|
819
|
+
def query_column_lineage(
|
|
820
|
+
self,
|
|
821
|
+
table: str | None = None,
|
|
822
|
+
column: str | None = None,
|
|
823
|
+
output_node: str | None = None,
|
|
824
|
+
repo: str | None = None,
|
|
825
|
+
limit: int = 100,
|
|
826
|
+
offset: int = 0,
|
|
827
|
+
) -> dict:
|
|
828
|
+
"""Query column lineage chains.
|
|
829
|
+
|
|
830
|
+
Can search by:
|
|
831
|
+
|
|
832
|
+
- ``output_node`` + ``column``: "where does this output column come from?"
|
|
833
|
+
- ``table`` + ``column`` at any hop: "where does this source column flow to?"
|
|
834
|
+
|
|
835
|
+
``limit`` applies to chain count (distinct
|
|
836
|
+
``output_node``/``output_column``/``chain_index`` combinations),
|
|
837
|
+
not raw hop rows.
|
|
838
|
+
|
|
839
|
+
Args:
|
|
840
|
+
table: Filter by hop table name.
|
|
841
|
+
column: Filter by column name (output or any hop, depending
|
|
842
|
+
on whether ``output_node`` is also set).
|
|
843
|
+
output_node: Filter by the node that produces the output column.
|
|
844
|
+
repo: Filter by repo name.
|
|
845
|
+
limit: Maximum number of lineage chains to return.
|
|
846
|
+
offset: Pagination offset (in chains).
|
|
847
|
+
|
|
848
|
+
Returns:
|
|
849
|
+
Dict with keys ``"chains"`` (list of chain dicts, each with
|
|
850
|
+
``output_node``, ``output_column``, ``chain_index``, ``hops``,
|
|
851
|
+
``file``, and ``repo``) and ``"total_count"`` (int).
|
|
852
|
+
"""
|
|
853
|
+
# Build WHERE clauses for both outer (cl) and inner (cl2) aliases
|
|
854
|
+
outer_where: list[str] = []
|
|
855
|
+
inner_where: list[str] = []
|
|
856
|
+
params: list = []
|
|
857
|
+
|
|
858
|
+
if output_node:
|
|
859
|
+
outer_where.append("cl.output_node = ?")
|
|
860
|
+
inner_where.append("cl2.output_node = ?")
|
|
861
|
+
params.append(output_node)
|
|
862
|
+
if column:
|
|
863
|
+
if output_node:
|
|
864
|
+
outer_where.append("cl.output_column = ?")
|
|
865
|
+
inner_where.append("cl2.output_column = ?")
|
|
866
|
+
params.append(column)
|
|
867
|
+
else:
|
|
868
|
+
outer_where.append("(cl.output_column = ? OR cl.hop_column = ?)")
|
|
869
|
+
inner_where.append("(cl2.output_column = ? OR cl2.hop_column = ?)")
|
|
870
|
+
params.extend([column, column])
|
|
871
|
+
if table:
|
|
872
|
+
outer_where.append("cl.hop_table = ?")
|
|
873
|
+
inner_where.append("cl2.hop_table = ?")
|
|
874
|
+
params.append(table)
|
|
875
|
+
if repo:
|
|
876
|
+
outer_where.append("r.name = ?")
|
|
877
|
+
inner_where.append("r2.name = ?")
|
|
878
|
+
params.append(repo)
|
|
879
|
+
|
|
880
|
+
if not outer_where:
|
|
881
|
+
return {"chains": [], "total_count": 0}
|
|
882
|
+
|
|
883
|
+
# True total count of matching chains (before pagination)
|
|
884
|
+
count_sql = (
|
|
885
|
+
"SELECT COUNT(*) FROM ("
|
|
886
|
+
" SELECT DISTINCT cl2.output_node, cl2.output_column, cl2.chain_index "
|
|
887
|
+
" FROM column_lineage cl2 "
|
|
888
|
+
" JOIN files f2 ON cl2.file_id = f2.file_id "
|
|
889
|
+
" JOIN repos r2 ON f2.repo_id = r2.repo_id "
|
|
890
|
+
f" WHERE {' AND '.join(inner_where)} "
|
|
891
|
+
")"
|
|
892
|
+
)
|
|
893
|
+
total_count = self._execute_read(count_sql, params).fetchone()[0]
|
|
894
|
+
|
|
895
|
+
# Subquery selects distinct chains with LIMIT, then outer query
|
|
896
|
+
# fetches all hops for those chains. This ensures LIMIT counts chains,
|
|
897
|
+
# not individual hop rows.
|
|
898
|
+
sql = (
|
|
899
|
+
"SELECT cl.output_node, cl.output_column, cl.chain_index, cl.hop_index, "
|
|
900
|
+
"cl.hop_column, cl.hop_table, cl.hop_expression, "
|
|
901
|
+
"f.path, r.name as repo_name "
|
|
902
|
+
"FROM column_lineage cl "
|
|
903
|
+
"JOIN files f ON cl.file_id = f.file_id "
|
|
904
|
+
"JOIN repos r ON f.repo_id = r.repo_id "
|
|
905
|
+
"WHERE (cl.output_node, cl.output_column, cl.chain_index) IN ("
|
|
906
|
+
" SELECT DISTINCT cl2.output_node, cl2.output_column, cl2.chain_index "
|
|
907
|
+
" FROM column_lineage cl2 "
|
|
908
|
+
" JOIN files f2 ON cl2.file_id = f2.file_id "
|
|
909
|
+
" JOIN repos r2 ON f2.repo_id = r2.repo_id "
|
|
910
|
+
f" WHERE {' AND '.join(inner_where)} "
|
|
911
|
+
" ORDER BY cl2.output_node, cl2.output_column, cl2.chain_index "
|
|
912
|
+
" LIMIT ? OFFSET ?"
|
|
913
|
+
") "
|
|
914
|
+
"ORDER BY cl.output_node, cl.output_column, cl.chain_index, cl.hop_index"
|
|
915
|
+
)
|
|
916
|
+
|
|
917
|
+
# params duplicated: once for inner subquery, once not needed for outer
|
|
918
|
+
# (outer filters via the IN subquery)
|
|
919
|
+
rows = self._execute_read(sql, params + [limit, offset]).fetchall()
|
|
920
|
+
|
|
921
|
+
# Group by (output_node, output_column, chain_index) into chains
|
|
922
|
+
chains: dict[tuple[str, str, int], list] = {}
|
|
923
|
+
for r in rows:
|
|
924
|
+
key = (r[0], r[1], r[2])
|
|
925
|
+
if key not in chains:
|
|
926
|
+
chains[key] = {
|
|
927
|
+
"output_node": r[0],
|
|
928
|
+
"output_column": r[1],
|
|
929
|
+
"chain_index": r[2],
|
|
930
|
+
"hops": [],
|
|
931
|
+
"file": r[7],
|
|
932
|
+
"repo": r[8],
|
|
933
|
+
}
|
|
934
|
+
chains[key]["hops"].append(
|
|
935
|
+
{
|
|
936
|
+
"index": r[3],
|
|
937
|
+
"column": r[4],
|
|
938
|
+
"table": r[5],
|
|
939
|
+
"expression": r[6],
|
|
940
|
+
}
|
|
941
|
+
)
|
|
942
|
+
|
|
943
|
+
return {"chains": list(chains.values()), "total_count": total_count}
|
|
944
|
+
|
|
945
|
+
# ── Schema catalog ──
|
|
946
|
+
|
|
947
|
+
def get_table_columns(self, repo_id: int | None = None) -> dict[str, dict[str, str]]:
|
|
948
|
+
"""Build a schema catalog from indexed column usage.
|
|
949
|
+
|
|
950
|
+
Suitable for passing to ``sqlglot.optimizer.qualify_columns`` or
|
|
951
|
+
``sqlglot.lineage``. All types are set to ``"TEXT"`` since the
|
|
952
|
+
indexer does not track actual column types.
|
|
953
|
+
|
|
954
|
+
Args:
|
|
955
|
+
repo_id: Restrict to columns from this repo. ``None`` returns
|
|
956
|
+
columns across all repos.
|
|
957
|
+
|
|
958
|
+
Returns:
|
|
959
|
+
``{table_name: {column_name: "TEXT", ...}}`` mapping.
|
|
960
|
+
"""
|
|
961
|
+
if repo_id:
|
|
962
|
+
rows = self._execute_read(
|
|
963
|
+
"SELECT DISTINCT cu.table_name, cu.column_name "
|
|
964
|
+
"FROM column_usage cu "
|
|
965
|
+
"JOIN files f ON cu.file_id = f.file_id "
|
|
966
|
+
"WHERE f.repo_id = ? AND cu.column_name != '*'",
|
|
967
|
+
[repo_id],
|
|
968
|
+
).fetchall()
|
|
969
|
+
else:
|
|
970
|
+
rows = self._execute_read(
|
|
971
|
+
"SELECT DISTINCT table_name, column_name FROM column_usage WHERE column_name != '*'"
|
|
972
|
+
).fetchall()
|
|
973
|
+
|
|
974
|
+
schema: dict[str, dict[str, str]] = {}
|
|
975
|
+
for table, col in rows:
|
|
976
|
+
if table not in schema:
|
|
977
|
+
schema[table] = {}
|
|
978
|
+
schema[table][col] = "TEXT"
|
|
979
|
+
return schema
|
|
980
|
+
|
|
981
|
+
# ── Snippet helper ──
|
|
982
|
+
|
|
983
|
+
def _read_snippet(
|
|
984
|
+
self,
|
|
985
|
+
repo_name: str | None,
|
|
986
|
+
file_path: str | None,
|
|
987
|
+
line_start: int | None,
|
|
988
|
+
line_end: int | None,
|
|
989
|
+
context_lines: int = 2,
|
|
990
|
+
max_lines: int = 20,
|
|
991
|
+
) -> str | None:
|
|
992
|
+
"""Read a code snippet from the source file.
|
|
993
|
+
|
|
994
|
+
Args:
|
|
995
|
+
repo_name: Repo name to look up the base path
|
|
996
|
+
file_path: Relative file path within the repo
|
|
997
|
+
line_start: First line of the entity
|
|
998
|
+
line_end: Last line of the entity
|
|
999
|
+
context_lines: Extra lines before/after to include
|
|
1000
|
+
max_lines: Cap total snippet length
|
|
1001
|
+
"""
|
|
1002
|
+
if file_path is None or line_start is None:
|
|
1003
|
+
return None
|
|
1004
|
+
|
|
1005
|
+
# Get repo base path
|
|
1006
|
+
if repo_name:
|
|
1007
|
+
row = self._execute_read("SELECT path FROM repos WHERE name = ?", [repo_name]).fetchone()
|
|
1008
|
+
if not row:
|
|
1009
|
+
return None
|
|
1010
|
+
base = Path(row[0])
|
|
1011
|
+
else:
|
|
1012
|
+
return None
|
|
1013
|
+
|
|
1014
|
+
full_path = base / file_path
|
|
1015
|
+
if not full_path.exists():
|
|
1016
|
+
return None
|
|
1017
|
+
|
|
1018
|
+
lines = _read_file_lines(str(full_path))
|
|
1019
|
+
if lines is None:
|
|
1020
|
+
return None
|
|
1021
|
+
|
|
1022
|
+
start = max(0, line_start - 1 - context_lines)
|
|
1023
|
+
end_line = line_end or line_start
|
|
1024
|
+
end = min(len(lines), end_line + context_lines)
|
|
1025
|
+
|
|
1026
|
+
# Cap to max_lines
|
|
1027
|
+
if end - start > max_lines:
|
|
1028
|
+
end = start + max_lines
|
|
1029
|
+
|
|
1030
|
+
snippet_lines = lines[start:end]
|
|
1031
|
+
# Add line numbers
|
|
1032
|
+
numbered = [f"{start + i + 1:4d} | {line}" for i, line in enumerate(snippet_lines)]
|
|
1033
|
+
return "\n".join(numbered)
|
|
1034
|
+
|
|
1035
|
+
# ── Query methods (used by MCP tools) ──
|
|
1036
|
+
|
|
1037
|
+
def query_references(
|
|
1038
|
+
self,
|
|
1039
|
+
name: str,
|
|
1040
|
+
kind: str | None = None,
|
|
1041
|
+
schema: str | None = None,
|
|
1042
|
+
repo: str | None = None,
|
|
1043
|
+
direction: str = "both",
|
|
1044
|
+
include_snippets: bool = True,
|
|
1045
|
+
limit: int = 100,
|
|
1046
|
+
offset: int = 0,
|
|
1047
|
+
) -> dict:
|
|
1048
|
+
"""Find all references to/from a named entity.
|
|
1049
|
+
|
|
1050
|
+
Args:
|
|
1051
|
+
name: Entity name to look up.
|
|
1052
|
+
kind: Optional node kind filter.
|
|
1053
|
+
schema: Optional database schema filter.
|
|
1054
|
+
repo: Optional repo name filter.
|
|
1055
|
+
direction: ``"both"``, ``"inbound"``, or ``"outbound"``.
|
|
1056
|
+
include_snippets: Attach source code snippets when ``True``.
|
|
1057
|
+
limit: Maximum edges per direction.
|
|
1058
|
+
offset: Pagination offset.
|
|
1059
|
+
|
|
1060
|
+
Returns:
|
|
1061
|
+
Dict with keys ``"entity"`` (list of matched node dicts or
|
|
1062
|
+
``None``), ``"inbound"`` (list of referencing entities), and
|
|
1063
|
+
``"outbound"`` (list of referenced entities). Each entry
|
|
1064
|
+
contains ``name``, ``kind``, ``relationship``, ``context``,
|
|
1065
|
+
``file``, ``repo``, ``line``, and optionally ``snippet``.
|
|
1066
|
+
"""
|
|
1067
|
+
# Find the target node(s)
|
|
1068
|
+
where_clauses = ["n.name = ?"]
|
|
1069
|
+
params: list = [name]
|
|
1070
|
+
if kind:
|
|
1071
|
+
where_clauses.append("n.kind = ?")
|
|
1072
|
+
params.append(kind)
|
|
1073
|
+
if schema:
|
|
1074
|
+
where_clauses.append("n.schema = ?")
|
|
1075
|
+
params.append(schema)
|
|
1076
|
+
|
|
1077
|
+
where_str = " AND ".join(where_clauses)
|
|
1078
|
+
node_query = f"SELECT n.node_id, n.kind, n.name FROM nodes n WHERE {where_str}"
|
|
1079
|
+
target_nodes = self._execute_read(node_query, params).fetchall()
|
|
1080
|
+
|
|
1081
|
+
if not target_nodes:
|
|
1082
|
+
return {"entity": None, "inbound": [], "outbound": []}
|
|
1083
|
+
|
|
1084
|
+
node_ids = [row[0] for row in target_nodes]
|
|
1085
|
+
placeholders = ",".join(["?"] * len(node_ids))
|
|
1086
|
+
|
|
1087
|
+
result = {
|
|
1088
|
+
"entity": [{"node_id": r[0], "kind": r[1], "name": r[2]} for r in target_nodes],
|
|
1089
|
+
"inbound": [],
|
|
1090
|
+
"outbound": [],
|
|
1091
|
+
}
|
|
1092
|
+
|
|
1093
|
+
if direction in ("both", "inbound"):
|
|
1094
|
+
inbound_sql = (
|
|
1095
|
+
f"SELECT n2.name, n2.kind, e.relationship, e.context, "
|
|
1096
|
+
f"f2.path, r2.name as repo_name, n2.line_start, n2.line_end "
|
|
1097
|
+
f"FROM edges e "
|
|
1098
|
+
f"JOIN nodes n2 ON e.source_id = n2.node_id "
|
|
1099
|
+
f"LEFT JOIN files f2 ON n2.file_id = f2.file_id "
|
|
1100
|
+
f"LEFT JOIN repos r2 ON f2.repo_id = r2.repo_id "
|
|
1101
|
+
f"WHERE e.target_id IN ({placeholders}) "
|
|
1102
|
+
f"LIMIT ? OFFSET ?"
|
|
1103
|
+
)
|
|
1104
|
+
for r in self._execute_read(inbound_sql, node_ids + [limit, offset]).fetchall():
|
|
1105
|
+
entry = {
|
|
1106
|
+
"name": r[0],
|
|
1107
|
+
"kind": r[1],
|
|
1108
|
+
"relationship": r[2],
|
|
1109
|
+
"context": r[3],
|
|
1110
|
+
"file": r[4],
|
|
1111
|
+
"repo": r[5],
|
|
1112
|
+
"line": r[6],
|
|
1113
|
+
}
|
|
1114
|
+
if include_snippets:
|
|
1115
|
+
snippet = self._read_snippet(r[5], r[4], r[6], r[7])
|
|
1116
|
+
if snippet:
|
|
1117
|
+
entry["snippet"] = snippet
|
|
1118
|
+
result["inbound"].append(entry)
|
|
1119
|
+
|
|
1120
|
+
if direction in ("both", "outbound"):
|
|
1121
|
+
outbound_sql = (
|
|
1122
|
+
f"SELECT n2.name, n2.kind, e.relationship, e.context, "
|
|
1123
|
+
f"f2.path, r2.name as repo_name, n2.line_start, n2.line_end "
|
|
1124
|
+
f"FROM edges e "
|
|
1125
|
+
f"JOIN nodes n2 ON e.target_id = n2.node_id "
|
|
1126
|
+
f"LEFT JOIN files f2 ON n2.file_id = f2.file_id "
|
|
1127
|
+
f"LEFT JOIN repos r2 ON f2.repo_id = r2.repo_id "
|
|
1128
|
+
f"WHERE e.source_id IN ({placeholders}) "
|
|
1129
|
+
f"LIMIT ? OFFSET ?"
|
|
1130
|
+
)
|
|
1131
|
+
for r in self._execute_read(outbound_sql, node_ids + [limit, offset]).fetchall():
|
|
1132
|
+
entry = {
|
|
1133
|
+
"name": r[0],
|
|
1134
|
+
"kind": r[1],
|
|
1135
|
+
"relationship": r[2],
|
|
1136
|
+
"context": r[3],
|
|
1137
|
+
"file": r[4],
|
|
1138
|
+
"repo": r[5],
|
|
1139
|
+
"line": r[6],
|
|
1140
|
+
}
|
|
1141
|
+
if include_snippets:
|
|
1142
|
+
snippet = self._read_snippet(r[5], r[4], r[6], r[7])
|
|
1143
|
+
if snippet:
|
|
1144
|
+
entry["snippet"] = snippet
|
|
1145
|
+
result["outbound"].append(entry)
|
|
1146
|
+
|
|
1147
|
+
return result
|
|
1148
|
+
|
|
1149
|
+
def query_column_usage(
|
|
1150
|
+
self,
|
|
1151
|
+
table: str,
|
|
1152
|
+
column: str | None = None,
|
|
1153
|
+
usage_type: str | None = None,
|
|
1154
|
+
repo: str | None = None,
|
|
1155
|
+
limit: int = 100,
|
|
1156
|
+
offset: int = 0,
|
|
1157
|
+
) -> dict:
|
|
1158
|
+
"""Find column usage records for a table.
|
|
1159
|
+
|
|
1160
|
+
Args:
|
|
1161
|
+
table: Table name to search for.
|
|
1162
|
+
column: Optional column name filter.
|
|
1163
|
+
usage_type: Optional usage type filter (e.g. ``"select"``, ``"where"``).
|
|
1164
|
+
repo: Optional repo name filter.
|
|
1165
|
+
limit: Maximum records to return.
|
|
1166
|
+
offset: Pagination offset.
|
|
1167
|
+
|
|
1168
|
+
Returns:
|
|
1169
|
+
Dict with keys ``"usage"`` (list of usage dicts with ``table``,
|
|
1170
|
+
``column``, ``usage_type``, ``alias``, ``node_name``,
|
|
1171
|
+
``node_kind``, ``file``, ``repo``, ``line``, and optionally
|
|
1172
|
+
``transform``), ``"summary"`` (dict mapping usage_type to count),
|
|
1173
|
+
and ``"total_count"`` (int).
|
|
1174
|
+
"""
|
|
1175
|
+
where = ["cu.table_name = ?"]
|
|
1176
|
+
params: list = [table]
|
|
1177
|
+
if column:
|
|
1178
|
+
where.append("cu.column_name = ?")
|
|
1179
|
+
params.append(column)
|
|
1180
|
+
if usage_type:
|
|
1181
|
+
where.append("cu.usage_type = ?")
|
|
1182
|
+
params.append(usage_type)
|
|
1183
|
+
|
|
1184
|
+
joins = (
|
|
1185
|
+
"JOIN nodes n ON cu.node_id = n.node_id "
|
|
1186
|
+
"JOIN files f ON cu.file_id = f.file_id "
|
|
1187
|
+
"JOIN repos r ON f.repo_id = r.repo_id"
|
|
1188
|
+
)
|
|
1189
|
+
if repo:
|
|
1190
|
+
where.append("r.name = ?")
|
|
1191
|
+
params.append(repo)
|
|
1192
|
+
|
|
1193
|
+
sql = (
|
|
1194
|
+
f"SELECT cu.table_name, cu.column_name, cu.usage_type, cu.alias, "
|
|
1195
|
+
f"n.name as node_name, n.kind as node_kind, f.path, r.name as repo_name, n.line_start, "
|
|
1196
|
+
f"cu.transform "
|
|
1197
|
+
f"FROM column_usage cu {joins} "
|
|
1198
|
+
f"WHERE {' AND '.join(where)} "
|
|
1199
|
+
f"ORDER BY cu.table_name, cu.column_name, cu.usage_type "
|
|
1200
|
+
f"LIMIT ? OFFSET ?"
|
|
1201
|
+
)
|
|
1202
|
+
|
|
1203
|
+
rows = self._execute_read(sql, params + [limit, offset]).fetchall()
|
|
1204
|
+
|
|
1205
|
+
usage = []
|
|
1206
|
+
for r in rows:
|
|
1207
|
+
entry = {
|
|
1208
|
+
"table": r[0],
|
|
1209
|
+
"column": r[1],
|
|
1210
|
+
"usage_type": r[2],
|
|
1211
|
+
"alias": r[3],
|
|
1212
|
+
"node_name": r[4],
|
|
1213
|
+
"node_kind": r[5],
|
|
1214
|
+
"file": r[6],
|
|
1215
|
+
"repo": r[7],
|
|
1216
|
+
"line": r[8],
|
|
1217
|
+
}
|
|
1218
|
+
if r[9]:
|
|
1219
|
+
entry["transform"] = r[9]
|
|
1220
|
+
usage.append(entry)
|
|
1221
|
+
|
|
1222
|
+
# True total count (before pagination)
|
|
1223
|
+
count_sql = f"SELECT COUNT(*) FROM column_usage cu {joins} WHERE {' AND '.join(where)}"
|
|
1224
|
+
total_count = self._execute_read(count_sql, params).fetchone()[0]
|
|
1225
|
+
|
|
1226
|
+
# Summary by usage_type
|
|
1227
|
+
summary: dict[str, int] = {}
|
|
1228
|
+
for u in usage:
|
|
1229
|
+
summary[u["usage_type"]] = summary.get(u["usage_type"], 0) + 1
|
|
1230
|
+
|
|
1231
|
+
return {"usage": usage, "summary": summary, "total_count": total_count}
|
|
1232
|
+
|
|
1233
|
+
def query_search(
|
|
1234
|
+
self,
|
|
1235
|
+
pattern: str,
|
|
1236
|
+
kind: str | None = None,
|
|
1237
|
+
language: str | None = None,
|
|
1238
|
+
schema: str | None = None,
|
|
1239
|
+
repo: str | None = None,
|
|
1240
|
+
limit: int = 20,
|
|
1241
|
+
offset: int = 0,
|
|
1242
|
+
include_snippets: bool = True,
|
|
1243
|
+
) -> dict:
|
|
1244
|
+
"""Search nodes by name pattern (case-insensitive ``ILIKE``).
|
|
1245
|
+
|
|
1246
|
+
Args:
|
|
1247
|
+
pattern: Substring to match against node names.
|
|
1248
|
+
kind: Filter by node kind (e.g. ``"table"``, ``"view"``).
|
|
1249
|
+
language: Filter by language (e.g. ``"sql"``).
|
|
1250
|
+
schema: Filter by database schema.
|
|
1251
|
+
repo: Filter by repo name.
|
|
1252
|
+
limit: Maximum number of matches to return.
|
|
1253
|
+
offset: Number of matches to skip (for pagination).
|
|
1254
|
+
include_snippets: If ``True``, attach source code snippets to results.
|
|
1255
|
+
|
|
1256
|
+
Returns:
|
|
1257
|
+
Dict with keys ``"matches"`` (list of match dicts with ``name``,
|
|
1258
|
+
``kind``, ``language``, ``file``, ``repo``, ``line_start``,
|
|
1259
|
+
``line_end``, and optionally ``snippet``) and ``"total_count"``
|
|
1260
|
+
(int, total matching nodes before pagination).
|
|
1261
|
+
"""
|
|
1262
|
+
escaped = pattern.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
|
|
1263
|
+
where = ["n.name ILIKE ? ESCAPE '\\'"]
|
|
1264
|
+
params: list = [f"%{escaped}%"]
|
|
1265
|
+
if kind:
|
|
1266
|
+
where.append("n.kind = ?")
|
|
1267
|
+
params.append(kind)
|
|
1268
|
+
if language:
|
|
1269
|
+
where.append("n.language = ?")
|
|
1270
|
+
params.append(language)
|
|
1271
|
+
if schema:
|
|
1272
|
+
where.append("n.schema = ?")
|
|
1273
|
+
params.append(schema)
|
|
1274
|
+
|
|
1275
|
+
joins = "LEFT JOIN files f ON n.file_id = f.file_id LEFT JOIN repos r ON f.repo_id = r.repo_id"
|
|
1276
|
+
if repo:
|
|
1277
|
+
where.append("r.name = ?")
|
|
1278
|
+
params.append(repo)
|
|
1279
|
+
|
|
1280
|
+
count_sql = f"SELECT COUNT(*) FROM nodes n {joins} WHERE {' AND '.join(where)}"
|
|
1281
|
+
total = self._execute_read(count_sql, params).fetchone()[0]
|
|
1282
|
+
|
|
1283
|
+
sql = (
|
|
1284
|
+
f"SELECT n.name, n.kind, n.language, f.path, r.name as repo_name, "
|
|
1285
|
+
f"n.line_start, n.line_end "
|
|
1286
|
+
f"FROM nodes n {joins} "
|
|
1287
|
+
f"WHERE {' AND '.join(where)} "
|
|
1288
|
+
f"ORDER BY n.name "
|
|
1289
|
+
f"LIMIT ? OFFSET ?"
|
|
1290
|
+
)
|
|
1291
|
+
rows = self._execute_read(sql, params + [limit, offset]).fetchall()
|
|
1292
|
+
|
|
1293
|
+
matches = []
|
|
1294
|
+
for r in rows:
|
|
1295
|
+
match = {
|
|
1296
|
+
"name": r[0],
|
|
1297
|
+
"kind": r[1],
|
|
1298
|
+
"language": r[2],
|
|
1299
|
+
"file": r[3],
|
|
1300
|
+
"repo": r[4],
|
|
1301
|
+
"line_start": r[5],
|
|
1302
|
+
"line_end": r[6],
|
|
1303
|
+
}
|
|
1304
|
+
if include_snippets:
|
|
1305
|
+
snippet = self._read_snippet(r[4], r[3], r[5], r[6])
|
|
1306
|
+
if snippet:
|
|
1307
|
+
match["snippet"] = snippet
|
|
1308
|
+
matches.append(match)
|
|
1309
|
+
|
|
1310
|
+
return {"matches": matches, "total_count": total}
|
|
1311
|
+
|
|
1312
|
+
def query_trace(
|
|
1313
|
+
self,
|
|
1314
|
+
name: str,
|
|
1315
|
+
kind: str | None = None,
|
|
1316
|
+
direction: str = "downstream",
|
|
1317
|
+
max_depth: int = 3,
|
|
1318
|
+
repo: str | None = None,
|
|
1319
|
+
include_snippets: bool = False,
|
|
1320
|
+
limit: int = 100,
|
|
1321
|
+
exclude_edges: set[tuple[str, str]] | None = None,
|
|
1322
|
+
) -> dict:
|
|
1323
|
+
"""Trace multi-hop dependency chains using a recursive CTE.
|
|
1324
|
+
|
|
1325
|
+
Args:
|
|
1326
|
+
name: Starting entity name.
|
|
1327
|
+
kind: Optional node kind filter for the starting node.
|
|
1328
|
+
direction: ``"downstream"``, ``"upstream"``, or ``"both"``.
|
|
1329
|
+
max_depth: Maximum hops to follow (capped at 10).
|
|
1330
|
+
repo: Optional repo name filter.
|
|
1331
|
+
include_snippets: Attach source code snippets when ``True``.
|
|
1332
|
+
limit: Maximum result rows.
|
|
1333
|
+
exclude_edges: Optional set of ``(source_name, target_name)``
|
|
1334
|
+
tuples. Any edge whose source and target names match a
|
|
1335
|
+
tuple in this set will be excluded from traversal. Used
|
|
1336
|
+
by PR-impact v2 to approximate a base-commit graph by
|
|
1337
|
+
removing newly-added edges from the HEAD graph.
|
|
1338
|
+
|
|
1339
|
+
Returns:
|
|
1340
|
+
Dict with keys ``"root"`` (starting node dict or ``None``),
|
|
1341
|
+
``"paths"`` (list of path-step dicts with ``name``, ``kind``,
|
|
1342
|
+
``language``, ``relationship``, ``context``, ``depth``,
|
|
1343
|
+
``file``, ``repo``, and optionally ``snippet``),
|
|
1344
|
+
``"depth_summary"`` (``{depth: count}``), and
|
|
1345
|
+
``"repos_affected"`` (sorted list of repo names). When
|
|
1346
|
+
``direction="both"``, paths are split into ``"downstream"``
|
|
1347
|
+
and ``"upstream"`` keys instead of a single ``"paths"``.
|
|
1348
|
+
"""
|
|
1349
|
+
max_depth = min(max_depth, 10)
|
|
1350
|
+
# Find starting node(s)
|
|
1351
|
+
where = ["name = ?"]
|
|
1352
|
+
params: list = [name]
|
|
1353
|
+
if kind:
|
|
1354
|
+
where.append("kind = ?")
|
|
1355
|
+
params.append(kind)
|
|
1356
|
+
|
|
1357
|
+
start_nodes = self._execute_read(
|
|
1358
|
+
f"SELECT node_id, name, kind FROM nodes WHERE {' AND '.join(where)}",
|
|
1359
|
+
params,
|
|
1360
|
+
).fetchall()
|
|
1361
|
+
|
|
1362
|
+
if not start_nodes:
|
|
1363
|
+
return {"root": None, "paths": [], "depth_summary": {}, "repos_affected": []}
|
|
1364
|
+
|
|
1365
|
+
start_id = start_nodes[0][0]
|
|
1366
|
+
|
|
1367
|
+
# Direction determines which side of the edge we follow
|
|
1368
|
+
if direction == "downstream":
|
|
1369
|
+
source_col, target_col = "source_id", "target_id"
|
|
1370
|
+
elif direction == "upstream":
|
|
1371
|
+
source_col, target_col = "target_id", "source_id"
|
|
1372
|
+
else:
|
|
1373
|
+
# Both — run downstream and upstream separately and merge
|
|
1374
|
+
down = self.query_trace(
|
|
1375
|
+
name,
|
|
1376
|
+
kind,
|
|
1377
|
+
"downstream",
|
|
1378
|
+
max_depth,
|
|
1379
|
+
repo,
|
|
1380
|
+
include_snippets,
|
|
1381
|
+
limit,
|
|
1382
|
+
exclude_edges,
|
|
1383
|
+
)
|
|
1384
|
+
up = self.query_trace(
|
|
1385
|
+
name,
|
|
1386
|
+
kind,
|
|
1387
|
+
"upstream",
|
|
1388
|
+
max_depth,
|
|
1389
|
+
repo,
|
|
1390
|
+
include_snippets,
|
|
1391
|
+
limit,
|
|
1392
|
+
exclude_edges,
|
|
1393
|
+
)
|
|
1394
|
+
return {
|
|
1395
|
+
"root": down["root"],
|
|
1396
|
+
"downstream": down["paths"],
|
|
1397
|
+
"upstream": up["paths"],
|
|
1398
|
+
"depth_summary": {
|
|
1399
|
+
depth: down["depth_summary"].get(depth, 0) + up["depth_summary"].get(depth, 0)
|
|
1400
|
+
for depth in set(down["depth_summary"]) | set(up["depth_summary"])
|
|
1401
|
+
},
|
|
1402
|
+
"repos_affected": list(set(down["repos_affected"] + up["repos_affected"])),
|
|
1403
|
+
}
|
|
1404
|
+
|
|
1405
|
+
# Pre-resolve exclude_edges name pairs to ID pairs
|
|
1406
|
+
exclude_clause = ""
|
|
1407
|
+
if exclude_edges:
|
|
1408
|
+
excluded_id_pairs: set[tuple[int, int]] = set()
|
|
1409
|
+
for src_name, tgt_name in exclude_edges:
|
|
1410
|
+
rows_ex = self._execute_read(
|
|
1411
|
+
"SELECT s.node_id, t.node_id FROM nodes s, nodes t WHERE s.name = ? AND t.name = ?",
|
|
1412
|
+
[src_name, tgt_name],
|
|
1413
|
+
).fetchall()
|
|
1414
|
+
for row_ex in rows_ex:
|
|
1415
|
+
excluded_id_pairs.add((row_ex[0], row_ex[1]))
|
|
1416
|
+
if excluded_id_pairs:
|
|
1417
|
+
pairs_sql = ", ".join(f"({s}, {t})" for s, t in excluded_id_pairs)
|
|
1418
|
+
exclude_clause = f"AND (e.source_id, e.target_id) NOT IN (VALUES {pairs_sql})"
|
|
1419
|
+
|
|
1420
|
+
recursive_sql = f"""
|
|
1421
|
+
WITH RECURSIVE trace AS (
|
|
1422
|
+
SELECT
|
|
1423
|
+
e.{target_col} as node_id,
|
|
1424
|
+
e.relationship,
|
|
1425
|
+
e.context,
|
|
1426
|
+
1 as depth,
|
|
1427
|
+
ARRAY[e.{source_col}] as path
|
|
1428
|
+
FROM edges e
|
|
1429
|
+
WHERE e.{source_col} = ?
|
|
1430
|
+
{exclude_clause}
|
|
1431
|
+
|
|
1432
|
+
UNION ALL
|
|
1433
|
+
|
|
1434
|
+
SELECT
|
|
1435
|
+
e.{target_col},
|
|
1436
|
+
e.relationship,
|
|
1437
|
+
e.context,
|
|
1438
|
+
t.depth + 1,
|
|
1439
|
+
array_append(t.path, e.{source_col})
|
|
1440
|
+
FROM edges e
|
|
1441
|
+
JOIN trace t ON e.{source_col} = t.node_id
|
|
1442
|
+
WHERE t.depth < ?
|
|
1443
|
+
AND NOT array_contains(t.path, e.{target_col})
|
|
1444
|
+
{exclude_clause}
|
|
1445
|
+
)
|
|
1446
|
+
SELECT DISTINCT
|
|
1447
|
+
t.node_id, t.relationship, t.context, t.depth,
|
|
1448
|
+
n.name, n.kind, n.language,
|
|
1449
|
+
f.path as file_path, r.name as repo_name,
|
|
1450
|
+
n.line_start, n.line_end
|
|
1451
|
+
FROM trace t
|
|
1452
|
+
JOIN nodes n ON t.node_id = n.node_id
|
|
1453
|
+
LEFT JOIN files f ON n.file_id = f.file_id
|
|
1454
|
+
LEFT JOIN repos r ON f.repo_id = r.repo_id
|
|
1455
|
+
ORDER BY t.depth, n.name
|
|
1456
|
+
LIMIT ?
|
|
1457
|
+
"""
|
|
1458
|
+
|
|
1459
|
+
rows = self._execute_read(recursive_sql, [start_id, max_depth, limit]).fetchall()
|
|
1460
|
+
|
|
1461
|
+
paths = []
|
|
1462
|
+
for r in rows:
|
|
1463
|
+
entry = {
|
|
1464
|
+
"name": r[4],
|
|
1465
|
+
"kind": r[5],
|
|
1466
|
+
"language": r[6],
|
|
1467
|
+
"relationship": r[1],
|
|
1468
|
+
"context": r[2],
|
|
1469
|
+
"depth": r[3],
|
|
1470
|
+
"file": r[7],
|
|
1471
|
+
"repo": r[8],
|
|
1472
|
+
}
|
|
1473
|
+
if include_snippets:
|
|
1474
|
+
snippet = self._read_snippet(r[8], r[7], r[9], r[10])
|
|
1475
|
+
if snippet:
|
|
1476
|
+
entry["snippet"] = snippet
|
|
1477
|
+
paths.append(entry)
|
|
1478
|
+
|
|
1479
|
+
depth_summary: dict[int, int] = {}
|
|
1480
|
+
repos_affected: set[str] = set()
|
|
1481
|
+
for p in paths:
|
|
1482
|
+
depth_summary[p["depth"]] = depth_summary.get(p["depth"], 0) + 1
|
|
1483
|
+
if p["repo"]:
|
|
1484
|
+
repos_affected.add(p["repo"])
|
|
1485
|
+
|
|
1486
|
+
return {
|
|
1487
|
+
"root": {"name": start_nodes[0][1], "kind": start_nodes[0][2]},
|
|
1488
|
+
"paths": paths,
|
|
1489
|
+
"depth_summary": depth_summary,
|
|
1490
|
+
"repos_affected": sorted(repos_affected),
|
|
1491
|
+
}
|
|
1492
|
+
|
|
1493
|
+
def get_index_status(self) -> dict:
    """Summarise what is currently stored in the index.

    Two read-only queries are issued through ``_execute_read``: one that
    aggregates per-repo file and node counts, and one that gathers
    database-wide totals in a single row.

    Returns:
        Dict with four keys:

        - ``"repos"``: one dict per row of ``repos`` with ``name``,
          ``path``, ``last_commit``, ``last_branch``, ``indexed_at``
          (stringified, or ``None`` when falsy), ``file_count`` and
          ``node_count``.
        - ``"totals"``: row counts for ``files``, ``nodes``, ``edges``
          and ``column_usage_records``, plus ``column_lineage_chains``
          (distinct ``output_node.output_column`` combinations in
          ``column_lineage``).
        - ``"phantom_nodes"``: number of nodes whose ``file_id`` is NULL.
        - ``"schema_version"``: the literal string ``"1.0"``.
    """
    per_repo_sql = (
        "SELECT r.name, r.path, r.last_commit, r.last_branch, r.indexed_at, "
        "COUNT(DISTINCT f.file_id) as file_count, "
        "COUNT(DISTINCT n.node_id) as node_count "
        "FROM repos r "
        "LEFT JOIN files f ON r.repo_id = f.repo_id "
        "LEFT JOIN nodes n ON f.file_id = n.file_id "
        "GROUP BY r.repo_id, r.name, r.path, r.last_commit, r.last_branch, r.indexed_at"
    )
    # Scalar sub-selects so that every total comes back in one row.
    totals_sql = (
        "SELECT "
        "(SELECT COUNT(*) FROM files), "
        "(SELECT COUNT(*) FROM nodes), "
        "(SELECT COUNT(*) FROM edges), "
        "(SELECT COUNT(*) FROM column_usage), "
        "(SELECT COUNT(*) FROM nodes WHERE file_id IS NULL), "
        "(SELECT COUNT(DISTINCT output_node || '.' || output_column) FROM column_lineage)"
    )

    repo_rows = self._execute_read(per_repo_sql).fetchall()
    totals_row = self._execute_read(totals_sql).fetchone()

    repo_summaries = []
    for (
        repo_name,
        repo_path,
        last_commit,
        last_branch,
        indexed_at,
        file_count,
        node_count,
    ) in repo_rows:
        repo_summaries.append(
            {
                "name": repo_name,
                "path": repo_path,
                "last_commit": last_commit,
                "last_branch": last_branch,
                "indexed_at": str(indexed_at) if indexed_at else None,
                "file_count": file_count,
                "node_count": node_count,
            }
        )

    # Column order mirrors the sub-selects in ``totals_sql``: the phantom
    # node count comes before the lineage count.
    (
        total_files,
        total_nodes,
        total_edges,
        total_column_usage,
        phantom_count,
        lineage_chain_count,
    ) = totals_row

    return {
        "repos": repo_summaries,
        "totals": {
            "files": total_files,
            "nodes": total_nodes,
            "edges": total_edges,
            "column_usage_records": total_column_usage,
            "column_lineage_chains": lineage_chain_count,
        },
        "phantom_nodes": phantom_count,
        "schema_version": "1.0",
    }
|