sql-code-graph 1.2.2__py3-none-any.whl → 1.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/METADATA +2 -4
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/RECORD +31 -30
- sqlcg/__init__.py +1 -1
- sqlcg/cli/commands/analyze.py +138 -127
- sqlcg/cli/commands/db.py +49 -51
- sqlcg/cli/commands/find.py +5 -9
- sqlcg/cli/commands/gain.py +14 -16
- sqlcg/cli/commands/git.py +11 -4
- sqlcg/cli/commands/index.py +173 -21
- sqlcg/cli/commands/mcp.py +70 -3
- sqlcg/cli/commands/reindex.py +147 -77
- sqlcg/cli/commands/uninstall.py +9 -20
- sqlcg/core/__init__.py +1 -3
- sqlcg/core/config.py +25 -81
- sqlcg/core/duckdb_backend.py +764 -0
- sqlcg/core/freshness.py +1 -1
- sqlcg/core/graph_db.py +20 -4
- sqlcg/core/queries.py +26 -7
- sqlcg/core/queries.sql +249 -0
- sqlcg/core/schema.py +1 -1
- sqlcg/indexer/indexer.py +27 -36
- sqlcg/metrics/store.py +49 -1
- sqlcg/server/control.py +1 -1
- sqlcg/server/noise_filter.py +1 -1
- sqlcg/server/read_client.py +2 -2
- sqlcg/server/server.py +184 -86
- sqlcg/server/skill.py +2 -2
- sqlcg/server/tools.py +119 -41
- sqlcg/server/writer.py +459 -0
- sqlcg/core/kuzu_backend.py +0 -445
- sqlcg/core/neo4j_backend.py +0 -233
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/WHEEL +0 -0
- {sql_code_graph-1.2.2.dist-info → sql_code_graph-1.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,764 @@
|
|
|
1
|
+
"""DuckDB implementation of GraphBackend.
|
|
2
|
+
|
|
3
|
+
Replaces KuzuDB with a relational model: one table per node label, one table
|
|
4
|
+
per edge type (mirroring core/schema.py). Lineage traversal uses recursive CTEs
|
|
5
|
+
with a cycle guard and bounded depth — proven equivalent to Kuzu on the real DWH
|
|
6
|
+
corpus (Phase 0, 2026-06-05).
|
|
7
|
+
|
|
8
|
+
Concurrency contract (DuckDB single-process MVCC):
|
|
9
|
+
- One R/W connection held for the process lifetime (no RO/RW escalation needed).
|
|
10
|
+
- DuckDB MVCC guarantees no torn reads: a read never observes a partially-applied
|
|
11
|
+
rebuild — it sees either the prior committed graph or the post-COMMIT graph.
|
|
12
|
+
- NOTE: in the server, reads and the write drain are serialized through a single
|
|
13
|
+
``backend_lock`` on the shared connection (server.py / writer.py), so a read
|
|
14
|
+
issued *during* a rebuild waits for that rebuild to finish rather than being
|
|
15
|
+
served the old snapshot concurrently. The reindex path is fast (seconds); a
|
|
16
|
+
full re-index blocks reads for its duration. A cursor-per-read path to deliver
|
|
17
|
+
true non-blocking reads during a rebuild is a tracked follow-up (v1.4.x).
|
|
18
|
+
- Cross-process: whichever process opens the file first holds an exclusive lock;
|
|
19
|
+
other processes cannot open it at all (even read-only). This is handled by the
|
|
20
|
+
existing socket-routing layer (read_client.py / server.py).
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
from collections.abc import Iterator
|
|
26
|
+
from contextlib import contextmanager
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
import duckdb
|
|
30
|
+
|
|
31
|
+
from sqlcg.core.graph_db import GraphBackend
|
|
32
|
+
from sqlcg.core.schema import SCHEMA_VERSION, NodeLabel, RelType
|
|
33
|
+
from sqlcg.utils.logging import getLogger
|
|
34
|
+
|
|
35
|
+
logger = getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Schema DDL — one CREATE TABLE per node label + edge type
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
# DDL for the node tables: one CREATE TABLE statement per node label.
# Every statement uses IF NOT EXISTS and declares a single VARCHAR primary-key
# column, so executing the whole list against an existing database creates
# nothing new. The column order of each table matches the corresponding entry
# in _NODE_COLUMNS.
_NODE_DDLS = [
    # --- node tables ---
    # Repo: keyed by its path.
    """
    CREATE TABLE IF NOT EXISTS "Repo" (
        path VARCHAR PRIMARY KEY,
        name VARCHAR
    )
    """,
    # File: keyed by its path; parse_failed / parse_cause record parse status.
    """
    CREATE TABLE IF NOT EXISTS "File" (
        path VARCHAR PRIMARY KEY,
        repo_path VARCHAR,
        sha VARCHAR,
        dialect VARCHAR,
        parse_failed BOOLEAN,
        parse_cause VARCHAR
    )
    """,
    # SqlTable: keyed by its qualified name; defined_in_file duplicates the
    # DEFINED_IN edge as a plain column.
    """
    CREATE TABLE IF NOT EXISTS "SqlTable" (
        qualified VARCHAR PRIMARY KEY,
        catalog VARCHAR,
        db VARCHAR,
        name VARCHAR,
        kind VARCHAR,
        defined_in_file VARCHAR
    )
    """,
    # SqlColumn: keyed by an opaque id; table_qualified points back at SqlTable.
    """
    CREATE TABLE IF NOT EXISTS "SqlColumn" (
        id VARCHAR PRIMARY KEY,
        catalog VARCHAR,
        db VARCHAR,
        table_name VARCHAR,
        col_name VARCHAR,
        table_qualified VARCHAR
    )
    """,
    # SqlQuery: one row per statement, keyed by an opaque id.
    """
    CREATE TABLE IF NOT EXISTS "SqlQuery" (
        id VARCHAR PRIMARY KEY,
        file_path VARCHAR,
        statement_index BIGINT,
        sql VARCHAR,
        kind VARCHAR,
        target_table VARCHAR,
        parse_failed BOOLEAN,
        confidence FLOAT,
        parsing_mode VARCHAR,
        start_line BIGINT
    )
    """,
    # SchemaVersion: keyed by the version string, so a different version value
    # produces an additional row rather than overwriting the existing one.
    """
    CREATE TABLE IF NOT EXISTS "SchemaVersion" (
        version VARCHAR PRIMARY KEY,
        indexed_sha VARCHAR
    )
    """,
    # ExternalConsumer: keyed by name.
    """
    CREATE TABLE IF NOT EXISTS "ExternalConsumer" (
        name VARCHAR PRIMARY KEY,
        consumer_type VARCHAR
    )
    """,
]
|
|
106
|
+
|
|
107
|
+
# DDL for the edge tables: one CREATE TABLE statement per relationship type.
# Every edge table stores the endpoint keys as (src_key, dst_key) and uses that
# pair as its composite primary key, so a table holds at most one edge per
# (source, destination) pair. Tables with edge properties add the columns
# listed for that relationship in _EDGE_EXTRA_COLUMNS.
_EDGE_DDLS = [
    """
    CREATE TABLE IF NOT EXISTS "BELONGS_TO" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "DEFINED_IN" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "QUERY_DEFINED_IN" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    # HAS_COLUMN carries one property: source.
    """
    CREATE TABLE IF NOT EXISTS "HAS_COLUMN" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        source VARCHAR,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "SELECTS_FROM" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "INSERTS_INTO" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "DELETES_FROM" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "UPDATES" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    # COLUMN_LINEAGE carries transform, confidence and query_id.
    # NOTE(review): query_id is not part of the primary key, so an upsert for
    # the same (src_key, dst_key) pair coming from a different query replaces
    # the earlier row — confirm this is the intended lineage granularity.
    """
    CREATE TABLE IF NOT EXISTS "COLUMN_LINEAGE" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        transform VARCHAR,
        confidence FLOAT,
        query_id VARCHAR,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "DECLARES" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    # STAR_SOURCE carries qualifier, target_table and confidence.
    """
    CREATE TABLE IF NOT EXISTS "STAR_SOURCE" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        qualifier VARCHAR,
        target_table VARCHAR,
        confidence FLOAT,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS "CONSUMED_BY" (
        src_key VARCHAR NOT NULL,
        dst_key VARCHAR NOT NULL,
        PRIMARY KEY (src_key, dst_key)
    )
    """,
]
|
|
200
|
+
|
|
201
|
+
# Secondary indexes for the edge tables: for every relationship table, one
# index on src_key followed by one on dst_key. The statements are generated in
# table order so the list reads BELONGS_TO src, BELONGS_TO dst, DEFINED_IN src, …
_INDEX_DDLS = [
    f'CREATE INDEX IF NOT EXISTS idx_{edge}_{end} ON "{edge}" ({end}_key)'
    for edge in (
        "BELONGS_TO",
        "DEFINED_IN",
        "QUERY_DEFINED_IN",
        "HAS_COLUMN",
        "SELECTS_FROM",
        "INSERTS_INTO",
        "DELETES_FROM",
        "UPDATES",
        "COLUMN_LINEAGE",
        "DECLARES",
        "STAR_SOURCE",
        "CONSUMED_BY",
    )
    for end in ("src", "dst")
]
|
|
227
|
+
|
|
228
|
+
# Node label → column list (all columns of that table, in DDL order).
# upsert_node and upsert_nodes_bulk look a label up here to build their INSERT
# column list: a label missing from this mapping is rejected with ValueError,
# and a property whose name is not listed for its label is not written.
# The first column of each list is that table's primary-key column in
# _NODE_DDLS.
_NODE_COLUMNS: dict[str, list[str]] = {
    NodeLabel.REPO: ["path", "name"],
    NodeLabel.FILE: ["path", "repo_path", "sha", "dialect", "parse_failed", "parse_cause"],
    NodeLabel.TABLE: ["qualified", "catalog", "db", "name", "kind", "defined_in_file"],
    NodeLabel.COLUMN: ["id", "catalog", "db", "table_name", "col_name", "table_qualified"],
    NodeLabel.QUERY: [
        "id",
        "file_path",
        "statement_index",
        "sql",
        "kind",
        "target_table",
        "parse_failed",
        "confidence",
        "parsing_mode",
        "start_line",
    ],
    NodeLabel.SCHEMA_VERSION: ["version", "indexed_sha"],
    NodeLabel.EXTERNAL_CONSUMER: ["name", "consumer_type"],
}
|
|
249
|
+
|
|
250
|
+
# Edge rel type → extra property columns beyond (src_key, dst_key).
# upsert_edge and upsert_edges_bulk append these names to the two key columns
# when building their INSERT column list; an empty list means the edge table
# has no property columns. The names match the non-key columns declared for
# the same table in _EDGE_DDLS.
_EDGE_EXTRA_COLUMNS: dict[str, list[str]] = {
    RelType.BELONGS_TO: [],
    RelType.DEFINED_IN: [],
    RelType.QUERY_DEFINED_IN: [],
    RelType.HAS_COLUMN: ["source"],
    RelType.SELECTS_FROM: [],
    RelType.INSERTS_INTO: [],
    RelType.DELETES_FROM: [],
    RelType.UPDATES: [],
    RelType.COLUMN_LINEAGE: ["transform", "confidence", "query_id"],
    RelType.DECLARES: [],
    RelType.STAR_SOURCE: ["qualifier", "target_table", "confidence"],
    RelType.CONSUMED_BY: [],
}
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
class DuckDBBackend(GraphBackend):
|
|
268
|
+
"""DuckDB implementation of the graph database backend.
|
|
269
|
+
|
|
270
|
+
Uses a single R/W connection for the process lifetime. DuckDB's MVCC
|
|
271
|
+
provides concurrent read-safety during write transactions: readers see
|
|
272
|
+
the last committed state until COMMIT flips the new graph atomically.
|
|
273
|
+
|
|
274
|
+
``init_schema`` is idempotent (CREATE IF NOT EXISTS everywhere).
|
|
275
|
+
``transaction`` provides real BEGIN/COMMIT/ROLLBACK semantics —
|
|
276
|
+
overriding the ABC no-op (ARCHITECTURE_REVIEW §3.1, HIGH).
|
|
277
|
+
"""
|
|
278
|
+
|
|
279
|
+
def __init__(self, db_path: str) -> None:
    """Connect to the DuckDB database stored at *db_path*.

    Args:
        db_path: Location of the DuckDB file, or ``':memory:'`` for an
            in-memory database.

    Raises:
        RuntimeError: If DuckDB's error text indicates the path could not
            be opened (for example a missing parent directory).
        duckdb.IOException: For every other I/O failure on connect; the
            error is re-raised with a "database is locked" message.
    """
    self._db_path = db_path
    self._txn_depth = 0  # nesting level maintained by transaction()
    try:
        self._conn = duckdb.connect(db_path)
    except duckdb.IOException as exc:
        detail = str(exc)
        path_missing = (
            "No such file or directory" in detail or "cannot open" in detail.lower()
        )
        if not path_missing:
            # Anything that is not a missing path is reported as lock contention.
            raise duckdb.IOException(
                "Database is locked — another sqlcg process is running. "
                "Wait for it to finish or stop it with: sqlcg mcp stop\n"
                f"(DuckDB path: {db_path})"
            ) from exc
        # Parent directory does not exist — database was never initialized.
        raise RuntimeError(
            f"Database path does not exist: {db_path}. "
            "Run 'sqlcg db init' then 'sqlcg index <path>' to initialize."
        ) from exc
|
|
306
|
+
|
|
307
|
+
# ------------------------------------------------------------------
|
|
308
|
+
# Schema management
|
|
309
|
+
# ------------------------------------------------------------------
|
|
310
|
+
|
|
311
|
+
def init_schema(self) -> None:
    """Create every node table, edge table and index, then record the schema version.

    All statements use ``IF NOT EXISTS`` and run inside a single
    ``transaction()`` block, so calling this again on an initialized
    database creates nothing new and a failure part-way through is rolled
    back as a unit.

    The ``SchemaVersion`` row for the current ``SCHEMA_VERSION`` is written
    with INSERT OR REPLACE; the sub-select carries over the ``indexed_sha``
    already stored for that version (NULL when there is none).
    """
    version_upsert = (
        'INSERT OR REPLACE INTO "SchemaVersion" (version, indexed_sha) '
        "VALUES (?, COALESCE("
        ' (SELECT indexed_sha FROM "SchemaVersion" WHERE version = ?), NULL'
        "))"
    )
    with self.transaction():
        # Tables first (nodes, then edges), indexes last.
        for statement in (*_NODE_DDLS, *_EDGE_DDLS, *_INDEX_DDLS):
            self._conn.execute(statement)
        self._conn.execute(version_upsert, [SCHEMA_VERSION, SCHEMA_VERSION])
    logger.debug("DuckDB schema initialized (version %s)", SCHEMA_VERSION)
|
|
334
|
+
|
|
335
|
+
# ------------------------------------------------------------------
|
|
336
|
+
# Single-row upsert helpers (used by reindex_file / tests)
|
|
337
|
+
# ------------------------------------------------------------------
|
|
338
|
+
|
|
339
|
+
def upsert_node(self, label: str, key: str, properties: dict[str, Any]) -> None:
    """Insert or replace a single node row in the table named *label*.

    Args:
        label: Node label; must be a key of ``_NODE_COLUMNS``.
        key: Value stored in the label's primary-key column.
        properties: Remaining column values. Columns that are absent are
            written as NULL; names that are not columns of the table are
            not written.

    Raises:
        ValueError: If *label* has no entry in ``_NODE_COLUMNS``.
    """
    self._validate_props(properties)
    pk_name = self._pk_field(label)
    columns = _NODE_COLUMNS.get(label)
    if columns is None:
        raise ValueError(f"Unknown node label: {label!r}")

    # Key first, then the properties layered on top of it.
    record = {pk_name: key, **properties}
    quoted_columns = ", ".join(f'"{name}"' for name in columns)
    markers = ", ".join("?" for _ in columns)
    self._conn.execute(
        f'INSERT OR REPLACE INTO "{label}" ({quoted_columns}) VALUES ({markers})',
        [record.get(name) for name in columns],
    )
|
|
356
|
+
|
|
357
|
+
def upsert_edge(
    self,
    src_label: str,
    src_key: str,
    dst_label: str,
    dst_key: str,
    rel_type: str,
    properties: dict[str, Any],
) -> None:
    """Upsert one edge (MERGE semantics via INSERT OR REPLACE).

    Args:
        src_label: Label of the source node (not used to build the statement).
        src_key: Key of the source node, stored in ``src_key``.
        dst_label: Label of the destination node (not used to build the statement).
        dst_key: Key of the destination node, stored in ``dst_key``.
        rel_type: Relationship type; must be a key of ``_EDGE_EXTRA_COLUMNS``.
        properties: Edge properties. Only the columns listed for *rel_type*
            in ``_EDGE_EXTRA_COLUMNS`` are written; missing ones become NULL.

    Raises:
        ValueError: If *rel_type* has no entry in ``_EDGE_EXTRA_COLUMNS``.
    """
    self._validate_props(properties)
    # rel_type is interpolated into the statement as a table identifier, so
    # only names known to the schema are accepted — mirroring the label check
    # in upsert_node instead of silently treating an unknown type as an edge
    # with no property columns.
    extra = _EDGE_EXTRA_COLUMNS.get(rel_type)
    if extra is None:
        raise ValueError(f"Unknown relationship type: {rel_type!r}")
    all_cols = ["src_key", "dst_key", *extra]
    values = [src_key, dst_key, *(properties.get(c) for c in extra)]
    placeholders = ", ".join("?" * len(all_cols))
    col_list = ", ".join(f'"{c}"' for c in all_cols)
    self._conn.execute(
        f'INSERT OR REPLACE INTO "{rel_type}" ({col_list}) VALUES ({placeholders})',
        values,
    )
|
|
377
|
+
|
|
378
|
+
# ------------------------------------------------------------------
|
|
379
|
+
# Bulk upsert (performance-critical path — CLAUDE.md perf invariant)
|
|
380
|
+
# ------------------------------------------------------------------
|
|
381
|
+
|
|
382
|
+
def upsert_nodes_bulk(self, label: str, rows: list[dict[str, Any]]) -> None:
    """Insert or replace many nodes of one label with a single statement.

    Each table column is bound as one array parameter and expanded with
    ``unnest()``, so the whole batch costs one ``execute()`` call.

    Args:
        label: Node label; must be a key of ``_NODE_COLUMNS``.
        rows: Property dicts, all with the same key set, each including the
            label's primary-key field. An empty list is a no-op.

    Raises:
        ValueError: If *label* is unknown, the first row lacks the primary
            key, or a later row's key set differs from the first row's.
    """
    if not rows:
        return
    for candidate in rows:
        self._validate_props(candidate)

    columns = _NODE_COLUMNS.get(label)
    if columns is None:
        raise ValueError(f"Unknown node label: {label!r}")

    # The first row defines the expected shape; any deviation points at a bug
    # in whatever built the batch.
    pk_field = self._pk_field(label)
    expected = set(rows[0])
    if pk_field not in expected:
        raise ValueError(
            f"upsert_nodes_bulk({label}): every row must include primary key '{pk_field}'"
        )
    for position, candidate in enumerate(rows[1:], 1):
        if set(candidate) != expected:
            raise ValueError(
                f"upsert_nodes_bulk({label}): row {position} has property keys "
                f"{sorted(candidate)}, expected {sorted(expected)}"
            )

    # Column-major layout: one list per table column, None where a row has no
    # value for it. Every array is bound as VARCHAR[] regardless of the
    # column's declared type.
    per_column = [[candidate.get(name) for candidate in rows] for name in columns]
    quoted_columns = ", ".join(f'"{name}"' for name in columns)
    projections = ", ".join(f'unnest(?::VARCHAR[]) AS "{name}"' for name in columns)
    self._conn.execute(
        f'INSERT OR REPLACE INTO "{label}" ({quoted_columns}) SELECT {projections}',
        per_column,
    )
|
|
422
|
+
|
|
423
|
+
def upsert_edges_bulk(
    self,
    src_label: str,
    dst_label: str,
    rel_type: str,
    rows: list[dict[str, Any]],
) -> None:
    """Bulk-upsert edges of one (src_label, rel_type, dst_label) triple.

    One execute() per rel_type per batch: every column is bound as a single
    array parameter and expanded with ``unnest()``.

    Args:
        src_label: Label of the source nodes (used in error messages only).
        dst_label: Label of the destination nodes (used in error messages only).
        rel_type: Relationship type; must be a key of ``_EDGE_EXTRA_COLUMNS``.
        rows: Edge dicts, all with the same key set, each carrying
            ``src_key`` and ``dst_key`` plus any edge properties. An empty
            list is a no-op.

    Raises:
        ValueError: If *rel_type* is unknown, the first row lacks
            ``src_key`` or ``dst_key``, or a later row's key set differs
            from the first row's.
    """
    if not rows:
        return
    for row in rows:
        props = {k: v for k, v in row.items() if k not in ("src_key", "dst_key")}
        self._validate_props(props)

    # rel_type becomes a table identifier in the statement below, so reject
    # anything the schema does not define — the same guard upsert_nodes_bulk
    # applies to node labels — rather than falling back to "no extra columns".
    extra = _EDGE_EXTRA_COLUMNS.get(rel_type)
    if extra is None:
        raise ValueError(f"Unknown relationship type: {rel_type!r}")

    # Every edge row must carry src_key and dst_key, and the batch must be
    # homogeneous — guards against a malformed row reaching the NOT NULL
    # constraint as an opaque ConstraintException.
    keys = set(rows[0].keys())
    for required in ("src_key", "dst_key"):
        if required not in keys:
            raise ValueError(
                f"upsert_edges_bulk({src_label}->{rel_type}->{dst_label}): "
                f"every row must include '{required}'"
            )
    for i, row in enumerate(rows[1:], 1):
        if set(row.keys()) != keys:
            raise ValueError(
                f"upsert_edges_bulk: row {i} has property keys {sorted(row.keys())}, "
                f"expected {sorted(keys)}"
            )

    all_cols = ["src_key", "dst_key", *extra]
    # Column-major layout: one list per column, None where a row has no value.
    arrays = [[row.get(c) for row in rows] for c in all_cols]
    col_list = ", ".join(f'"{c}"' for c in all_cols)
    unnest_cols = ", ".join(f'unnest(?::VARCHAR[]) AS "{c}"' for c in all_cols)
    self._conn.execute(
        f'INSERT OR REPLACE INTO "{rel_type}" ({col_list}) SELECT {unnest_cols}',
        arrays,
    )
|
|
466
|
+
|
|
467
|
+
# ------------------------------------------------------------------
|
|
468
|
+
# Read / write
|
|
469
|
+
# ------------------------------------------------------------------
|
|
470
|
+
|
|
471
|
+
@staticmethod
|
|
472
|
+
def _params_list(params: dict[str, Any]) -> list[Any]:
|
|
473
|
+
"""Convert a params dict to a positional list for DuckDB's ? placeholder.
|
|
474
|
+
|
|
475
|
+
DuckDB expects positional parameters as a list. Callers that use
|
|
476
|
+
named-param dicts (``{"name": value}``) get values in insertion order
|
|
477
|
+
(Python 3.7+ dicts). Callers that need a raw list pass ``{0: v, …}``
|
|
478
|
+
with integer keys — we return ``list(params.values())`` in both cases.
|
|
479
|
+
"""
|
|
480
|
+
return list(params.values())
|
|
481
|
+
|
|
482
|
+
def run_read(self, query: str, params: dict[str, Any]) -> list[dict[str, Any]]:
|
|
483
|
+
"""Execute a read-only SQL query and return results as list of dicts."""
|
|
484
|
+
try:
|
|
485
|
+
if params:
|
|
486
|
+
result = self._conn.execute(query, self._params_list(params))
|
|
487
|
+
else:
|
|
488
|
+
result = self._conn.execute(query)
|
|
489
|
+
columns = [desc[0] for desc in result.description or []]
|
|
490
|
+
return [dict(zip(columns, row, strict=False)) for row in result.fetchall()]
|
|
491
|
+
except Exception as exc:
|
|
492
|
+
logger.error("run_read failed: %s", exc)
|
|
493
|
+
raise
|
|
494
|
+
|
|
495
|
+
def run_write(self, query: str, params: dict[str, Any]) -> None:
|
|
496
|
+
"""Execute a write SQL statement."""
|
|
497
|
+
try:
|
|
498
|
+
if params:
|
|
499
|
+
self._conn.execute(query, self._params_list(params))
|
|
500
|
+
else:
|
|
501
|
+
self._conn.execute(query)
|
|
502
|
+
except Exception as exc:
|
|
503
|
+
logger.error("run_write failed: %s", exc)
|
|
504
|
+
raise
|
|
505
|
+
|
|
506
|
+
# ------------------------------------------------------------------
|
|
507
|
+
# Delete helpers
|
|
508
|
+
# ------------------------------------------------------------------
|
|
509
|
+
|
|
510
|
+
def delete_nodes_for_file(self, file_path: str) -> None:
    """Delete all nodes and edges for *file_path*.

    Deletion order:
      A. SqlColumn nodes for tables DEFINED_IN this file
      B. SqlQuery nodes QUERY_DEFINED_IN this file (all their edges via cascades
         implemented as explicit DELETE on each edge table)
      C. SqlTable nodes DEFINED_IN this file
      D. The File node itself

    DuckDB has no DETACH DELETE — we delete edge rows first, then node rows.
    The order matters: steps A and C resolve their targets through the
    DEFINED_IN edge table, which is therefore only emptied at the end of C.

    This method issues no BEGIN/COMMIT itself; the statements run in whatever
    transaction state the connection is in when it is called.
    NOTE(review): presumably callers wrap this in ``transaction()`` so a
    failure part-way through does not leave a half-deleted file — confirm.

    Args:
        file_path: Path identifying the File node (the ``dst_key`` of its
            DEFINED_IN / QUERY_DEFINED_IN edges).

    Raises:
        Exception: Whatever the connection raises, after logging it.
    """
    try:
        # A: columns for tables defined in this file
        # Targets are found via DEFINED_IN edges only.
        # NOTE(review): step C also matches tables by their defined_in_file
        # column; columns of tables matched only that way are not removed
        # here — confirm whether that case can occur.
        self._conn.execute(
            'DELETE FROM "SqlColumn" WHERE id IN ('
            ' SELECT dst_key FROM "HAS_COLUMN" WHERE src_key IN ('
            ' SELECT src_key FROM "DEFINED_IN" WHERE dst_key = ?'
            " )"
            ")",
            [file_path],
        )
        # Remove HAS_COLUMN edges for those tables
        self._conn.execute(
            'DELETE FROM "HAS_COLUMN" WHERE src_key IN ('
            ' SELECT src_key FROM "DEFINED_IN" WHERE dst_key = ?'
            ")",
            [file_path],
        )

        # B: query edges + query nodes
        # Collect all query IDs for this file (via file_path column or QUERY_DEFINED_IN edge).
        query_ids_result = self._conn.execute(
            'SELECT id FROM "SqlQuery" WHERE file_path = ?'
            " UNION"
            ' SELECT src_key FROM "QUERY_DEFINED_IN" WHERE dst_key = ?',
            [file_path, file_path],
        )
        query_ids = [row[0] for row in query_ids_result.fetchall()]

        if query_ids:
            # One "?" per collected id; reused by every IN (...) list below,
            # including the SqlQuery delete further down.
            placeholders = ", ".join("?" * len(query_ids))
            # Edge tables whose source endpoint is a query.
            for edge_table in (
                "SELECTS_FROM",
                "INSERTS_INTO",
                "DELETES_FROM",
                "UPDATES",
                "DECLARES",
                "STAR_SOURCE",
            ):
                self._conn.execute(
                    f'DELETE FROM "{edge_table}" WHERE src_key IN ({placeholders})',
                    query_ids,
                )
            # COLUMN_LINEAGE edges referencing queries from this file
            # (matched on the query_id property, not on an endpoint key).
            self._conn.execute(
                f'DELETE FROM "COLUMN_LINEAGE" WHERE query_id IN ({placeholders})',
                query_ids,
            )
        # Remove QUERY_DEFINED_IN edges for this file
        # (runs even when no query ids were collected).
        self._conn.execute(
            'DELETE FROM "QUERY_DEFINED_IN" WHERE dst_key = ?',
            [file_path],
        )
        # Delete query nodes
        if query_ids:
            self._conn.execute(
                f'DELETE FROM "SqlQuery" WHERE id IN ({placeholders})',
                query_ids,
            )

        # C: table nodes + their DEFINED_IN edges
        # Remove SqlTable nodes that are defined in (or linked via DEFINED_IN to) this file.
        # We look at both the DEFINED_IN edge table and the defined_in_file column.
        self._conn.execute(
            'DELETE FROM "SqlTable" WHERE qualified IN ('
            ' SELECT src_key FROM "DEFINED_IN" WHERE dst_key = ?'
            " UNION"
            ' SELECT qualified FROM "SqlTable" WHERE defined_in_file = ?'
            ")",
            [file_path, file_path],
        )
        # The DEFINED_IN edges go last: steps A and C needed them as a lookup.
        self._conn.execute(
            'DELETE FROM "DEFINED_IN" WHERE dst_key = ?',
            [file_path],
        )

        # D: File node
        # NOTE(review): BELONGS_TO and CONSUMED_BY rows are not touched by this
        # method — confirm whether any of them can reference this file.
        self._conn.execute('DELETE FROM "File" WHERE path = ?', [file_path])

        logger.debug("delete_nodes_for_file: %s", file_path)
    except Exception as exc:
        logger.error("delete_nodes_for_file failed for %s: %s", file_path, exc)
        raise
|
|
604
|
+
|
|
605
|
+
# ------------------------------------------------------------------
|
|
606
|
+
# Schema version / SHA tracking
|
|
607
|
+
# ------------------------------------------------------------------
|
|
608
|
+
|
|
609
|
+
def get_schema_version(self) -> str | None:
|
|
610
|
+
"""Return the stored schema version, or None."""
|
|
611
|
+
try:
|
|
612
|
+
result = self._conn.execute('SELECT version FROM "SchemaVersion" LIMIT 1')
|
|
613
|
+
row = result.fetchone()
|
|
614
|
+
return row[0] if row else None
|
|
615
|
+
except Exception as exc:
|
|
616
|
+
logger.warning("Failed to read schema version: %s", exc)
|
|
617
|
+
return None
|
|
618
|
+
|
|
619
|
+
def set_indexed_sha(self, sha: str) -> None:
    """Store *sha* as the ``indexed_sha`` of the current schema-version row.

    The row keyed by ``SCHEMA_VERSION`` is written with INSERT OR REPLACE.
    This is best-effort: a failure is logged as a warning and not raised.

    Args:
        sha: Git SHA to record.
    """
    statement = 'INSERT OR REPLACE INTO "SchemaVersion" (version, indexed_sha) VALUES (?, ?)'
    try:
        self._conn.execute(statement, [SCHEMA_VERSION, sha])
    except Exception as exc:
        logger.warning("Failed to write indexed_sha: %s", exc)
|
|
628
|
+
|
|
629
|
+
def get_indexed_sha(self) -> str | None:
|
|
630
|
+
"""Return the stored git SHA, or None."""
|
|
631
|
+
try:
|
|
632
|
+
result = self._conn.execute('SELECT indexed_sha FROM "SchemaVersion" LIMIT 1')
|
|
633
|
+
row = result.fetchone()
|
|
634
|
+
return row[0] if row else None
|
|
635
|
+
except Exception as exc:
|
|
636
|
+
logger.warning("Failed to read indexed_sha: %s", exc)
|
|
637
|
+
return None
|
|
638
|
+
|
|
639
|
+
# ------------------------------------------------------------------
|
|
640
|
+
# Star expansion (replaces Cypher MERGE-based EXPAND_STAR_SOURCES)
|
|
641
|
+
# ------------------------------------------------------------------
|
|
642
|
+
|
|
643
|
+
def expand_star_sources(self) -> int:
    """Run the three star-expansion statements, then count the resulting edges.

    The statements are imported from ``sqlcg.core.queries`` and executed in
    a fixed order:
      1. ``EXPAND_STAR_SOURCES_QUERY``
      2. ``EXPAND_STAR_SOURCES_HAS_COLUMN_QUERY``
      3. ``EXPAND_STAR_SOURCES_LINEAGE_QUERY``

    Returns:
        Number of COLUMN_LINEAGE rows whose transform is 'STAR_EXPANSION'
        after the statements have run.
    """
    from sqlcg.core.queries import (
        EXPAND_STAR_SOURCES_HAS_COLUMN_QUERY,
        EXPAND_STAR_SOURCES_LINEAGE_QUERY,
        EXPAND_STAR_SOURCES_QUERY,
    )

    steps = (
        EXPAND_STAR_SOURCES_QUERY,
        EXPAND_STAR_SOURCES_HAS_COLUMN_QUERY,
        EXPAND_STAR_SOURCES_LINEAGE_QUERY,
    )
    for statement in steps:
        self._conn.execute(statement)

    counted = self._conn.execute(
        'SELECT count(*) AS n FROM "COLUMN_LINEAGE" WHERE transform = ?',
        ["STAR_EXPANSION"],
    ).fetchone()
    if not counted:
        return 0
    return int(counted[0])
|
|
670
|
+
|
|
671
|
+
# ------------------------------------------------------------------
|
|
672
|
+
# Full-rebuild helpers
|
|
673
|
+
# ------------------------------------------------------------------
|
|
674
|
+
|
|
675
|
+
def clear_all_tables(self) -> None:
    """Empty every node and edge table while leaving the schema in place.

    Rows are removed with plain ``DELETE`` statements — edge tables first,
    then node tables. ``SchemaVersion`` is deliberately not in the list, so
    its row(s) survive. No transaction is opened here; the statements run in
    whatever transaction state the connection is in when this is called.
    """
    # Edge tables come first, node tables after.
    deletion_order = (
        "BELONGS_TO",
        "DEFINED_IN",
        "QUERY_DEFINED_IN",
        "HAS_COLUMN",
        "SELECTS_FROM",
        "INSERTS_INTO",
        "DELETES_FROM",
        "UPDATES",
        "COLUMN_LINEAGE",
        "DECLARES",
        "STAR_SOURCE",
        "CONSUMED_BY",
        "Repo",
        "File",
        "SqlTable",
        "SqlColumn",
        "SqlQuery",
        "ExternalConsumer",
    )
    for table_name in deletion_order:
        self._conn.execute(f'DELETE FROM "{table_name}"')
    logger.debug("DuckDBBackend: all node/edge tables cleared")
|
|
711
|
+
|
|
712
|
+
# ------------------------------------------------------------------
|
|
713
|
+
# Lifecycle
|
|
714
|
+
# ------------------------------------------------------------------
|
|
715
|
+
|
|
716
|
+
def close(self) -> None:
    """Shut down the underlying DuckDB connection.

    Raises:
        Exception: Whatever closing the connection raises, after logging it.
    """
    try:
        self._conn.close()
        logger.debug("DuckDBBackend connection closed")
    except Exception as close_err:
        logger.error("Error closing DuckDBBackend: %s", close_err)
        raise
|
|
724
|
+
|
|
725
|
+
# ------------------------------------------------------------------
|
|
726
|
+
# Transaction
|
|
727
|
+
# ------------------------------------------------------------------
|
|
728
|
+
|
|
729
|
+
@contextmanager
|
|
730
|
+
def transaction(self) -> Iterator[DuckDBBackend]:
|
|
731
|
+
"""Real BEGIN … COMMIT / ROLLBACK transaction context manager.
|
|
732
|
+
|
|
733
|
+
Overrides the ABC no-op (ARCHITECTURE_REVIEW §3.1, HIGH).
|
|
734
|
+
DuckDB DDL is transactional so schema creation in init_schema
|
|
735
|
+
is also covered.
|
|
736
|
+
|
|
737
|
+
Reentrant: DuckDB rejects ``BEGIN`` inside an open transaction, so a
|
|
738
|
+
nested call is a no-op that joins the outer transaction — the
|
|
739
|
+
outermost level owns COMMIT/ROLLBACK. This lets the drain body wrap
|
|
740
|
+
``clear_all_tables()`` + ``index_repo()`` (which opens its own
|
|
741
|
+
per-batch transactions) in one atomic rebuild (Phase 4, C3/C6).
|
|
742
|
+
"""
|
|
743
|
+
if self._txn_depth > 0:
|
|
744
|
+
# Joined the outer transaction — an exception propagates out and
|
|
745
|
+
# the outermost level rolls everything back.
|
|
746
|
+
self._txn_depth += 1
|
|
747
|
+
try:
|
|
748
|
+
yield self
|
|
749
|
+
finally:
|
|
750
|
+
self._txn_depth -= 1
|
|
751
|
+
return
|
|
752
|
+
self._conn.execute("BEGIN")
|
|
753
|
+
self._txn_depth = 1
|
|
754
|
+
try:
|
|
755
|
+
yield self
|
|
756
|
+
self._conn.execute("COMMIT")
|
|
757
|
+
except Exception:
|
|
758
|
+
try:
|
|
759
|
+
self._conn.execute("ROLLBACK")
|
|
760
|
+
except Exception as rb_err:
|
|
761
|
+
logger.debug("ROLLBACK failed: %s", rb_err)
|
|
762
|
+
raise
|
|
763
|
+
finally:
|
|
764
|
+
self._txn_depth = 0
|