codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,701 @@
|
|
|
1
|
+
"""Typed read/write accessors. All SQL lives here."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import sqlite3
|
|
7
|
+
from typing import Any, Iterable, Optional, Sequence
|
|
8
|
+
|
|
9
|
+
from ..parsers.base import Chunk, Symbol
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def upsert_file(
    conn: sqlite3.Connection,
    *,
    path: str,
    lang: Optional[str],
    size_bytes: int,
    sha256: str,
    mtime_ns: int,
    git_status: Optional[str],
    parser: str,
    indexed_at: str,
    is_generated: bool,
) -> int:
    """Insert a ``files`` row, or refresh the existing row with the same path.

    On a path conflict every non-key column is overwritten with the new
    values. ``is_generated`` is stored as the integer 1 or 0.

    Returns:
        The ``id`` of the inserted or updated row (via ``RETURNING id``).
    """
    bindings = {
        "path": path,
        "lang": lang,
        "size_bytes": size_bytes,
        "sha256": sha256,
        "mtime_ns": mtime_ns,
        "git_status": git_status,
        "parser": parser,
        "indexed_at": indexed_at,
        "is_generated": int(bool(is_generated)),
    }
    upsert_sql = """
        INSERT INTO files
        (path, lang, size_bytes, sha256, mtime_ns, git_status, parser, indexed_at, is_generated)
        VALUES
        (:path, :lang, :size_bytes, :sha256, :mtime_ns, :git_status, :parser, :indexed_at, :is_generated)
        ON CONFLICT(path) DO UPDATE SET
        lang = excluded.lang,
        size_bytes = excluded.size_bytes,
        sha256 = excluded.sha256,
        mtime_ns = excluded.mtime_ns,
        git_status = excluded.git_status,
        parser = excluded.parser,
        indexed_at = excluded.indexed_at,
        is_generated = excluded.is_generated
        RETURNING id
        """
    returned = conn.execute(upsert_sql, bindings).fetchone()
    return int(returned[0])
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_file(conn: sqlite3.Connection, path: str) -> Optional[sqlite3.Row]:
    """Fetch the full ``files`` row stored for *path*, or None when there is none."""
    cursor = conn.execute("SELECT * FROM files WHERE path = ?", (path,))
    return cursor.fetchone()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def all_paths(conn: sqlite3.Connection) -> set[str]:
    """Return the set of every ``path`` value stored in ``files``."""
    found: set[str] = set()
    for record in conn.execute("SELECT path FROM files"):
        found.add(record[0])
    return found
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def delete_files(conn: sqlite3.Connection, paths: Iterable[str]) -> int:
    """Delete the ``files`` rows whose path is in *paths*.

    Args:
        conn: Open database connection.
        paths: Paths to remove; any iterable (it is consumed once).

    Returns:
        The number of ``files`` rows actually deleted. Paths that were not
        present contribute nothing.
    """
    targets = [(p,) for p in paths]
    if not targets:
        return 0
    cursor = conn.executemany("DELETE FROM files WHERE path = ?", targets)
    # Cursor.rowcount sums only the rows removed directly by the DELETE
    # statements. A Connection.total_changes delta would also include rows
    # removed by triggers and foreign-key cascades, over-reporting the count.
    return max(cursor.rowcount, 0)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def count_files(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``files``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM files").fetchone()
    return int(total)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_meta(conn: sqlite3.Connection, key: str) -> Optional[str]:
    """Return the ``meta`` value stored under *key*, or None if the key is absent."""
    found = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
    if found is None:
        return None
    return found[0]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def set_meta(conn: sqlite3.Connection, key: str, value: str) -> None:
    """Store *value* under *key* in ``meta``, overwriting any existing value."""
    upsert_sql = (
        "INSERT INTO meta(key, value) VALUES (?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )
    conn.execute(upsert_sql, (key, value))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def replace_chunks(
    conn: sqlite3.Connection,
    file_id: int,
    chunks: Sequence[Chunk],
    symbol_ids: Optional[Sequence[int]] = None,
) -> int:
    """Delete the file's existing chunk rows and insert *chunks* in their place.

    Args:
        conn: Open database connection.
        file_id: Owning ``files`` row.
        chunks: Chunks to store.
        symbol_ids: Optional table of symbol row ids; a chunk's
            ``symbol_index`` is looked up in it to fill ``symbol_id``.

    Returns:
        ``len(chunks)``.
    """
    conn.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,))

    rows = []
    for chunk in chunks:
        # A chunk is linked to a symbol only when it carries an index AND the
        # caller supplied the id table to resolve it against.
        if chunk.symbol_index is None or symbol_ids is None:
            linked = None
        else:
            linked = symbol_ids[chunk.symbol_index]
        rows.append(
            (
                file_id,
                chunk.line_start,
                chunk.line_end,
                chunk.kind,
                linked,
                chunk.content,
                chunk.token_est,
                linked,  # bound a second time for the symbol-name subquery
            )
        )

    # symbol_names is a denormalized copy of the linked symbol's name, read
    # from the symbols table at insert time ('' when there is no such symbol).
    # The original note says it is stored so the FTS triggers can replay it
    # verbatim on delete/update, and that replace_symbols runs first.
    conn.executemany(
        """
        INSERT INTO chunks
        (file_id, line_start, line_end, kind, symbol_id, content, token_est, symbol_names)
        VALUES
        (?, ?, ?, ?, ?, ?, ?, COALESCE((SELECT name FROM symbols WHERE id = ?), ''))
        """,
        rows,
    )
    return len(chunks)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def append_chunks(
    conn: sqlite3.Connection,
    file_id: int,
    chunks: Sequence[Chunk],
) -> int:
    """Insert *chunks* for a file while leaving its existing chunk rows alone.

    Rows are written with a NULL ``symbol_id``. The original docstring notes
    this path is meant for doc chunks.

    Returns:
        ``len(chunks)``.
    """
    insert_sql = """
        INSERT INTO chunks
        (file_id, line_start, line_end, kind, symbol_id, content, token_est)
        VALUES
        (?, ?, ?, ?, NULL, ?, ?)
        """
    rows = []
    for chunk in chunks:
        rows.append(
            (file_id, chunk.line_start, chunk.line_end, chunk.kind, chunk.content, chunk.token_est)
        )
    conn.executemany(insert_sql, rows)
    return len(chunks)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def chunks_for_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row]:
    """Return all chunk rows belonging to *file_id*, ordered by ``line_start``."""
    query = "SELECT * FROM chunks WHERE file_id = ? ORDER BY line_start"
    cursor = conn.execute(query, (file_id,))
    return cursor.fetchall()
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def count_chunks(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``chunks``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
    return int(total)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def replace_symbols(
    conn: sqlite3.Connection, file_id: int, symbols: Sequence[Symbol]
) -> list[int]:
    """Replace the file's symbol rows with *symbols* and return the new row ids.

    Rows are first inserted with a NULL ``parent_id`` and zeroed degree
    counters. A second pass then sets ``parent_id`` for every symbol that has
    a ``parent_index``, which is an index into *symbols*.

    Returns:
        The new symbol row ids, in the same order as *symbols*.
    """
    conn.execute("DELETE FROM symbols WHERE file_id = ?", (file_id,))

    insert_sql = """
        INSERT INTO symbols
        (file_id, name, qualified, kind, line_start, line_end, signature,
        parent_id, docstring, in_degree, out_degree)
        VALUES (?, ?, ?, ?, ?, ?, ?, NULL, ?, 0, 0)
        """
    ids: list[int] = []
    for sym in symbols:
        inserted = conn.execute(
            insert_sql,
            (
                file_id,
                sym.name,
                sym.qualified,
                sym.kind,
                sym.line_start,
                sym.line_end,
                sym.signature,
                sym.docstring,
            ),
        )
        assert inserted.lastrowid is not None
        ids.append(int(inserted.lastrowid))

    # Parent links are written afterwards: a parent's row id is only known
    # once its own row has been inserted.
    for position, sym in enumerate(symbols):
        if sym.parent_index is None:
            continue
        conn.execute(
            "UPDATE symbols SET parent_id = ? WHERE id = ?",
            (ids[sym.parent_index], ids[position]),
        )
    return ids
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def symbols_by_name(
    conn: sqlite3.Connection,
    name: str,
    *,
    kind: Optional[str] = None,
    exact: bool = True,
) -> list[sqlite3.Row]:
    """Look up symbols by name, joined with the path of their file.

    Args:
        conn: Open database connection.
        name: Symbol name to match.
        kind: When truthy, only symbols of this kind are returned.
        exact: True matches the name with ``=``; False matches names that
            start with *name* (a LIKE prefix match).

    Returns:
        Symbol rows (all symbol columns plus ``path``), ordered by name,
        path, then ``line_start``.
    """
    if exact:
        predicate = "s.name = ?"
        params: list[Any] = [name]
    else:
        # Escape LIKE metacharacters so the name is matched literally.
        # Otherwise the '_' in a snake_case name matches any character.
        literal = name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        predicate = "s.name LIKE ? ESCAPE '\\'"
        params = [f"{literal}%"]

    sql = (
        "SELECT s.*, f.path AS path "
        "FROM symbols s JOIN files f ON f.id = s.file_id "
        f"WHERE {predicate}"
    )
    if kind:
        sql += " AND s.kind = ?"
        params.append(kind)
    sql += " ORDER BY s.name, f.path, s.line_start"
    return conn.execute(sql, params).fetchall()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def count_symbols(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``symbols``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM symbols").fetchone()
    return int(total)
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def treesitter_coverage(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return per-language file and symbol counts for tree-sitter-parsed files.

    Only files whose ``parser`` is ``'treesitter'`` are counted. Each row is
    ``(lang, files, symbols)``, most files first. The original docstring
    describes the purpose ("Guardrail 2"): a language with files but close to
    zero symbols suggests silent extraction failure, which `doctor` surfaces.
    """
    coverage_sql = """
        SELECT f.lang AS lang,
        COUNT(DISTINCT f.id) AS files,
        COUNT(s.id) AS symbols
        FROM files f
        LEFT JOIN symbols s ON s.file_id = f.id
        WHERE f.parser = 'treesitter'
        GROUP BY f.lang
        ORDER BY files DESC
        """
    cursor = conn.execute(coverage_sql)
    return cursor.fetchall()
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def replace_edges(
    conn: sqlite3.Connection, file_id: int, edges: Sequence[dict[str, Any]]
) -> int:
    """Delete the file's existing edge rows and insert *edges* in their place.

    Each edge dict supplies the named columns of the INSERT. ``file_id`` is
    always taken from the argument, and ``confidence`` falls back to
    ``'extracted'`` when the dict does not provide it.

    Returns:
        ``len(edges)``.
    """
    conn.execute("DELETE FROM edges WHERE file_id = ?", (file_id,))

    rows = []
    for edge in edges:
        # Per the original note, the 'extracted' default serves callers (and
        # tests) that predate the audit-trail column; the global graph pass
        # refines it later (see graph/builder.py).
        row: dict[str, Any] = {"confidence": "extracted"}
        row.update(edge)
        row["file_id"] = file_id  # the argument wins over any key in the dict
        rows.append(row)

    conn.executemany(
        """
        INSERT INTO edges
        (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line,
        resolved, confidence)
        VALUES
        (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line,
        :resolved, :confidence)
        """,
        rows,
    )
    return len(edges)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def count_edges(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``edges``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM edges").fetchone()
    return int(total)
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def refs_for_name(conn: sqlite3.Connection, name: str) -> list[sqlite3.Row]:
    """Return every ``'call'`` edge whose ``dst_name`` equals *name*.

    Each row carries the edge's line, file path, type, resolution flag,
    source id/kind and confidence. ``src_name`` and ``src_qualified`` are
    filled only when the source is a symbol (NULL otherwise). Rows are
    ordered by path, then line.
    """
    refs_sql = """
        SELECT e.line AS line, f.path AS path, e.edge_type AS edge_type,
        e.resolved AS resolved, e.src_id AS src_id, e.src_kind AS src_kind,
        e.confidence AS confidence,
        src.name AS src_name, src.qualified AS src_qualified
        FROM edges e
        JOIN files f ON f.id = e.file_id
        LEFT JOIN symbols src ON src.id = e.src_id AND e.src_kind = 'symbol'
        WHERE e.dst_name = ? AND e.edge_type = 'call'
        ORDER BY f.path, e.line
        """
    cursor = conn.execute(refs_sql, (name,))
    return cursor.fetchall()
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def fts_search(
    conn: sqlite3.Connection, match_query: str, *, limit: int
) -> list[sqlite3.Row]:
    """Run *match_query* against ``fts_chunks`` and return the matching chunks.

    A query that is empty or only whitespace returns ``[]`` without touching
    the database. Otherwise at most *limit* rows are returned, ordered by
    ``bm25(fts_chunks)``. Each row carries the chunk id, file path, line
    range, content, token estimate, bm25 score and chunk kind.
    """
    if match_query.strip() == "":
        return []

    search_sql = """
        SELECT c.id AS chunk_id,
        f.path AS path,
        c.line_start AS line_start,
        c.line_end AS line_end,
        c.content AS content,
        c.token_est AS token_est,
        bm25(fts_chunks) AS bm25,
        c.kind AS kind
        FROM fts_chunks
        JOIN chunks c ON c.id = fts_chunks.rowid
        JOIN files f ON f.id = c.file_id
        WHERE fts_chunks MATCH ?
        ORDER BY bm25(fts_chunks)
        LIMIT ?
        """
    cursor = conn.execute(search_sql, (match_query, limit))
    return cursor.fetchall()
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def path_search(
    conn: sqlite3.Connection, query: str, *, limit: int
) -> list[sqlite3.Row]:
    """Find files whose path contains tokens of *query*.

    The query is split on whitespace, ``/``, ``.`` and ``\\``. A file matches
    when its path contains at least one token, and its score (``hits``) is
    the number of tokens its path contains.

    Args:
        conn: Open database connection.
        query: Free-text path query.
        limit: Maximum number of rows returned.

    Returns:
        Rows of ``(file_id, path, mtime_ns, is_generated, hits)`` ordered by
        hits descending, then shorter paths first. ``[]`` when the query has
        no tokens.
    """
    tokens = [t for t in re.split(r"[\s/.\\]+", query.strip()) if t]
    if not tokens:
        return []

    # Tokens are matched as literal substrings, so LIKE metacharacters are
    # escaped. Otherwise '_' (common in file names) matches any character.
    like_args = [
        "%" + t.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + "%"
        for t in tokens
    ]
    match = "path LIKE ? ESCAPE '\\'"
    score_expr = " + ".join([f"({match})"] * len(tokens))
    where_expr = " OR ".join([match] * len(tokens))
    return conn.execute(
        f"""
        SELECT id AS file_id, path, mtime_ns, is_generated,
        ({score_expr}) AS hits
        FROM files
        WHERE {where_expr}
        ORDER BY hits DESC, length(path) ASC
        LIMIT ?
        """,
        # The patterns are bound twice: once for the score, once for WHERE.
        (*like_args, *like_args, limit),
    ).fetchall()
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def symbol_search(
    conn: sqlite3.Connection,
    name: str,
    *,
    limit: int,
    kind: Optional[str] = None,
    exact: bool = False,
) -> list[sqlite3.Row]:
    """Look up symbols by name, ranking exact, then prefix, then substring hits.

    Args:
        conn: Open database connection.
        name: Name to search for; surrounding whitespace is ignored and a
            blank name returns ``[]``.
        limit: Maximum number of rows returned.
        kind: When truthy, restrict results to symbols of this kind.
        exact: When True only case-insensitive exact matches are returned;
            otherwise prefix and substring matches are included too.

    Returns:
        Rows with the symbol's name, kind, signature, line range and degree
        counters, the file's path, mtime_ns and is_generated, and an
        ``is_exact`` flag. Ordered by exact match, then prefix match, then
        ``in_degree`` descending.
    """
    name = name.strip()
    if not name:
        return []

    # Escape LIKE metacharacters so the name is matched literally. Otherwise
    # the '_' in snake_case identifiers matches any character and unrelated
    # symbols are ranked as prefix hits.
    literal = name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    exact_match = "s.name = :exact COLLATE NOCASE"
    # The original's COLLATE NOCASE on the LIKE terms is dropped as redundant:
    # LIKE is case-insensitive for ASCII by default.
    prefix_match = "s.name LIKE :prefix ESCAPE '\\'"
    sub_match = "s.name LIKE :sub ESCAPE '\\'"

    if exact:
        name_clause = exact_match
    else:
        name_clause = f"({exact_match} OR {prefix_match} OR {sub_match})"
    kind_clause = "AND s.kind = :kind" if kind else ""

    params: dict[str, object] = {
        "exact": name,
        "prefix": f"{literal}%",
        "sub": f"%{literal}%",
        "limit": limit,
    }
    if kind:
        params["kind"] = kind

    return conn.execute(
        f"""
        SELECT s.name, s.kind, s.signature, s.line_start, s.line_end,
        s.in_degree, s.out_degree, f.path, f.mtime_ns, f.is_generated,
        ({exact_match}) AS is_exact
        FROM symbols s
        JOIN files f ON f.id = s.file_id
        WHERE {name_clause} {kind_clause}
        ORDER BY is_exact DESC,
        ({prefix_match}) DESC,
        s.in_degree DESC
        LIMIT :limit
        """,
        params,
    ).fetchall()
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def unresolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return every unresolved edge that names a target, with its file's language.

    Rows are ``(id, edge_type, dst_name, lang)`` for edges where
    ``resolved = 0`` and ``dst_name`` is not NULL, ordered by edge id.
    """
    query = (
        "SELECT e.id AS id, e.edge_type AS edge_type, e.dst_name AS dst_name, "
        " f.lang AS lang "
        "FROM edges e JOIN files f ON f.id = e.file_id "
        "WHERE e.resolved = 0 AND e.dst_name IS NOT NULL ORDER BY e.id"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def resolve_edge(conn: sqlite3.Connection, edge_id: int, dst_kind: str, dst_id: int) -> None:
    """Point one edge at a concrete target and mark it resolved."""
    update_sql = "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1 WHERE id = ?"
    bindings = (dst_kind, dst_id, edge_id)
    conn.execute(update_sql, bindings)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def resolve_edges_bulk(
    conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int, str]]
) -> None:
    """Apply many edge resolutions with a single ``executemany``.

    Each resolution is ``(dst_kind, dst_id, edge_id, confidence)``. Per the
    original docstring, ``confidence`` records how the target was found:
    ``'extracted'`` for an exact match (a repo-unique symbol name) and
    ``'inferred'`` for a heuristic (import path-suffix).
    """
    # The input tuple has edge_id before confidence; the statement binds
    # confidence before the id, so the last two fields are swapped.
    bindings = []
    for dst_kind, dst_id, edge_id, confidence in resolutions:
        bindings.append((dst_kind, dst_id, confidence, edge_id))
    conn.executemany(
        "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1, confidence = ? WHERE id = ?",
        bindings,
    )
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def mark_ambiguous_edges(conn: sqlite3.Connection) -> int:
    """Flag every still-unresolved edge that names a target as ``'ambiguous'``.

    Per the original docstring, this is meant to run after the global
    resolution pass: an edge with a ``dst_name`` that no unique symbol/file
    claims could not be pinned down (a non-unique name, or an import of code
    outside the repo). Marking it keeps refs/impact honest, because an empty
    or short answer over ambiguous edges is inconclusive, not proof of
    "no callers".

    Returns:
        The number of edge rows whose confidence was changed.
    """
    # IS NOT (rather than !=) so rows with a NULL confidence are flagged too:
    # `NULL != 'ambiguous'` evaluates to NULL and would skip them silently.
    cursor = conn.execute(
        "UPDATE edges SET confidence = 'ambiguous' "
        "WHERE resolved = 0 AND dst_name IS NOT NULL AND confidence IS NOT 'ambiguous'"
    )
    affected = cursor.rowcount
    return affected if affected is not None else 0
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def all_resolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return every resolved edge that has a concrete destination id.

    Rows are ``(src_kind, src_id, dst_kind, dst_id, edge_type, confidence)``.
    Per the original docstring this is the adjacency the graph analysis
    (communities / god nodes / bridges) is built from; unresolved edges are
    left out because they have no concrete endpoint.
    """
    query = (
        "SELECT src_kind, src_id, dst_kind, dst_id, edge_type, confidence FROM edges "
        "WHERE resolved = 1 AND dst_id IS NOT NULL"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def all_graph_nodes(conn: sqlite3.Connection) -> dict[str, list[sqlite3.Row]]:
    """Return file and symbol rows keyed by node kind (``"file"`` / ``"symbol"``).

    File rows are ``(id, path)``. Symbol rows carry the symbol's id, name,
    kind, the path of its file, line_start and both degree counters. The
    original docstring says these are used to label graph-analysis nodes.
    """
    file_rows = conn.execute("SELECT id, path FROM files").fetchall()
    symbol_sql = (
        "SELECT s.id AS id, s.name AS name, s.kind AS kind, f.path AS path, "
        " s.line_start AS line_start, "
        " s.in_degree AS in_degree, s.out_degree AS out_degree "
        "FROM symbols s JOIN files f ON f.id = s.file_id"
    )
    symbol_rows = conn.execute(symbol_sql).fetchall()
    return {"file": file_rows, "symbol": symbol_rows}
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def name_ref_counts(conn: sqlite3.Connection, names: Sequence[str]) -> dict[str, int]:
    """Count edges targeting each name (any resolution state), keyed by dst_name.

    Per the original docstring this is a damped centrality proxy for symbols
    whose precise in_degree is 0 because their name is not globally unique
    (ambiguous edges never resolve). It over-counts across same-named symbols
    by design and is only used as a weak tiebreak fallback.

    Args:
        conn: Open database connection.
        names: Names to count; duplicates and empty values are ignored.

    Returns:
        ``{dst_name: edge_count}`` for the requested names that occur at
        least once. Names with no edges are absent from the result.
    """
    uniq = [n for n in dict.fromkeys(names) if n]
    counts: dict[str, int] = {}
    # Query in batches so a very large name list cannot exceed SQLite's limit
    # on bound variables. Names are de-duplicated, so each name falls in
    # exactly one batch and batch results never need merging.
    for start in range(0, len(uniq), 500):
        batch = uniq[start : start + 500]
        placeholders = ",".join("?" * len(batch))
        rows = conn.execute(
            f"SELECT dst_name, COUNT(*) AS c FROM edges "
            f"WHERE dst_name IN ({placeholders}) GROUP BY dst_name",
            batch,
        )
        # Positional access works with both sqlite3.Row and plain tuples.
        for row in rows:
            counts[row[0]] = int(row[1])
    return counts
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def unique_symbol_ids_by_name(conn: sqlite3.Connection) -> dict[str, int]:
    """Map each symbol name that occurs exactly once in ``symbols`` to its row id.

    Rows are read by column name, so the connection's row factory must
    support that (e.g. ``sqlite3.Row``).
    """
    cursor = conn.execute(
        "SELECT name, MIN(id) AS sym_id FROM symbols GROUP BY name HAVING COUNT(*) = 1"
    )
    mapping: dict[str, int] = {}
    for record in cursor:
        mapping[record["name"]] = int(record["sym_id"])
    return mapping
|
|
476
|
+
|
|
477
|
+
|
|
478
|
+
def all_file_ids_with_paths(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return ``(id, path)`` for every row in ``files``."""
    cursor = conn.execute("SELECT id, path FROM files")
    return cursor.fetchall()
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def symbol_id_for_unique_name(conn: sqlite3.Connection, name: str) -> Optional[int]:
    """Return the id of the only symbol called *name*, or None if there are zero or several.

    The row is read by column name, so the connection's row factory must
    support that (e.g. ``sqlite3.Row``).
    """
    # LIMIT 2 is enough to tell "exactly one" apart from "more than one".
    matches = conn.execute(
        "SELECT id FROM symbols WHERE name = ? LIMIT 2", (name,)
    ).fetchall()
    if len(matches) != 1:
        return None
    return int(matches[0]["id"])
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def files_with_suffix(conn: sqlite3.Connection, suffix: str) -> list[sqlite3.Row]:
    """Return files whose path is *suffix* or ends with ``/`` + *suffix*.

    Args:
        conn: Open database connection.
        suffix: Trailing path to look for (e.g. ``pkg/mod.py``).

    Returns:
        ``(id, path)`` rows, shortest path first, ties broken by path.
    """
    # The suffix must match literally, so LIKE metacharacters are escaped.
    # Otherwise 'my_mod.py' also matches 'myXmod.py' because '_' is a
    # single-character wildcard.
    literal = suffix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return conn.execute(
        "SELECT id, path FROM files WHERE path = ? OR path LIKE ? ESCAPE '\\' "
        "ORDER BY length(path), path",
        (suffix, f"%/{literal}"),
    ).fetchall()
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def file_by_path(conn: sqlite3.Connection, path: str) -> Optional[sqlite3.Row]:
    """Return the ``(id, path)`` row for the file stored at *path*, or None."""
    cursor = conn.execute("SELECT id, path FROM files WHERE path = ?", (path,))
    return cursor.fetchone()
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def symbols_in_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row]:
    """Return ``(id, name, kind, line_start, in_degree)`` for a file's symbols, by line."""
    query = (
        "SELECT id, name, kind, line_start, in_degree FROM symbols "
        "WHERE file_id = ? ORDER BY line_start"
    )
    cursor = conn.execute(query, (file_id,))
    return cursor.fetchall()
|
|
506
|
+
|
|
507
|
+
|
|
508
|
+
def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]:
    """Return the resolved edges whose destination is the node ``(kind, node_id)``.

    Rows are ``(id, edge_type, src_kind, src_id, file_id, line, confidence)``.
    """
    query = (
        "SELECT id, edge_type, src_kind, src_id, file_id, line, confidence FROM edges "
        "WHERE resolved = 1 AND dst_kind = ? AND dst_id = ?"
    )
    cursor = conn.execute(query, (kind, node_id))
    return cursor.fetchall()
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def outgoing_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]:
    """Return the resolved edges whose source is the node ``(kind, node_id)``.

    Rows are ``(id, edge_type, dst_kind, dst_id, file_id, line, confidence)``.
    """
    query = (
        "SELECT id, edge_type, dst_kind, dst_id, file_id, line, confidence FROM edges "
        "WHERE resolved = 1 AND src_kind = ? AND src_id = ?"
    )
    cursor = conn.execute(query, (kind, node_id))
    return cursor.fetchall()
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def recompute_degrees(conn: sqlite3.Connection) -> None:
    """Recompute ``in_degree`` and ``out_degree`` for every symbol from resolved edges.

    ``out_degree`` counts resolved edges whose source is the symbol;
    ``in_degree`` counts resolved edges whose destination is the symbol.
    """
    update_sql = (
        "UPDATE symbols SET "
        "out_degree = (SELECT COUNT(*) FROM edges "
        " WHERE resolved = 1 AND src_kind = 'symbol' AND src_id = symbols.id), "
        "in_degree = (SELECT COUNT(*) FROM edges "
        " WHERE resolved = 1 AND dst_kind = 'symbol' AND dst_id = symbols.id)"
    )
    conn.execute(update_sql)
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def count_resolved_edges(conn: sqlite3.Connection) -> int:
    """Return the number of ``edges`` rows with ``resolved = 1``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM edges WHERE resolved = 1").fetchone()
    return int(total)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def ensure_vec_tables(conn: sqlite3.Connection, *, dim: int) -> None:
    """Create ``vec_chunks``, ``vec_meta`` and ``vec_cache`` if they do not exist.

    ``vec_chunks`` is a ``vec0`` virtual table (sqlite-vec, per the original
    docstring) holding one ``FLOAT[dim]`` embedding per chunk id. *dim* is
    coerced to ``int`` before being interpolated into the DDL; the original
    docstring notes it is fixed per build.
    """
    width = int(dim)
    vec_chunks_ddl = (
        f"CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0("
        f"chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{width}])"
    )
    vec_meta_ddl = "CREATE TABLE IF NOT EXISTS vec_meta (model TEXT, dim INTEGER, built_at TEXT)"
    # Content-addressed embedding cache. Per the original note, chunk ids
    # churn on every full rebuild (replace_chunks deletes and re-inserts), so
    # a store keyed only by chunk id would re-embed the whole repo each time.
    # Keyed by (model, content_sha), the cache survives that churn and lets
    # unchanged content reuse its vector.
    vec_cache_ddl = (
        "CREATE TABLE IF NOT EXISTS vec_cache ("
        "model TEXT NOT NULL, content_sha TEXT NOT NULL, embedding BLOB NOT NULL, "
        "PRIMARY KEY (model, content_sha))"
    )
    for ddl in (vec_chunks_ddl, vec_meta_ddl, vec_cache_ddl):
        conn.execute(ddl)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def set_vec_meta(conn: sqlite3.Connection, *, model: str, dim: int, built_at: str) -> None:
    """Replace the contents of ``vec_meta`` with a single ``(model, dim, built_at)`` row."""
    record = (model, int(dim), built_at)
    # Clear first so the table never holds more than one row.
    conn.execute("DELETE FROM vec_meta")
    conn.execute("INSERT INTO vec_meta (model, dim, built_at) VALUES (?,?,?)", record)
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def get_vec_meta(conn: sqlite3.Connection) -> "Optional[sqlite3.Row]":
    """Return one ``(model, dim, built_at)`` row from ``vec_meta``, or None if it is empty."""
    cursor = conn.execute("SELECT model, dim, built_at FROM vec_meta LIMIT 1")
    return cursor.fetchone()
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def chunks_for_embedding(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return ``(id, content)`` for every chunk, ordered by id."""
    cursor = conn.execute("SELECT id, content FROM chunks ORDER BY id")
    return cursor.fetchall()
|
|
570
|
+
|
|
571
|
+
|
|
572
|
+
def upsert_chunk_vector(
    conn: sqlite3.Connection, chunk_id: int, embedding: list[float]
) -> None:
    """Serialize *embedding* as float32 and store it as the vector for *chunk_id*."""
    # Imported here rather than at module level, so the module itself loads
    # without sqlite_vec installed.
    import sqlite_vec  # type: ignore[import-untyped]

    blob = sqlite_vec.serialize_float32(embedding)
    upsert_chunk_vector_blob(conn, chunk_id, blob)
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def upsert_chunk_vector_blob(conn: sqlite3.Connection, chunk_id: int, blob: bytes) -> None:
    """Store an already-serialized float32 embedding blob for one chunk.

    The original docstring calls this the cache-reuse path. It delegates to
    the batch writer with a single-item batch.
    """
    single_item = [(chunk_id, blob)]
    upsert_chunk_vector_blobs(conn, single_item)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def upsert_chunk_vector_blobs(
    conn: sqlite3.Connection, items: Sequence[tuple[int, bytes]]
) -> None:
    """Write many ``(chunk_id, blob)`` embeddings, replacing any existing vectors.

    Existing ``vec_chunks`` rows for the given chunk ids are deleted first,
    then the new rows are inserted: one ``executemany`` per statement.
    An empty *items* is a no-op.
    """
    if not items:
        return
    keyed = [(int(chunk_id), blob) for chunk_id, blob in items]
    conn.executemany(
        "DELETE FROM vec_chunks WHERE chunk_id = ?",
        [(chunk_id,) for chunk_id, _ in keyed],
    )
    conn.executemany(
        "INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?, ?)",
        keyed,
    )
|
|
598
|
+
|
|
599
|
+
|
|
600
|
+
def cached_embeddings(
    conn: sqlite3.Connection, *, model: str, shas: Iterable[str]
) -> dict[str, bytes]:
    """Return ``{content_sha: embedding blob}`` for the *shas* cached under *model*.

    Duplicate shas are queried once. Shas with no cache entry are absent from
    the result.
    """
    wanted = list(dict.fromkeys(shas))
    found: dict[str, bytes] = {}
    if not wanted:
        return found

    # Query in slices of 500 so the IN list stays well under SQLite's bound
    # variable limit on huge repos.
    batch_size = 500
    offset = 0
    while offset < len(wanted):
        batch = wanted[offset : offset + batch_size]
        offset += batch_size
        placeholders = ",".join("?" * len(batch))
        cursor = conn.execute(
            f"SELECT content_sha, embedding FROM vec_cache "
            f"WHERE model = ? AND content_sha IN ({placeholders})",
            (model, *batch),
        )
        for record in cursor.fetchall():
            found[record[0]] = record[1]
    return found
|
|
620
|
+
|
|
621
|
+
|
|
622
|
+
def store_cached_embeddings(
    conn: sqlite3.Connection, *, model: str, items: Sequence[tuple[str, bytes]]
) -> None:
    """Store ``(content_sha, blob)`` pairs in ``vec_cache`` under *model*.

    An existing entry for the same ``(model, content_sha)`` is replaced.
    An empty *items* is a no-op.
    """
    if not items:
        return
    records = []
    for content_sha, blob in items:
        records.append((model, content_sha, blob))
    conn.executemany(
        "INSERT OR REPLACE INTO vec_cache (model, content_sha, embedding) VALUES (?, ?, ?)",
        records,
    )
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
def clear_vectors(conn: sqlite3.Connection) -> None:
    """Delete every row from ``vec_chunks``."""
    wipe_sql = "DELETE FROM vec_chunks"
    conn.execute(wipe_sql)
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
def count_vectors(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``vec_chunks``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM vec_chunks").fetchone()
    return int(total)
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
def embedded_chunk_ids(conn: sqlite3.Connection) -> set[int]:
    """Return the chunk ids that have a row in ``vec_chunks``.

    Returns an empty set when the query raises ``sqlite3.OperationalError``,
    which the original comment attributes to the vec tables not having been
    created yet (embeddings never enabled).
    """
    try:
        ids: set[int] = set()
        for record in conn.execute("SELECT chunk_id FROM vec_chunks").fetchall():
            ids.add(int(record[0]))
        return ids
    except sqlite3.OperationalError:
        return set()
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def prune_orphan_vectors(conn: sqlite3.Connection) -> int:
    """Delete ``vec_chunks`` rows whose chunk id is no longer in ``chunks``.

    Returns:
        The number of vector rows deleted, or 0 when a query raises
        ``sqlite3.OperationalError``. The original comment attributes that
        error to the vec tables not having been created yet.
    """
    try:
        live_ids = set()
        for record in conn.execute("SELECT id FROM chunks").fetchall():
            live_ids.add(record[0])

        stale = []
        for record in conn.execute("SELECT chunk_id FROM vec_chunks").fetchall():
            if record[0] in live_ids:
                continue
            stale.append((record[0],))

        if stale:
            conn.executemany("DELETE FROM vec_chunks WHERE chunk_id = ?", stale)
        return len(stale)
    except sqlite3.OperationalError:
        return 0
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def path_mtimes(conn: sqlite3.Connection) -> dict[str, int]:
    """Map each path in ``files`` to its stored ``mtime_ns``.

    Rows are read by column name, so the connection's row factory must
    support that (e.g. ``sqlite3.Row``).
    """
    mtimes: dict[str, int] = {}
    for record in conn.execute("SELECT path, mtime_ns FROM files").fetchall():
        mtimes[record["path"]] = int(record["mtime_ns"])
    return mtimes
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def fingerprints(conn: sqlite3.Connection) -> dict[str, tuple[int, int, str]]:
    """Map each path in ``files`` to its ``(mtime_ns, size_bytes, sha256)`` triple.

    The original docstring says this supports incremental update. Rows are
    read by column name, so the connection's row factory must support that
    (e.g. ``sqlite3.Row``).
    """
    cursor = conn.execute("SELECT path, mtime_ns, size_bytes, sha256 FROM files")
    prints: dict[str, tuple[int, int, str]] = {}
    for record in cursor.fetchall():
        prints[record["path"]] = (
            int(record["mtime_ns"]),
            int(record["size_bytes"]),
            record["sha256"],
        )
    return prints
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
def vector_search(
    conn: sqlite3.Connection, query_embedding: list[float], *, limit: int
) -> list[sqlite3.Row]:
    """Run a KNN query over ``vec_chunks`` and join the hits back to chunks/files.

    *query_embedding* is serialized as float32 and matched against the stored
    embeddings with ``k`` set to *limit*. Rows carry the chunk id, distance,
    file path, line range, content and token estimate, ordered by distance.
    """
    # Imported here rather than at module level, so the module itself loads
    # without sqlite_vec installed.
    import sqlite_vec  # type: ignore[import-untyped]

    knn_sql = (
        "SELECT v.chunk_id AS chunk_id, v.distance AS distance, f.path AS path, "
        " c.line_start AS line_start, c.line_end AS line_end, "
        " c.content AS content, c.token_est AS token_est "
        "FROM vec_chunks v "
        "JOIN chunks c ON c.id = v.chunk_id "
        "JOIN files f ON f.id = c.file_id "
        "WHERE v.embedding MATCH ? AND k = ? "
        "ORDER BY v.distance"
    )
    query_blob = sqlite_vec.serialize_float32(query_embedding)
    cursor = conn.execute(knn_sql, (query_blob, int(limit)))
    return cursor.fetchall()
|