codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,701 @@
1
+ """Typed read/write accessors. All SQL lives here."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import sqlite3
7
+ from typing import Any, Iterable, Optional, Sequence
8
+
9
+ from ..parsers.base import Chunk, Symbol
10
+
11
+
12
def upsert_file(
    conn: sqlite3.Connection,
    *,
    path: str,
    lang: Optional[str],
    size_bytes: int,
    sha256: str,
    mtime_ns: int,
    git_status: Optional[str],
    parser: str,
    indexed_at: str,
    is_generated: bool,
) -> int:
    """Create the ``files`` row for ``path`` or overwrite the existing one.

    Rows are keyed by the repo-relative ``path``. When a row with that path
    already exists, every other column is replaced with the supplied value.

    Returns:
        The ``id`` of the inserted or updated row.
    """
    statement = """
        INSERT INTO files
            (path, lang, size_bytes, sha256, mtime_ns, git_status, parser, indexed_at, is_generated)
        VALUES
            (:path, :lang, :size_bytes, :sha256, :mtime_ns, :git_status, :parser, :indexed_at, :is_generated)
        ON CONFLICT(path) DO UPDATE SET
            lang = excluded.lang,
            size_bytes = excluded.size_bytes,
            sha256 = excluded.sha256,
            mtime_ns = excluded.mtime_ns,
            git_status = excluded.git_status,
            parser = excluded.parser,
            indexed_at = excluded.indexed_at,
            is_generated = excluded.is_generated
        RETURNING id
    """
    bindings = dict(
        path=path,
        lang=lang,
        size_bytes=size_bytes,
        sha256=sha256,
        mtime_ns=mtime_ns,
        git_status=git_status,
        parser=parser,
        indexed_at=indexed_at,
        # Stored as an integer flag (0/1).
        is_generated=int(bool(is_generated)),
    )
    (file_id,) = conn.execute(statement, bindings).fetchone()
    return int(file_id)
56
+
57
+
58
def get_file(conn: sqlite3.Connection, path: str) -> Optional[sqlite3.Row]:
    """Return the full ``files`` row whose path equals ``path``, or ``None``."""
    cursor = conn.execute("SELECT * FROM files WHERE path = ?", (path,))
    return cursor.fetchone()
60
+
61
+
62
def all_paths(conn: sqlite3.Connection) -> set[str]:
    """Return the set of every ``path`` value stored in ``files``."""
    cursor = conn.execute("SELECT path FROM files")
    return {path for (path,) in cursor}
64
+
65
+
66
def delete_files(conn: sqlite3.Connection, paths: Iterable[str]) -> int:
    """Delete the ``files`` rows for the given paths.

    Args:
        conn: Open database connection.
        paths: Repo-relative paths to remove; consumed once. Paths that are not
            in the table are ignored.

    Returns:
        The number of ``files`` rows actually deleted.
    """
    targets = [(p,) for p in paths]
    if not targets:
        return 0
    cursor = conn.executemany("DELETE FROM files WHERE path = ?", targets)
    # rowcount sums only the rows removed directly by each DELETE. The previous
    # conn.total_changes delta also counted rows removed by foreign-key
    # cascades, which inflated the reported number of files.
    return cursor.rowcount
73
+
74
+
75
def count_files(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``files``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM files").fetchone()
    return int(total)
77
+
78
+
79
def get_meta(conn: sqlite3.Connection, key: str) -> Optional[str]:
    """Return the ``meta`` value stored under ``key``, or ``None`` if absent."""
    found = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
    if found is None:
        return None
    return found[0]
82
+
83
+
84
def set_meta(conn: sqlite3.Connection, key: str, value: str) -> None:
    """Store ``value`` under ``key`` in ``meta``, overwriting any existing value."""
    statement = (
        "INSERT INTO meta(key, value) VALUES (?, ?) "
        "ON CONFLICT(key) DO UPDATE SET value = excluded.value"
    )
    conn.execute(statement, (key, value))
90
+
91
+
92
def replace_chunks(
    conn: sqlite3.Connection,
    file_id: int,
    chunks: Sequence[Chunk],
    symbol_ids: Optional[Sequence[int]] = None,
) -> int:
    """Drop every chunk of ``file_id`` and insert ``chunks`` in their place.

    A chunk is linked to a symbol only when it carries a ``symbol_index`` and
    ``symbol_ids`` is given; the index is then looked up in ``symbol_ids``.

    Returns:
        The number of chunks passed in.
    """
    conn.execute("DELETE FROM chunks WHERE file_id = ?", (file_id,))

    # symbol_names is denormalized into the chunk (see schema.sql): the name is
    # resolved from the symbol row that replace_symbols inserted beforehand, and
    # stored so the FTS triggers can replay it verbatim on delete/update.
    insert_sql = """
        INSERT INTO chunks
            (file_id, line_start, line_end, kind, symbol_id, content, token_est, symbol_names)
        VALUES
            (?, ?, ?, ?, ?, ?, ?, COALESCE((SELECT name FROM symbols WHERE id = ?), ''))
    """
    rows = []
    for chunk in chunks:
        linked: Optional[int] = None
        if chunk.symbol_index is not None and symbol_ids is not None:
            linked = symbol_ids[chunk.symbol_index]
        rows.append(
            (
                file_id,
                chunk.line_start,
                chunk.line_end,
                chunk.kind,
                linked,
                chunk.content,
                chunk.token_est,
                # Bound a second time for the symbol-name subquery.
                linked,
            )
        )
    conn.executemany(insert_sql, rows)
    return len(chunks)
130
+
131
+
132
def append_chunks(
    conn: sqlite3.Connection,
    file_id: int,
    chunks: Sequence[Chunk],
) -> int:
    """Insert ``chunks`` for ``file_id`` while leaving existing chunks in place.

    Used for doc chunks. The inserted rows get a NULL ``symbol_id``.

    Returns:
        The number of chunks passed in.
    """
    insert_sql = """
        INSERT INTO chunks
            (file_id, line_start, line_end, kind, symbol_id, content, token_est)
        VALUES
            (?, ?, ?, ?, NULL, ?, ?)
    """
    rows = []
    for chunk in chunks:
        rows.append(
            (
                file_id,
                chunk.line_start,
                chunk.line_end,
                chunk.kind,
                chunk.content,
                chunk.token_est,
            )
        )
    conn.executemany(insert_sql, rows)
    return len(chunks)
158
+
159
+
160
def chunks_for_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row]:
    """Return all chunk rows of ``file_id``, ordered by ``line_start``."""
    query = "SELECT * FROM chunks WHERE file_id = ? ORDER BY line_start"
    cursor = conn.execute(query, (file_id,))
    return cursor.fetchall()
164
+
165
+
166
def count_chunks(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``chunks``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()
    return int(total)
168
+
169
+
170
def replace_symbols(
    conn: sqlite3.Connection, file_id: int, symbols: Sequence[Symbol]
) -> list[int]:
    """Drop every symbol of ``file_id`` and insert ``symbols`` in their place.

    Rows are inserted with a NULL parent and zero in/out degree. A second pass
    then sets ``parent_id`` for each symbol that has a ``parent_index``, using
    the id of the symbol at that position in ``symbols``.

    Returns:
        The new row ids, in the same order as ``symbols``.
    """
    conn.execute("DELETE FROM symbols WHERE file_id = ?", (file_id,))

    insert_sql = """
        INSERT INTO symbols
            (file_id, name, qualified, kind, line_start, line_end, signature,
             parent_id, docstring, in_degree, out_degree)
        VALUES (?, ?, ?, ?, ?, ?, ?, NULL, ?, 0, 0)
    """
    ids: list[int] = []
    for sym in symbols:
        cursor = conn.execute(
            insert_sql,
            (
                file_id,
                sym.name,
                sym.qualified,
                sym.kind,
                sym.line_start,
                sym.line_end,
                sym.signature,
                sym.docstring,
            ),
        )
        assert cursor.lastrowid is not None
        ids.append(int(cursor.lastrowid))

    # Parents are linked only after all inserts, once every id is known.
    for position, sym in enumerate(symbols):
        if sym.parent_index is None:
            continue
        conn.execute(
            "UPDATE symbols SET parent_id = ? WHERE id = ?",
            (ids[sym.parent_index], ids[position]),
        )
    return ids
203
+
204
+
205
def symbols_by_name(
    conn: sqlite3.Connection,
    name: str,
    *,
    kind: Optional[str] = None,
    exact: bool = True,
) -> list[sqlite3.Row]:
    """Return symbol rows (plus the owning file's ``path``) matching ``name``.

    Args:
        conn: Open database connection.
        name: Symbol name to look up.
        kind: When truthy, only symbols of this kind are returned.
        exact: ``True`` matches ``name`` with ``=``; ``False`` matches names
            that start with ``name`` using ``LIKE``.

    Returns:
        Matching rows ordered by symbol name, file path, then ``line_start``.
    """
    if exact:
        name_clause = "s.name = ?"
        pattern = name
    else:
        # Escape LIKE metacharacters so the name is matched literally.
        # Identifiers routinely contain "_", which LIKE would otherwise treat
        # as "any single character" (e.g. "get_" matching "getXfile").
        literal = name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        name_clause = "s.name LIKE ? ESCAPE '\\'"
        pattern = f"{literal}%"

    sql = (
        "SELECT s.*, f.path AS path "
        "FROM symbols s JOIN files f ON f.id = s.file_id "
        f"WHERE {name_clause}"
    )
    params: list[Any] = [pattern]
    if kind:
        sql += " AND s.kind = ?"
        params.append(kind)
    sql += " ORDER BY s.name, f.path, s.line_start"
    return conn.execute(sql, params).fetchall()
223
+
224
+
225
def count_symbols(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``symbols``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM symbols").fetchone()
    return int(total)
227
+
228
+
229
def treesitter_coverage(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return one ``(lang, files, symbols)`` row per language, tree-sitter files only.

    Powers Guardrail 2: a tree-sitter language that has files but roughly zero
    symbols is a yellow flag (silent extraction failure), surfaced by `doctor`.
    Rows are ordered by file count, largest first.
    """
    query = """
        SELECT f.lang AS lang,
               COUNT(DISTINCT f.id) AS files,
               COUNT(s.id) AS symbols
        FROM files f
        LEFT JOIN symbols s ON s.file_id = f.id
        WHERE f.parser = 'treesitter'
        GROUP BY f.lang
        ORDER BY files DESC
    """
    cursor = conn.execute(query)
    return cursor.fetchall()
247
+
248
+
249
def replace_edges(
    conn: sqlite3.Connection, file_id: int, edges: Sequence[dict[str, Any]]
) -> int:
    """Drop every edge recorded for ``file_id`` and insert ``edges`` in their place.

    Each edge dict supplies the named columns of the INSERT. ``file_id`` is
    always taken from the argument, overriding any value in the dict.

    Returns:
        The number of edges passed in.
    """
    conn.execute("DELETE FROM edges WHERE file_id = ?", (file_id,))

    insert_sql = """
        INSERT INTO edges
            (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line,
             resolved, confidence)
        VALUES
            (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line,
             :resolved, :confidence)
    """
    rows = []
    for edge in edges:
        # confidence defaults to 'extracted' for callers (and tests) that predate
        # the audit-trail column; the global graph pass refines it (see
        # graph/builder.py). A fresh dict is built so the caller's is not mutated.
        row: dict[str, Any] = {"confidence": "extracted"}
        row.update(edge)
        row["file_id"] = file_id
        rows.append(row)
    conn.executemany(insert_sql, rows)
    return len(edges)
267
+
268
+
269
def count_edges(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``edges``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM edges").fetchone()
    return int(total)
271
+
272
+
273
def refs_for_name(conn: sqlite3.Connection, name: str) -> list[sqlite3.Row]:
    """Return every 'call' edge whose ``dst_name`` equals ``name``.

    Each row carries the edge's line, file path, type, resolution flag, source
    id/kind and confidence. ``src_name`` / ``src_qualified`` come from the
    source symbol and are NULL when the source is not a symbol. Rows are
    ordered by path, then line.
    """
    query = """
        SELECT e.line AS line, f.path AS path, e.edge_type AS edge_type,
               e.resolved AS resolved, e.src_id AS src_id, e.src_kind AS src_kind,
               e.confidence AS confidence,
               src.name AS src_name, src.qualified AS src_qualified
        FROM edges e
        JOIN files f ON f.id = e.file_id
        LEFT JOIN symbols src ON src.id = e.src_id AND e.src_kind = 'symbol'
        WHERE e.dst_name = ? AND e.edge_type = 'call'
        ORDER BY f.path, e.line
    """
    cursor = conn.execute(query, (name,))
    return cursor.fetchall()
288
+
289
+
290
def fts_search(
    conn: sqlite3.Connection, match_query: str, *, limit: int
) -> list[sqlite3.Row]:
    """Run ``match_query`` against ``fts_chunks`` and return up to ``limit`` rows.

    A blank (whitespace-only) query returns an empty list without touching the
    database. Rows are joined to their chunk and file and ordered by ascending
    ``bm25(fts_chunks)``.
    """
    if match_query.strip() == "":
        return []
    query = """
        SELECT c.id AS chunk_id,
               f.path AS path,
               c.line_start AS line_start,
               c.line_end AS line_end,
               c.content AS content,
               c.token_est AS token_est,
               bm25(fts_chunks) AS bm25,
               c.kind AS kind
        FROM fts_chunks
        JOIN chunks c ON c.id = fts_chunks.rowid
        JOIN files f ON f.id = c.file_id
        WHERE fts_chunks MATCH ?
        ORDER BY bm25(fts_chunks)
        LIMIT ?
    """
    cursor = conn.execute(query, (match_query, limit))
    return cursor.fetchall()
314
+
315
+
316
def path_search(
    conn: sqlite3.Connection, query: str, *, limit: int
) -> list[sqlite3.Row]:
    """Match files whose path contains query tokens. Score = number of tokens hit.

    ``query`` is split into tokens on whitespace, ``/``, ``.`` and backslash.
    A file matches when its path contains at least one token.

    Args:
        conn: Open database connection.
        query: Free-form path fragment(s).
        limit: Maximum number of rows to return.

    Returns:
        Rows of ``(file_id, path, mtime_ns, is_generated, hits)`` ordered by
        ``hits`` descending, then by shorter path. Empty when the query has no
        tokens.
    """
    tokens = [t for t in re.split(r"[\s/.\\]+", query.strip()) if t]
    if not tokens:
        return []

    # Escape LIKE metacharacters so each token is matched literally. Without
    # this, "_" (common in file names) matches any single character and "%"
    # matches everything.
    like_args = [
        "%" + t.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_") + "%"
        for t in tokens
    ]
    predicate = "path LIKE ? ESCAPE '\\'"
    score_expr = " + ".join(f"({predicate})" for _ in tokens)
    where_expr = " OR ".join(predicate for _ in tokens)
    return conn.execute(
        f"""
        SELECT id AS file_id, path, mtime_ns, is_generated,
               ({score_expr}) AS hits
        FROM files
        WHERE {where_expr}
        ORDER BY hits DESC, length(path) ASC
        LIMIT ?
        """,
        # The patterns are bound twice: once for the score, once for the filter.
        (*like_args, *like_args, limit),
    ).fetchall()
336
+
337
+
338
def symbol_search(
    conn: sqlite3.Connection,
    name: str,
    *,
    limit: int,
    kind: Optional[str] = None,
    exact: bool = False,
) -> list[sqlite3.Row]:
    """Symbol lookup: exact name first, then prefix, then substring (fuzzy).

    Args:
        conn: Open database connection.
        name: Symbol name or fragment; surrounding whitespace is stripped.
        limit: Maximum number of rows to return.
        kind: When truthy, only symbols of this kind are returned.
        exact: When ``True``, only case-insensitive exact name matches are
            returned; otherwise prefix and substring matches are included.

    Returns:
        Rows ordered by exact match first, then prefix match, then
        ``in_degree`` descending. Empty when ``name`` is blank.
    """
    name = name.strip()
    if not name:
        return []

    # Escape LIKE metacharacters so the prefix/substring patterns match the
    # name literally. "_" is ubiquitous in identifiers and would otherwise act
    # as a single-character wildcard.
    literal = name.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    prefix_match = "s.name LIKE :prefix COLLATE NOCASE ESCAPE '\\'"
    sub_match = "s.name LIKE :sub COLLATE NOCASE ESCAPE '\\'"

    kind_clause = "AND s.kind = :kind" if kind else ""
    if exact:
        name_clause = "s.name = :exact COLLATE NOCASE"
    else:
        name_clause = (
            "(s.name = :exact COLLATE NOCASE "
            f"OR {prefix_match} "
            f"OR {sub_match})"
        )
    return conn.execute(
        f"""
        SELECT s.name, s.kind, s.signature, s.line_start, s.line_end,
               s.in_degree, s.out_degree, f.path, f.mtime_ns, f.is_generated,
               (s.name = :exact COLLATE NOCASE) AS is_exact
        FROM symbols s
        JOIN files f ON f.id = s.file_id
        WHERE {name_clause} {kind_clause}
        ORDER BY is_exact DESC,
                 ({prefix_match}) DESC,
                 s.in_degree DESC
        LIMIT :limit
        """,
        {
            "exact": name,
            "prefix": f"{literal}%",
            "sub": f"%{literal}%",
            "kind": kind,
            "limit": limit,
        },
    ).fetchall()
377
+
378
+
379
def unresolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return ``(id, edge_type, dst_name, lang)`` for unresolved edges that name a target.

    ``lang`` is the language of the file the edge was recorded in. Rows are
    ordered by edge id.
    """
    query = (
        "SELECT e.id AS id, e.edge_type AS edge_type, e.dst_name AS dst_name, "
        " f.lang AS lang "
        "FROM edges e JOIN files f ON f.id = e.file_id "
        "WHERE e.resolved = 0 AND e.dst_name IS NOT NULL ORDER BY e.id"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
386
+
387
+
388
def resolve_edge(conn: sqlite3.Connection, edge_id: int, dst_kind: str, dst_id: int) -> None:
    """Point edge ``edge_id`` at ``(dst_kind, dst_id)`` and mark it resolved."""
    statement = "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1 WHERE id = ?"
    bindings = (dst_kind, dst_id, edge_id)
    conn.execute(statement, bindings)
393
+
394
+
395
def resolve_edges_bulk(
    conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int, str]]
) -> None:
    """Apply (dst_kind, dst_id, edge_id, confidence) resolutions in one executemany.

    confidence records *how* the target was found: 'extracted' for an exact match
    (a repo-unique symbol name), 'inferred' for a heuristic (import path-suffix).
    """
    statement = (
        "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1, confidence = ? WHERE id = ?"
    )
    # Reorder each tuple to match the statement's placeholder order.
    bindings = [
        (kind, target_id, how, edge_id)
        for (kind, target_id, edge_id, how) in resolutions
    ]
    conn.executemany(statement, bindings)
407
+
408
+
409
def mark_ambiguous_edges(conn: sqlite3.Connection) -> int:
    """Flag every still-unresolved edge that names a target as 'ambiguous'.

    Run after the global resolution pass: an edge with a dst_name that no unique
    symbol/file claims is one we could not pin down (a non-unique name, or an import
    of code outside the repo). Marking it keeps refs/impact honest — an empty or
    short answer over ambiguous edges is inconclusive, not proof of "no callers".

    Returns:
        The number of edges newly marked; edges already 'ambiguous' are skipped.
    """
    # IS NOT is NULL-safe: with "!=", a NULL confidence makes the comparison
    # NULL and the edge would never be flagged.
    cur = conn.execute(
        "UPDATE edges SET confidence = 'ambiguous' "
        "WHERE resolved = 0 AND dst_name IS NOT NULL AND confidence IS NOT 'ambiguous'"
    )
    return cur.rowcount if cur.rowcount is not None else 0
422
+
423
+
424
def all_resolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Every resolved edge as (src_kind, src_id, dst_kind, dst_id, edge_type, confidence).

    The in-memory adjacency the graph analysis (communities / god nodes / bridges)
    is built from. Unresolved edges are skipped — they have no concrete endpoint.
    """
    query = (
        "SELECT src_kind, src_id, dst_kind, dst_id, edge_type, confidence FROM edges "
        "WHERE resolved = 1 AND dst_id IS NOT NULL"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
434
+
435
+
436
def all_graph_nodes(conn: sqlite3.Connection) -> dict[str, list[sqlite3.Row]]:
    """File and symbol rows keyed by kind, for labelling graph-analysis nodes.

    ``"file"`` maps to ``(id, path)`` rows; ``"symbol"`` maps to rows carrying
    the symbol's id, name, kind, file path, start line and in/out degree.
    """
    file_rows = conn.execute("SELECT id, path FROM files").fetchall()
    symbol_query = (
        "SELECT s.id AS id, s.name AS name, s.kind AS kind, f.path AS path, "
        " s.line_start AS line_start, "
        " s.in_degree AS in_degree, s.out_degree AS out_degree "
        "FROM symbols s JOIN files f ON f.id = s.file_id"
    )
    symbol_rows = conn.execute(symbol_query).fetchall()
    return {"file": file_rows, "symbol": symbol_rows}
447
+
448
+
449
def name_ref_counts(conn: sqlite3.Connection, names: Sequence[str]) -> dict[str, int]:
    """Count edges targeting each name (any resolution state), keyed by dst_name.

    A damped centrality proxy for symbols whose precise in_degree is 0 because their
    name is not globally unique (ambiguous edges never resolve). Over-counts across
    same-named symbols by design — it is only used as a weak tiebreak fallback.

    Args:
        conn: Open database connection.
        names: Names to count; duplicates and empty values are ignored.

    Returns:
        ``{dst_name: edge count}`` for the names that have at least one edge.
        Names with no edges are absent.
    """
    uniq = [n for n in dict.fromkeys(names) if n]
    counts: dict[str, int] = {}
    # Chunk the IN list to stay well under SQLite's variable limit on huge
    # repos. Batches hold disjoint names, so the per-batch results never
    # overlap.
    for start in range(0, len(uniq), 500):
        batch = uniq[start : start + 500]
        placeholders = ",".join("?" * len(batch))
        rows = conn.execute(
            f"SELECT dst_name, COUNT(*) AS c FROM edges "
            f"WHERE dst_name IN ({placeholders}) GROUP BY dst_name",
            tuple(batch),
        ).fetchall()
        for row in rows:
            counts[row[0]] = int(row[1])
    return counts
466
+
467
+
468
def unique_symbol_ids_by_name(conn: sqlite3.Connection) -> dict[str, int]:
    """Map symbol name -> id for names defined exactly once in the repo."""
    cursor = conn.execute(
        "SELECT name, MIN(id) AS sym_id FROM symbols GROUP BY name HAVING COUNT(*) = 1"
    )
    unique: dict[str, int] = {}
    for row in cursor:
        unique[row["name"]] = int(row["sym_id"])
    return unique
476
+
477
+
478
def all_file_ids_with_paths(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return an ``(id, path)`` row for every entry in ``files``."""
    cursor = conn.execute("SELECT id, path FROM files")
    return cursor.fetchall()
480
+
481
+
482
def symbol_id_for_unique_name(conn: sqlite3.Connection, name: str) -> Optional[int]:
    """Return the id of the only symbol named ``name``.

    Returns ``None`` when no symbol, or more than one symbol, has that name.
    """
    # LIMIT 2 is enough to tell "exactly one" apart from "several".
    matches = conn.execute(
        "SELECT id FROM symbols WHERE name = ? LIMIT 2", (name,)
    ).fetchall()
    if len(matches) != 1:
        return None
    return int(matches[0]["id"])
487
+
488
+
489
def files_with_suffix(conn: sqlite3.Connection, suffix: str) -> list[sqlite3.Row]:
    """Return ``(id, path)`` rows whose path is ``suffix`` or ends with ``/suffix``.

    Args:
        conn: Open database connection.
        suffix: Trailing path fragment, matched on a ``/`` boundary.

    Returns:
        Matching rows ordered by path length (shortest first), then path.
    """
    # Escape LIKE metacharacters so the suffix is matched literally. Otherwise
    # "_" matches any character and e.g. "my_mod.py" would also select
    # "myXmod.py".
    literal = suffix.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return conn.execute(
        "SELECT id, path FROM files WHERE path = ? OR path LIKE ? ESCAPE '\\' "
        "ORDER BY length(path), path",
        (suffix, f"%/{literal}"),
    ).fetchall()
494
+
495
+
496
def file_by_path(conn: sqlite3.Connection, path: str) -> Optional[sqlite3.Row]:
    """Return the ``(id, path)`` row for ``path``, or ``None`` if it is not indexed."""
    cursor = conn.execute("SELECT id, path FROM files WHERE path = ?", (path,))
    return cursor.fetchone()
498
+
499
+
500
def symbols_in_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row]:
    """Return ``(id, name, kind, line_start, in_degree)`` rows for ``file_id``.

    Rows are ordered by ``line_start``.
    """
    query = (
        "SELECT id, name, kind, line_start, in_degree FROM symbols "
        "WHERE file_id = ? ORDER BY line_start"
    )
    cursor = conn.execute(query, (file_id,))
    return cursor.fetchall()
506
+
507
+
508
def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]:
    """Return the resolved edges whose destination is the node ``(kind, node_id)``.

    Each row carries the edge's id, type, source kind/id, file id, line and
    confidence.
    """
    query = (
        "SELECT id, edge_type, src_kind, src_id, file_id, line, confidence FROM edges "
        "WHERE resolved = 1 AND dst_kind = ? AND dst_id = ?"
    )
    cursor = conn.execute(query, (kind, node_id))
    return cursor.fetchall()
514
+
515
+
516
def outgoing_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]:
    """Return the resolved edges whose source is the node ``(kind, node_id)``.

    Each row carries the edge's id, type, destination kind/id, file id, line
    and confidence.
    """
    query = (
        "SELECT id, edge_type, dst_kind, dst_id, file_id, line, confidence FROM edges "
        "WHERE resolved = 1 AND src_kind = ? AND src_id = ?"
    )
    cursor = conn.execute(query, (kind, node_id))
    return cursor.fetchall()
522
+
523
+
524
def recompute_degrees(conn: sqlite3.Connection) -> None:
    """Recalculate ``in_degree`` and ``out_degree`` for every symbol.

    Only resolved edges are counted: ``out_degree`` counts edges whose source
    is the symbol, ``in_degree`` counts edges whose destination is the symbol.
    """
    statement = (
        "UPDATE symbols SET "
        "out_degree = (SELECT COUNT(*) FROM edges "
        " WHERE resolved = 1 AND src_kind = 'symbol' AND src_id = symbols.id), "
        "in_degree = (SELECT COUNT(*) FROM edges "
        " WHERE resolved = 1 AND dst_kind = 'symbol' AND dst_id = symbols.id)"
    )
    conn.execute(statement)
532
+
533
+
534
def count_resolved_edges(conn: sqlite3.Connection) -> int:
    """Return the number of edges with ``resolved = 1``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM edges WHERE resolved = 1").fetchone()
    return int(total)
536
+
537
+
538
def ensure_vec_tables(conn: sqlite3.Connection, *, dim: int) -> None:
    """Create vec_chunks (sqlite-vec) + vec_meta + vec_cache if absent. dim is fixed per build.

    ``dim`` is coerced with ``int()`` before it is interpolated into the DDL.
    """
    width = int(dim)
    statements = (
        f"CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0("
        f"chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{width}])",
        "CREATE TABLE IF NOT EXISTS vec_meta (model TEXT, dim INTEGER, built_at TEXT)",
        # Content-addressed embedding cache: chunk ids churn on every full rebuild
        # (replace_chunks deletes + re-inserts), so a chunk-id keyed store alone
        # would re-embed the whole repo each time. Keyed by (model, content_sha)
        # the cache survives that churn and lets unchanged content reuse its
        # vector for free.
        "CREATE TABLE IF NOT EXISTS vec_cache ("
        "model TEXT NOT NULL, content_sha TEXT NOT NULL, embedding BLOB NOT NULL, "
        "PRIMARY KEY (model, content_sha))",
    )
    for ddl in statements:
        conn.execute(ddl)
555
+
556
+
557
def set_vec_meta(conn: sqlite3.Connection, *, model: str, dim: int, built_at: str) -> None:
    """Replace the contents of ``vec_meta`` with a single (model, dim, built_at) row."""
    conn.execute("DELETE FROM vec_meta")
    record = (model, int(dim), built_at)
    conn.execute("INSERT INTO vec_meta (model, dim, built_at) VALUES (?,?,?)", record)
562
+
563
+
564
def get_vec_meta(conn: sqlite3.Connection) -> "Optional[sqlite3.Row]":
    """Return one ``(model, dim, built_at)`` row from ``vec_meta``, or ``None`` if empty."""
    cursor = conn.execute("SELECT model, dim, built_at FROM vec_meta LIMIT 1")
    return cursor.fetchone()
566
+
567
+
568
def chunks_for_embedding(conn: sqlite3.Connection) -> list[sqlite3.Row]:
    """Return ``(id, content)`` for every chunk, ordered by id."""
    cursor = conn.execute("SELECT id, content FROM chunks ORDER BY id")
    return cursor.fetchall()
570
+
571
+
572
def upsert_chunk_vector(
    conn: sqlite3.Connection, chunk_id: int, embedding: list[float]
) -> None:
    """Serialize ``embedding`` as float32 and store it as the vector for ``chunk_id``."""
    # Function-scope import: sqlite_vec is only needed when this is called.
    import sqlite_vec  # type: ignore[import-untyped]

    blob = sqlite_vec.serialize_float32(embedding)
    upsert_chunk_vector_blob(conn, chunk_id, blob)
578
+
579
+
580
def upsert_chunk_vector_blob(conn: sqlite3.Connection, chunk_id: int, blob: bytes) -> None:
    """Write a pre-serialized float32 embedding blob for a chunk (cache-reuse path).

    Delegates to the batch writer with a single-item list.
    """
    single_item = [(chunk_id, blob)]
    upsert_chunk_vector_blobs(conn, single_item)
583
+
584
+
585
def upsert_chunk_vector_blobs(
    conn: sqlite3.Connection, items: Sequence[tuple[int, bytes]]
) -> None:
    """Batch-write pre-serialized embedding blobs (one executemany per statement).

    Any existing vector for each chunk id is deleted first, then the new blob
    is inserted. Does nothing when ``items`` is empty.
    """
    if not items:
        return
    delete_args = []
    insert_args = []
    for cid, blob in items:
        key = int(cid)
        delete_args.append((key,))
        insert_args.append((key, blob))
    conn.executemany("DELETE FROM vec_chunks WHERE chunk_id = ?", delete_args)
    conn.executemany(
        "INSERT INTO vec_chunks (chunk_id, embedding) VALUES (?, ?)", insert_args
    )
598
+
599
+
600
def cached_embeddings(
    conn: sqlite3.Connection, *, model: str, shas: Iterable[str]
) -> dict[str, bytes]:
    """Return {content_sha: serialized embedding blob} already cached for this model.

    Duplicate shas are collapsed; shas with no cache entry are absent from the
    result.
    """
    wanted = list(dict.fromkeys(shas))
    found: dict[str, bytes] = {}
    if not wanted:
        return found
    # Chunk the IN list to stay well under SQLite's variable limit on huge repos.
    batch_size = 500
    offset = 0
    while offset < len(wanted):
        batch = wanted[offset : offset + batch_size]
        marks = ",".join("?" * len(batch))
        cursor = conn.execute(
            f"SELECT content_sha, embedding FROM vec_cache "
            f"WHERE model = ? AND content_sha IN ({marks})",
            (model, *batch),
        )
        for sha, blob in cursor.fetchall():
            found[sha] = blob
        offset += batch_size
    return found
620
+
621
+
622
def store_cached_embeddings(
    conn: sqlite3.Connection, *, model: str, items: Sequence[tuple[str, bytes]]
) -> None:
    """Insert (content_sha, blob) pairs into the content-addressed embedding cache.

    An existing entry for the same (model, content_sha) is replaced. Does
    nothing when ``items`` is empty.
    """
    if not items:
        return
    records = []
    for content_sha, blob in items:
        records.append((model, content_sha, blob))
    conn.executemany(
        "INSERT OR REPLACE INTO vec_cache (model, content_sha, embedding) VALUES (?, ?, ?)",
        records,
    )
632
+
633
+
634
def clear_vectors(conn: sqlite3.Connection) -> None:
    """Delete every row from ``vec_chunks``."""
    statement = "DELETE FROM vec_chunks"
    conn.execute(statement)
636
+
637
+
638
def count_vectors(conn: sqlite3.Connection) -> int:
    """Return the number of rows in ``vec_chunks``."""
    (total,) = conn.execute("SELECT COUNT(*) FROM vec_chunks").fetchone()
    return int(total)
640
+
641
+
642
def embedded_chunk_ids(conn: sqlite3.Connection) -> set[int]:
    """Return chunk IDs that already have a vector embedding.

    Returns an empty set when the query raises ``sqlite3.OperationalError``.
    """
    try:
        cursor = conn.execute("SELECT chunk_id FROM vec_chunks")
        return {int(chunk_id) for (chunk_id,) in cursor.fetchall()}
    except sqlite3.OperationalError:
        # vec tables not created yet (embeddings never enabled)
        return set()
649
+
650
+
651
def prune_orphan_vectors(conn: sqlite3.Connection) -> int:
    """Delete vec_chunks entries whose chunk no longer exists. Returns count deleted.

    Returns 0 when any of the queries raises ``sqlite3.OperationalError``.
    """
    try:
        live_ids = set()
        for row in conn.execute("SELECT id FROM chunks").fetchall():
            live_ids.add(row[0])

        stale = []
        for row in conn.execute("SELECT chunk_id FROM vec_chunks").fetchall():
            if row[0] in live_ids:
                continue
            stale.append((row[0],))

        if stale:
            conn.executemany("DELETE FROM vec_chunks WHERE chunk_id = ?", stale)
        return len(stale)
    except sqlite3.OperationalError:
        # vec tables not created yet (embeddings never enabled)
        return 0
665
+
666
+
667
def path_mtimes(conn: sqlite3.Connection) -> dict[str, int]:
    """Map every indexed file's repo-relative path to its stored mtime_ns."""
    mtimes: dict[str, int] = {}
    for row in conn.execute("SELECT path, mtime_ns FROM files").fetchall():
        mtimes[row["path"]] = int(row["mtime_ns"])
    return mtimes
673
+
674
+
675
def fingerprints(conn: sqlite3.Connection) -> dict[str, tuple[int, int, str]]:
    """Map every indexed path to its (mtime_ns, size_bytes, sha256) for incremental update."""
    cursor = conn.execute("SELECT path, mtime_ns, size_bytes, sha256 FROM files")
    by_path: dict[str, tuple[int, int, str]] = {}
    for row in cursor.fetchall():
        mtime_ns = int(row["mtime_ns"])
        size_bytes = int(row["size_bytes"])
        by_path[row["path"]] = (mtime_ns, size_bytes, row["sha256"])
    return by_path
683
+
684
+
685
def vector_search(
    conn: sqlite3.Connection, query_embedding: list[float], *, limit: int
) -> list[sqlite3.Row]:
    """KNN over vec_chunks; joins back to chunks/files for a uniform result row.

    ``query_embedding`` is serialized as float32 and ``limit`` is bound as the
    ``k`` of the query. Rows are ordered by ascending distance.
    """
    # Function-scope import: sqlite_vec is only needed when this is called.
    import sqlite_vec  # type: ignore[import-untyped]

    query = (
        "SELECT v.chunk_id AS chunk_id, v.distance AS distance, f.path AS path, "
        " c.line_start AS line_start, c.line_end AS line_end, "
        " c.content AS content, c.token_est AS token_est "
        "FROM vec_chunks v "
        "JOIN chunks c ON c.id = v.chunk_id "
        "JOIN files f ON f.id = c.file_id "
        "WHERE v.embedding MATCH ? AND k = ? "
        "ORDER BY v.distance"
    )
    needle = sqlite_vec.serialize_float32(query_embedding)
    cursor = conn.execute(query, (needle, int(limit)))
    return cursor.fetchall()