codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,423 @@
1
+ """Drive a build: discovery -> parse (parallel) -> write -> prune deleted -> meta."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import os
7
+ import subprocess
8
+ import sys
9
+ from concurrent.futures import ProcessPoolExecutor
10
+ from dataclasses import dataclass
11
+ from datetime import datetime, timezone
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ from ..config import Config
16
+ from ..discovery.walker import walk
17
+ from ..embeddings.backend import resolve_backend
18
+ from ..graph.builder import build_graph
19
+ from ..parsers.base import ParseResult
20
+ from ..parsers.line_chunker import chunk_text
21
+ from ..parsers.treesitter import UnsupportedLanguage, parse_file
22
+ from ..storage import repo
23
+ from ..storage.db import Database
24
+ from .doc_chunks import extract_doc_chunks
25
+
26
# Repos with fewer files than this are parsed sequentially: starting a process pool
# costs more than it saves on tiny repos.
_MIN_PARALLEL_FILES = 30

# Per-worker build config. _pool_init assigns it once when a worker process starts,
# which avoids serializing the Config again for every parse task.
_PARSE_CONFIG: Optional[Config] = None
31
+
32
+
33
@dataclass
class BuildStats:
    """Counters describing one build or update run; every counter starts at zero."""

    indexed: int = 0  # files parsed and written to the database
    deleted: int = 0  # files pruned because they are no longer on disk
    total_bytes: int = 0  # summed size_bytes of the indexed files
    chunks: int = 0  # parser chunks plus doc chunks written
    skipped: int = 0  # files left untouched by an incremental update
    symbols: int = 0  # symbols written
    edges: int = 0  # edge rows written (resolved or not)
    edges_resolved: int = 0  # "resolved" figure reported by build_graph
    vectors: int = 0  # value returned by the embedding step
    parse_failed: int = 0  # files whose tree-sitter parse raised
    treesitter_zero_symbols: int = 0  # tree-sitter parses that yielded no symbols
46
+
47
+
48
@dataclass
class _ParseOutcome:
    """A ParseResult plus flags recording how the parse attempt went."""

    result: ParseResult
    parse_failed: bool = False  # tree-sitter was attempted but raised
    zero_symbols: bool = False  # tree-sitter succeeded yet produced no symbols
53
+
54
+
55
@dataclass
class _ParseResult:
    """Everything the parse phase hands to the write phase for a single file."""

    sha256: str  # hex digest of the file, or "" when hashing raised OSError
    outcome: _ParseOutcome
    doc_chunks: list  # chunks returned by extract_doc_chunks
60
+
61
+
62
def _add_stats(target: BuildStats, delta: BuildStats) -> None:
    """Add every counter of *delta* onto *target* in place."""
    counters = (
        "indexed",
        "deleted",
        "total_bytes",
        "chunks",
        "skipped",
        "symbols",
        "edges",
        "edges_resolved",
        "vectors",
        "parse_failed",
        "treesitter_zero_symbols",
    )
    for counter in counters:
        setattr(target, counter, getattr(target, counter) + getattr(delta, counter))
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Parse phase — CPU-bound, can run in parallel
78
+ # ---------------------------------------------------------------------------
79
+
80
def _pool_init(config: Config) -> None:
    """Worker-process initialiser: remember *config* in the module-level slot.

    _parse_one reads the stored value, so it has to be assigned before any task runs.
    """
    global _PARSE_CONFIG
    _PARSE_CONFIG = config
84
+
85
+
86
def _parse_one(cand) -> _ParseResult:
    """Parse a single file inside a pool worker.

    Kept at module top level so ProcessPoolExecutor can pickle a reference to it.
    The config comes from the module global that _pool_init assigns in each worker.

    Raises:
        RuntimeError: if no config has been installed in this process. This is an
            explicit raise rather than an ``assert`` so the guard survives ``python -O``.
    """
    config = _PARSE_CONFIG
    if config is None:
        raise RuntimeError("_pool_init must set _PARSE_CONFIG before any worker parses")
    # Same hash -> read -> parse -> doc-chunk steps as the sequential path; share them
    # so the two code paths cannot drift apart.
    return _parse_one_inline(cand, config)
98
+
99
+
100
def _parse_one_inline(
    cand, config: Config, *, sha256: Optional[str] = None
) -> _ParseResult:
    """Hash, read and parse one candidate in the current process.

    Used when no pool is available or the repo is too small to justify one. A caller
    that already hashed the file can pass *sha256* to skip re-hashing; when hashing
    raises OSError the digest is recorded as "".
    """
    digest = sha256
    if digest is None:
        try:
            digest = _sha256_file(cand.path)
        except OSError:
            digest = ""
    source = _read_text(cand.path)
    return _ParseResult(
        sha256=digest,
        outcome=_parse(cand.lang, cand.parser, source, config),
        doc_chunks=extract_doc_chunks(source, cand.rel_path, cand.lang),
    )
113
+
114
+
115
def _parse_all(candidates: list, config: Config) -> list[_ParseResult]:
    """Parse all candidates, using a process pool for large repos.

    Results come back in the same order as *candidates*. Repos below
    _MIN_PARALLEL_FILES are parsed sequentially. If the pool cannot be used, or any
    worker raises, a message is printed to stderr and every file is parsed
    sequentially instead.
    """
    if len(candidates) < _MIN_PARALLEL_FILES:
        return [_parse_one_inline(c, config) for c in candidates]
    workers = min(len(candidates), os.cpu_count() or 1)
    if sys.platform == "win32":
        # ProcessPoolExecutor raises ValueError for max_workers > 61 on Windows, which
        # would silently push many-core machines onto the sequential fallback below.
        workers = min(workers, 61)
    # The default chunksize of 1 pays one IPC round-trip per file; batch the work while
    # still leaving several batches per worker for load balancing.
    chunksize = max(1, len(candidates) // (workers * 4))
    try:
        with ProcessPoolExecutor(
            max_workers=workers,
            initializer=_pool_init,
            initargs=(config,),
        ) as pool:
            return list(pool.map(_parse_one, candidates, chunksize=chunksize))
    except Exception as exc:
        print(
            f"[codebase-index] parallel parse unavailable ({type(exc).__name__}: {exc}); "
            f"falling back to sequential parsing for {len(candidates)} files.",
            file=sys.stderr,
        )
        return [_parse_one_inline(c, config) for c in candidates]
134
+
135
+
136
+ # ---------------------------------------------------------------------------
137
+ # Write phase — DB writes, must be serial
138
+ # ---------------------------------------------------------------------------
139
+
140
def _write_candidate(conn, cand, pr: _ParseResult, now: str) -> BuildStats:
    """Persist one already-parsed candidate and return the counters for that file.

    Writes the file row first, then its symbols, chunks, optional doc chunks and
    edges, in that order.
    """
    file_id = repo.upsert_file(
        conn,
        path=cand.rel_path,
        lang=cand.lang,
        size_bytes=cand.size_bytes,
        sha256=pr.sha256,
        mtime_ns=cand.path.stat().st_mtime_ns,
        git_status=None,
        parser=cand.parser,
        indexed_at=now,
        is_generated=cand.is_generated,
    )
    outcome = pr.outcome
    parsed = outcome.result

    symbol_ids = repo.replace_symbols(conn, file_id, parsed.symbols)
    repo.replace_chunks(conn, file_id, parsed.chunks, symbol_ids=symbol_ids)

    doc_chunk_count = 0
    if pr.doc_chunks:
        repo.append_chunks(conn, file_id, pr.doc_chunks)
        doc_chunk_count = len(pr.doc_chunks)

    edge_rows = _resolve_edges(parsed, symbol_ids, file_id)
    repo.replace_edges(conn, file_id, edge_rows)

    return BuildStats(
        indexed=1,
        total_bytes=cand.size_bytes,
        chunks=doc_chunk_count + len(parsed.chunks),
        symbols=len(parsed.symbols),
        edges=len(edge_rows),
        parse_failed=int(outcome.parse_failed),
        treesitter_zero_symbols=int(outcome.zero_symbols),
    )
170
+
171
+
172
def build_index(config: Config, db: Database, root: Optional[Path] = None) -> BuildStats:
    """Run a full build and return its counters.

    Walks *root* (defaulting to ``config.root``), parses every candidate, writes the
    results, prunes indexed paths that were not seen, records build metadata, rebuilds
    the graph, optionally embeds chunks, and commits.
    """
    root = Path(root or config.root).resolve()
    conn = db.conn
    now = _utc_now_iso()

    candidates = list(walk(root, config))
    parsed = _parse_all(candidates, config)

    totals = BuildStats()
    present: set[str] = set()
    for cand, pr in zip(candidates, parsed):
        written = _write_candidate(conn, cand, pr, now)
        _add_stats(totals, written)
        present.add(cand.rel_path)

    # Anything still in the index that the walk did not yield is gone from disk.
    stale = repo.all_paths(conn) - present
    totals.deleted = repo.delete_files(conn, stale)

    repo.set_meta(conn, "built_at", now)
    repo.set_meta(conn, "config_hash", config.config_hash())
    head = _git_head(root)
    if head:
        repo.set_meta(conn, "head_commit", head)

    totals.edges_resolved = build_graph(conn)["resolved"]

    if config.embeddings.enabled:
        totals.vectors = _embed_chunks(config, db, conn)

    conn.commit()
    return totals
200
+
201
+
202
def _embed_chunks(cfg, db, conn) -> int:
    """Embed only new/changed chunks (incremental).

    Returns the number of chunk rows whose content was not already in the embedding
    cache for this model (0 when the backend is disabled or nothing needs a vector).

    Fully gated: with embeddings disabled this is never called, so no optional
    dependency is imported and vec_chunks is never created.
    """
    backend = resolve_backend(cfg, warn=lambda m: print(m))
    if not getattr(backend, "enabled", False):
        return 0
    import sqlite_vec  # type: ignore[import-untyped]

    db.enable_vectors()
    repo.ensure_vec_tables(conn, dim=backend.dim)
    repo.prune_orphan_vectors(conn)
    existing = repo.embedded_chunk_ids(conn)
    rows = [r for r in repo.chunks_for_embedding(conn) if int(r["id"]) not in existing]
    if not rows:
        return 0

    # Content-addressed reuse: chunk ids churn on every full rebuild (replace_chunks),
    # so a chunk-id keyed skip alone re-embeds the whole repo each time. Hash the content
    # and only call the (potentially slow / paid) backend for text never embedded under
    # this model; everything else is copied straight from the cache.
    shas = [hashlib.sha256(r["content"].encode("utf-8")).hexdigest() for r in rows]
    cached = repo.cached_embeddings(conn, model=backend.name, shas=shas)

    # Collapse uncached rows by content hash so identical text (licence headers,
    # boilerplate, ...) is sent to the backend once instead of once per chunk.
    pending: dict[str, str] = {}
    uncached_rows = 0
    for row, sha in zip(rows, shas):
        if sha in cached:
            continue
        uncached_rows += 1
        pending.setdefault(sha, row["content"])

    fresh: dict[str, bytes] = {}
    if pending:
        vectors = backend.embed(list(pending.values()))
        # dicts preserve insertion order, so the keys line up with the texts sent above.
        for sha, vec in zip(pending, vectors):
            fresh[sha] = sqlite_vec.serialize_float32(vec)
        repo.store_cached_embeddings(conn, model=backend.name, items=list(fresh.items()))

    repo.upsert_chunk_vector_blobs(
        conn,
        [(int(row["id"]), cached.get(sha) or fresh[sha]) for row, sha in zip(rows, shas)],
    )

    built_at = datetime.now(timezone.utc).isoformat()
    repo.set_vec_meta(conn, model=backend.name, dim=backend.dim, built_at=built_at)
    return uncached_rows
244
+
245
+
246
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 64 KiB blocks."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while block := stream.read(65536):
            digest.update(block)
    return digest.hexdigest()
252
+
253
+
254
def _read_text(path: Path) -> str:
    """Return the file's text decoded as UTF-8, dropping undecodable bytes.

    An unreadable file (OSError) yields the empty string instead of raising.
    """
    try:
        with path.open(encoding="utf-8", errors="ignore") as stream:
            return stream.read()
    except OSError:
        return ""
259
+
260
+
261
def _parse(lang: Optional[str], parser: str, text: str, config: Config) -> _ParseOutcome:
    """Turn file text into a _ParseOutcome, recording parse failures instead of hiding them.

    `classify` owns routing (Guardrail 1): tree-sitter is attempted only when a language
    is set and the parser label is `treesitter`. Every other file, and every file whose
    tree-sitter parse raises, gets plain line chunks with no symbols or edges (Tier C).
    """
    tree_sitter_failed = False
    if lang and parser == "treesitter":
        try:
            parsed = parse_file(lang, text)
            return _ParseOutcome(result=parsed, zero_symbols=not parsed.symbols)
        except UnsupportedLanguage:
            # A tree-sitter language without an extraction path breaches Guardrail 1;
            # flag it rather than letting the file pass as parsed.
            tree_sitter_failed = True
        except Exception:
            # Guardrail 2: any other parse error is recorded too, so a broken file
            # never looks the same as a clean parse once it drops to line chunks.
            tree_sitter_failed = True

    chunking = config.chunk
    line_chunks = chunk_text(
        text,
        window_lines=chunking.window_lines,
        overlap_lines=chunking.overlap_lines,
    )
    fallback = ParseResult(chunks=line_chunks, symbols=[], edges=[])
    return _ParseOutcome(result=fallback, parse_failed=tree_sitter_failed)
286
+
287
+
288
def _resolve_edges(
    parse_result: ParseResult, symbol_ids: list[int], file_id: int
) -> list[dict]:
    """Convert parsed edges into edge-row dicts, resolving callees within the same file.

    An edge's source is the symbol at ``src_symbol_index``, or the file itself when
    that index is None. The destination resolves only when ``callee_name`` matches a
    symbol name from this parse result; if several symbols share a name, the last wins.
    """
    ids_by_name: dict = {}
    for position, symbol in enumerate(parse_result.symbols):
        ids_by_name[symbol.name] = symbol_ids[position]

    def to_row(edge) -> dict:
        from_symbol = edge.src_symbol_index is not None
        target_id = ids_by_name.get(edge.callee_name)
        found = target_id is not None
        return {
            "edge_type": edge.edge_type,
            "src_kind": "symbol" if from_symbol else "file",
            "src_id": symbol_ids[edge.src_symbol_index] if from_symbol else file_id,
            "dst_kind": "symbol" if found else None,
            "dst_id": target_id,
            "dst_name": edge.callee_name,
            "line": edge.line,
            "resolved": 1 if found else 0,
        }

    return [to_row(edge) for edge in parse_result.edges]
317
+
318
+
319
def _utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    moment = datetime.now(timezone.utc)
    return f"{moment:%Y-%m-%dT%H:%M:%SZ}"
321
+
322
+
323
def _git_head(root: Path) -> Optional[str]:
    """Return the HEAD commit hash of the repo at *root*, or None if git cannot say.

    None covers a missing git executable, the 5-second timeout, and any non-zero exit
    status from ``git rev-parse HEAD``.
    """
    command = ["git", "-C", str(root), "rev-parse", "HEAD"]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if proc.returncode != 0:
        return None
    return proc.stdout.strip()
335
+
336
+
337
def update_index(
    config: Config,
    db: Database,
    root: Optional[Path] = None,
    *,
    since: Optional[str] = None,
    all_files: bool = False,
) -> BuildStats:
    """Incrementally refresh the index and return the counters for this run.

    A file is re-parsed only when it survives three filters: it is inside the optional
    git scope (*since*), its (mtime, size) fingerprint differs from the stored one (or
    *all_files* is set), and its SHA-256 differs from the stored digest. Paths that
    vanished are pruned; the graph and embeddings are rebuilt only if something was
    indexed or deleted.
    """
    root = Path(root or config.root).resolve()
    conn = db.conn
    now = _utc_now_iso()
    stats = BuildStats()

    known = repo.fingerprints(conn)
    if since:
        scope = _git_changed_since(root, since)
    else:
        scope = None

    seen: set[str] = set()
    for cand in walk(root, config):
        seen.add(cand.rel_path)

        # Filter 1: outside the requested git scope.
        if scope is not None and cand.rel_path not in scope:
            stats.skipped += 1
            continue

        # Filter 2: cheap fingerprint match (mtime + size), unless forced.
        st = cand.path.stat()
        prior = known.get(cand.rel_path)
        unchanged_fingerprint = (
            prior is not None
            and prior[0] == st.st_mtime_ns
            and prior[1] == cand.size_bytes
        )
        if not all_files and unchanged_fingerprint:
            stats.skipped += 1
            continue

        # Filter 3: same content hash — refresh the stored fingerprint only.
        sha = _sha256_file(cand.path)
        if prior is not None and prior[2] == sha:
            conn.execute(
                "UPDATE files SET mtime_ns = ?, size_bytes = ?, indexed_at = ? WHERE path = ?",
                (st.st_mtime_ns, cand.size_bytes, now, cand.rel_path),
            )
            stats.skipped += 1
            continue

        parsed = _parse_one_inline(cand, config, sha256=sha)
        _add_stats(stats, _write_candidate(conn, cand, parsed, now))

    if scope is not None:
        # Scoped run: only prune scoped paths that were indexed and are now missing.
        gone = {p for p in scope if p in known and p not in seen}
    else:
        gone = repo.all_paths(conn) - seen
    repo.delete_files(conn, gone)
    stats.deleted = len(gone)

    if stats.indexed or stats.deleted:
        stats.edges_resolved = build_graph(conn)["resolved"]
        if config.embeddings.enabled:
            stats.vectors = _embed_chunks(config, db, conn)

    # Keep the original build time if there is one; only stamp it on first use.
    repo.set_meta(conn, "built_at", repo.get_meta(conn, "built_at") or now)
    repo.set_meta(conn, "updated_at", now)
    repo.set_meta(conn, "config_hash", config.config_hash())
    head = _git_head(root)
    if head:
        repo.set_meta(conn, "head_commit", head)
    conn.commit()
    return stats
404
+
405
+
406
def _git_changed_since(root: Path, ref: str) -> set[str]:
    """Return paths, relative to *root*, that differ from *ref* or are untracked.

    Combines ``git diff --name-only <ref>`` with untracked, non-ignored files from
    ``git ls-files --others``. A command that exits non-zero contributes nothing; a
    missing git executable or a timeout yields an empty set.
    """
    git = ["git", "-C", str(root)]
    # Decode explicitly as UTF-8: with -z git emits raw path bytes, and a strict
    # locale decode could raise on them.
    run_opts = {
        "capture_output": True,
        "encoding": "utf-8",
        "errors": "replace",
        "timeout": 15,
        "check": False,
    }
    changed: set[str] = set()
    try:
        # --relative: report paths relative to root (and only those under it) even when
        # root is a subdirectory of the repository; without it diff prints paths
        # relative to the repository top level, which never match the walker's paths.
        # -z: NUL-terminated, unquoted names, so unusual characters survive intact.
        diff = subprocess.run(
            [*git, "diff", "--name-only", "--relative", "-z", ref],
            **run_opts,
        )
        if diff.returncode == 0:
            changed.update(name for name in diff.stdout.split("\0") if name)
        untracked = subprocess.run(
            [*git, "ls-files", "--others", "--exclude-standard", "-z"],
            **run_opts,
        )
        if untracked.returncode == 0:
            changed.update(name for name in untracked.stdout.split("\0") if name)
    except (OSError, subprocess.SubprocessError):
        return set()
    return changed
@@ -0,0 +1,2 @@
1
+ """MCP adapter for codebase-index."""
2
+