code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,624 @@
1
+ """Reusable indexing pipeline — shared by the CLI (`cce index`) and MCP (`reindex`).
2
+
3
+ This module owns the full index-a-project flow so the CLI and MCP server don't
4
+ duplicate logic and can't drift. Callers pass a structured `IndexResult` back so
5
+ they can format their own output (click.echo, MCP text response, logs).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import hashlib
11
+ import logging
12
+ import time
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Iterable
16
+
17
+ import subprocess
18
+
19
+ from context_engine.indexer.chunker import Chunker
20
+ from context_engine.indexer.embedder import Embedder
21
+ from context_engine.indexer.embedding_cache import EmbeddingCache
22
+ from context_engine.indexer.git_indexer import index_commits
23
+ from context_engine.indexer.manifest import Manifest
24
+ from context_engine.models import ChunkType, GraphNode, GraphEdge, NodeType, EdgeType
25
+ from context_engine.storage.local_backend import LocalBackend
26
+
27
+
28
# Map a chunk's semantic type to the node type used for its graph node.
# Without this table every non-function chunk used to land as NodeType.CLASS,
# which polluted the graph (e.g. markdown / yaml / json / module-level fallback
# chunks all looked like classes and degraded related_context expansion).
# COMMENT chunks are deliberately folded into DOC nodes. Chunk types missing
# from this table fall back to NodeType.MODULE at the lookup site.
_CHUNK_TO_NODE_TYPE = {
    ChunkType.FUNCTION: NodeType.FUNCTION,
    ChunkType.CLASS: NodeType.CLASS,
    ChunkType.MODULE: NodeType.MODULE,
    ChunkType.DOC: NodeType.DOC,
    ChunkType.COMMENT: NodeType.DOC,
    ChunkType.COMMIT: NodeType.COMMIT,
    ChunkType.SESSION: NodeType.SESSION,
    ChunkType.DECISION: NodeType.DECISION,
}
42
+
43
# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)
44
+
45
+
46
class PathOutsideProjectError(ValueError):
    """Signal that a requested target path resolves outside the project root.

    Subclasses ValueError, so callers that already catch ValueError keep working.
    """
48
+
49
+
50
def _resolve_within(project_dir: Path, target: str | Path) -> Path:
    """Return the resolved form of `target`, guaranteed to lie under `project_dir`.

    A relative `target` is anchored at `project_dir`; an absolute one is taken
    as given. Both sides are passed through `Path.resolve()` before the
    containment check, so `..` segments and symlinks cannot be used to step
    outside the project (e.g. `target_path="../../etc/passwd"`).

    Call this before reading or walking any caller-supplied path.

    Raises:
        PathOutsideProjectError: if the resolved target is not inside the
            resolved project directory.
    """
    candidate = Path(target)
    anchored = candidate if candidate.is_absolute() else project_dir / candidate
    root = project_dir.resolve()
    final = anchored.resolve()
    try:
        # relative_to() raises ValueError when `final` is not below `root`;
        # the relative path itself is not needed, only the check.
        final.relative_to(root)
    except ValueError as exc:
        raise PathOutsideProjectError(
            f"target path escapes project directory: {target}"
        ) from exc
    return final
69
+
70
+
71
# Registry of one asyncio.Lock per storage location. Indexing runs take the
# lock for their storage directory so a watcher-triggered re-index can't race
# a manual `cce index` or MCP `reindex` tool call on the same LanceDB table.
_PIPELINE_LOCKS: dict[str, asyncio.Lock] = {}


def _pipeline_lock(storage_key: str) -> asyncio.Lock:
    """Return the lock registered for `storage_key`, creating it on first use.

    Repeated calls with the same key return the same lock object.
    """
    try:
        return _PIPELINE_LOCKS[storage_key]
    except KeyError:
        created = _PIPELINE_LOCKS[storage_key] = asyncio.Lock()
        return created
82
+
83
# File suffixes the indexer never reads: binary / non-text formats (images,
# compiled artefacts, archives, ...) plus a couple of text formats that are
# too large or noisy to be useful as context. Entries are lowercase and
# include the leading dot, matching `Path.suffix`.
_SKIP_EXTENSIONS = set(
    # Images
    (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".tiff", ".svg")
    # Compiled / bytecode
    + (".pyc", ".pyo", ".class", ".o", ".so", ".dylib", ".dll", ".exe", ".wasm")
    # Archives
    + (".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".jar", ".war")
    # Data / binary
    + (".db", ".sqlite", ".sqlite3", ".bin", ".dat", ".pkl", ".pickle")
    + (".parquet", ".arrow", ".lance")
    # Media
    + (".mp3", ".mp4", ".wav", ".avi", ".mov", ".flv", ".ogg", ".webm")
    # Fonts
    + (".ttf", ".otf", ".woff", ".woff2", ".eot")
    # Documents (non-text)
    + (".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx")
    # Package locks (huge, not useful for context)
    + (".lock",)
    # Source maps
    + (".map",)
)
105
+
106
# Known extension → language mapping for tree-sitter and chunk metadata.
# Files with unlisted extensions are still indexed as "plaintext".
#
# Keys are matched against `Path.suffix` exactly (case-sensitive), which is why
# both ".r" and ".R" are listed. Common aliases of an already-supported
# language (ES-module / CommonJS JavaScript, TypeScript module variants,
# alternative C++ suffixes, long-form markdown, Python stubs) map to the same
# language name as their primary extension so they are not chunked as
# plaintext.
_LANGUAGE_MAP = {
    ".py": "python",
    ".pyi": "python",
    ".js": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".ts": "typescript",
    ".mts": "typescript",
    ".cts": "typescript",
    ".jsx": "javascript",
    ".tsx": "tsx",
    ".md": "markdown",
    ".markdown": "markdown",
    ".php": "php",
    ".html": "html",
    ".htm": "html",
    ".css": "css",
    ".scss": "css",
    ".less": "css",
    ".json": "json",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".toml": "toml",
    ".sh": "bash",
    ".bash": "bash",
    ".zsh": "bash",
    ".rb": "ruby",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".c": "c",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".h": "c",
    ".hpp": "cpp",
    ".hh": "cpp",
    ".hxx": "cpp",
    ".swift": "swift",
    ".kt": "kotlin",
    ".kts": "kotlin",
    ".sql": "sql",
    ".graphql": "graphql",
    ".gql": "graphql",
    ".proto": "protobuf",
    ".xml": "xml",
    ".r": "r",
    ".R": "r",
    ".lua": "lua",
    ".ex": "elixir",
    ".exs": "elixir",
    ".erl": "erlang",
    ".hs": "haskell",
    ".scala": "scala",
    ".clj": "clojure",
    ".dart": "dart",
    ".vue": "vue",
    ".svelte": "svelte",
    ".pl": "perl",
    ".pm": "perl",
    ".cs": "csharp",
    ".fs": "fsharp",
    ".zig": "zig",
    ".nim": "nim",
    ".v": "vlang",
    ".tf": "terraform",
    ".hcl": "hcl",
    ".dockerfile": "dockerfile",
}
167
+
168
+
169
@dataclass
class IndexResult:
    """Structured outcome of one indexing run.

    The pipeline fills this in and hands it back so each caller can render
    its own output. All paths are project-relative strings.
    """

    # Files that were chunked and queued for (re-)ingestion in this run.
    indexed_files: list[str] = field(default_factory=list)
    # Files that could not be read as text (binary, oversized, unreadable).
    skipped_files: list[str] = field(default_factory=list)
    # Files recorded in the manifest that are no longer found, and were pruned.
    deleted_files: list[str] = field(default_factory=list)
    # Number of chunks written to the backend; stays 0 if nothing was ingested.
    total_chunks: int = 0
    # Human-readable messages for failures that did not raise.
    errors: list[str] = field(default_factory=list)
    # Embedding-cache hit/miss counters from the most recent embedder run,
    # exposed so callers can report how much work the cache saved.
    cache_hits: int = 0
    cache_misses: int = 0
180
+
181
+
182
def _iter_project_files(
    root: Path,
    ignore_set: set[str],
    skip_extensions: set[str],
    *,
    redact_secrets: bool = True,
    cceignore_patterns: list[str] | None = None,
) -> Iterable[Path]:
    """Yield files under `root` respecting ignore list, skipping symlinks.

    Directories are walked depth-first with entries visited in sorted order.
    An entry is dropped when:

    * its bare name is in `ignore_set` (applies to files and directories);
    * it is a symlink. Symlinks are skipped outright to avoid loops; callers
      who need symlink following can resolve them before calling the pipeline;
    * it matches one of `cceignore_patterns` (typically loaded from
      `.cceignore`). These gitignore-style globs are evaluated against the
      path relative to `root`, so a caller that passes a subdirectory as
      `root` gets patterns evaluated relative to that subdirectory;
    * it is a file whose suffix is in `skip_extensions`. The comparison is
      case-insensitive for lowercase entries, so `PHOTO.JPG` is skipped by a
      ".jpg" entry instead of being read and rejected later;
    * `redact_secrets` is True (default) and the filename matches a
      well-known credential pattern (.env*, *.pem, secrets.yml, etc.), so the
      file is never read or embedded. See `indexer/secrets.py` for the full
      pattern list.

    Directories that cannot be listed are silently skipped.
    """
    from context_engine.indexer.secrets import is_secret_file as _is_secret_file
    from context_engine.indexer.ignorefile import matches_any as _ignore_matches
    patterns = cceignore_patterns or []
    seen: set[Path] = set()

    def _rel(entry: Path) -> str:
        # Forward slashes regardless of platform so patterns behave the same
        # on Windows.
        try:
            return str(entry.relative_to(root)).replace("\\", "/")
        except ValueError:
            return entry.name

    def _has_skipped_suffix(entry: Path) -> bool:
        # Exact match first keeps caller-supplied mixed-case entries working;
        # the lowercase match catches IMAGE.PNG-style names.
        suffix = entry.suffix
        return suffix in skip_extensions or suffix.lower() in skip_extensions

    def walk(directory: Path) -> Iterable[Path]:
        try:
            entries = sorted(directory.iterdir())
        except OSError:  # includes PermissionError
            return
        for entry in entries:
            if entry.name in ignore_set:
                continue
            if entry.is_symlink():
                continue
            try:
                resolved = entry.resolve()
            except (OSError, RuntimeError):
                continue
            if resolved in seen:
                continue
            seen.add(resolved)
            is_dir = entry.is_dir()
            # Evaluate .cceignore against the path relative to `root`.
            # Done after symlink/seen checks so we don't pay the cost on
            # files we'd skip anyway.
            if patterns and _ignore_matches(_rel(entry), is_dir, patterns):
                continue
            if is_dir:
                yield from walk(entry)
            elif entry.is_file() and not _has_skipped_suffix(entry):
                if redact_secrets and _is_secret_file(entry):
                    log.info("indexer: skipping secret file %s", entry)
                    continue
                yield entry

    yield from walk(root)
246
+
247
+
248
# Upper bound on the size of a single file the indexer will read. Protects the
# indexer from OOM on accidentally-committed log dumps, generated fixtures,
# vendored bundles, etc. 2 MB easily covers normal source files (the original
# author notes the largest module in CPython's stdlib is ~250 KB) while ruling
# out the kind of file you'd never want in a semantic index anyway.
_MAX_FILE_BYTES = 2 * 1024 * 1024


def _safe_read(file_path: Path) -> str | None:
    """Return the file's contents decoded as strict UTF-8, or None if unusable.

    None is returned when the file is larger than `_MAX_FILE_BYTES`, is not
    valid UTF-8 (which is how binary files are detected), or cannot be
    stat'ed / opened / read.
    """
    try:
        too_big = file_path.stat().st_size > _MAX_FILE_BYTES
        if too_big:
            return None
        with file_path.open("r", encoding="utf-8", errors="strict") as handle:
            return handle.read()
    except (OSError, UnicodeDecodeError):
        return None
264
+
265
+
266
async def run_indexing(
    config,
    project_dir: str | Path,
    *,
    full: bool = False,
    target_path: str | None = None,
    log_fn=None,
    progress_fn=None,
    embed_progress_fn=None,
    phase_fn=None,
) -> IndexResult:
    """Run the indexing pipeline and return a structured `IndexResult`.

    The index for a project lives in `<config.storage_path>/<project dir name>`,
    which is created if missing. Runs against the same storage directory are
    serialised through a per-directory lock.

    Args:
        config: settings object; this function reads `storage_path` and hands
            the object on to the locked implementation.
        project_dir: root of the project to index.
        full: when True, ignore the manifest and re-index everything visible.
        target_path: optional; restricts indexing to a single file or subtree.
        log_fn: optional `log_fn(msg)` for verbose progress output.
        progress_fn: optional `progress_fn(current, total)`, called after each
            batch with file counts.
        embed_progress_fn: optional `embed_progress_fn(current, total)`,
            called as embedding proceeds with chunk counts (only for cache
            misses; cache hits return instantly).
        phase_fn: optional `phase_fn(msg)`, called between major phases —
            "Embedding 32k chunks…", "Writing to index…" — so non-verbose
            callers can announce *what* is starting. It complements
            `embed_progress_fn`: phase_fn is per-phase, embed_progress_fn is
            per-batch, and both exist so large repos don't look hung.
    """
    root = Path(project_dir)
    # NOTE(review): `root.name` is "" for Path("."); callers presumably pass
    # an absolute project path — confirm.
    storage_base = Path(config.storage_path) / root.name
    storage_base.mkdir(parents=True, exist_ok=True)

    lock = _pipeline_lock(str(storage_base))
    async with lock:
        outcome = await _run_indexing_locked(
            config,
            root,
            storage_base,
            full=full,
            target_path=target_path,
            log_fn=log_fn,
            progress_fn=progress_fn,
            embed_progress_fn=embed_progress_fn,
            phase_fn=phase_fn,
        )
    return outcome
309
+
310
+
311
async def _run_indexing_locked(
    config,
    project_dir: Path,
    storage_base: Path,
    *,
    full: bool,
    target_path: str | None,
    log_fn,
    progress_fn=None,
    embed_progress_fn=None,
    phase_fn=None,
) -> IndexResult:
    """Index `project_dir` into the stores under `storage_base`.

    The caller is expected to hold the pipeline lock for `storage_base`.
    Parameters have the same meaning as on `run_indexing`.

    Phases, in order:

    1. Select files: the whole project, or just `target_path` (validated to
       lie inside the project). A single-file target goes through the same
       extension and secret-filename filters as a directory walk.
    2. Read files in batches, redact inline secrets, hash, and chunk every
       file whose hash differs from the manifest (or every file when `full`).
    3. On full project-wide runs inside a git checkout, index commit history.
    4. Embed the collected chunks. If this fails, nothing has been deleted or
       saved yet, so the previous index and on-disk manifest remain valid.
    5. Delete the old rows of re-indexed files, then ingest the new data. If
       either step fails, the affected manifest entries are invalidated so
       the next run re-indexes those files.
    6. On project-wide runs, prune rows and manifest entries for files that
       are in the manifest but were not seen in this run.
    7. Save the manifest.

    Failures in phases 2-6 are reported through `IndexResult.errors` rather
    than raised.

    Raises:
        PathOutsideProjectError: if `target_path` escapes `project_dir`.
    """
    backend = LocalBackend(base_path=str(storage_base))
    chunker = Chunker()
    manifest = Manifest(manifest_path=storage_base / "manifest.json")
    ignore_set = set(config.indexer_ignore)
    redact = getattr(config, "indexer_redact_secrets", True)
    # Git watermark as loaded from disk, so a failed write can roll it back.
    previous_git_sha = getattr(manifest, "last_git_sha", None)

    # Load .cceignore once per indexing run. Patterns are evaluated against
    # paths relative to the directory being walked; see indexer/ignorefile.py.
    from context_engine.indexer.ignorefile import load_ignore_patterns
    cceignore_patterns = load_ignore_patterns(project_dir)
    if cceignore_patterns and log_fn:
        log_fn(f" [.cceignore] {len(cceignore_patterns)} pattern(s) loaded")

    # Imported once here rather than once per file inside the batch loop.
    _is_secret_file = None
    _redact_content = None
    if redact:
        from context_engine.indexer.secrets import (
            is_secret_file as _is_secret_file,
            redact_secrets as _redact_content,
        )

    result = IndexResult()

    # Determine the set of files to scan.
    if target_path:
        target = _resolve_within(project_dir, target_path)
        if target.is_file():
            suffix = target.suffix
            if suffix in _SKIP_EXTENSIONS or suffix.lower() in _SKIP_EXTENSIONS:
                file_iter = []
            elif redact and _is_secret_file(target):
                # A directly targeted file must not bypass the filename-level
                # secret skip that the directory walk applies.
                log.info("indexer: skipping secret file %s", target)
                file_iter = []
            else:
                file_iter = [target]
        elif target.is_dir():
            file_iter = list(_iter_project_files(
                target, ignore_set, _SKIP_EXTENSIONS,
                redact_secrets=redact,
                cceignore_patterns=cceignore_patterns,
            ))
        else:
            result.errors.append(f"Target path not found: {target_path}")
            return result
    else:
        file_iter = list(_iter_project_files(
            project_dir, ignore_set, _SKIP_EXTENSIONS,
            redact_secrets=redact,
            cceignore_patterns=cceignore_patterns,
        ))

    project_root = project_dir.resolve()

    def _rel_path(fp: Path) -> str:
        """Project-relative path of `fp`, used as manifest key and store id."""
        try:
            return str(fp.relative_to(project_dir))
        except ValueError:
            # Targeted runs yield fully resolved paths; if `project_dir` was
            # given relative or via a symlink the direct comparison fails, so
            # compare resolved forms instead.
            return str(fp.resolve().relative_to(project_root))

    current_rel_paths: set[str] = set()
    all_chunks: list = []
    all_nodes: list[GraphNode] = []
    all_edges: list[GraphEdge] = []
    files_to_replace: list[str] = []

    # Read + chunk asynchronously — both are wrapped in asyncio.to_thread so
    # the I/O reads (kernel) and the chunker work (CPU-bound tree-sitter)
    # both overlap across files in a batch instead of executing serially.
    async def _read_file(fp: Path) -> tuple[Path, str | None]:
        return fp, await asyncio.to_thread(_safe_read, fp)

    async def _chunk_file(rel_path: str, content: str, language: str):
        """Run the chunker off the event loop and return its result, which the
        caller unpacks as (chunks, imported_modules). Exceptions propagate;
        the caller collects them via gather(return_exceptions=True)."""
        return await asyncio.to_thread(
            chunker.chunk_with_imports, content, rel_path, language
        )

    def _invalidate_replaced_files() -> None:
        """Persist a manifest that forces the next run to redo this run's files.

        Used when a store write failed after rows may already have been
        dropped. Entries are kept (so later pruning still sees them) but given
        an empty hash, which no SHA-256 hex digest equals.
        NOTE(review): assumes `Manifest.has_changed` compares against the
        stored hash — confirm against indexer/manifest.py.
        """
        try:
            for stale in files_to_replace:
                manifest.update(stale, "")
            manifest.last_git_sha = previous_git_sha
            manifest.save()
        except Exception as exc:
            log.warning("Could not invalidate manifest after failed write: %s", exc)

    # Process files in batches to pipeline I/O with chunking.
    _BATCH = 50
    for batch_start in range(0, len(file_iter), _BATCH):
        batch_paths = file_iter[batch_start:batch_start + _BATCH]

        # Async read all files in this batch concurrently
        read_results = await asyncio.gather(*(_read_file(fp) for fp in batch_paths))

        # First pass: hash + manifest check, decide which files actually need
        # re-chunking. This is cheap and synchronous; doing it upfront lets us
        # skip the chunker for unchanged files.
        # Items: (file_path, rel_path, content, content_hash, language)
        to_chunk: list[tuple[Path, str, str, str, str]] = []
        for file_path, content in read_results:
            rel_path = _rel_path(file_path)
            current_rel_paths.add(rel_path)

            if content is None:
                result.skipped_files.append(rel_path)
                if log_fn:
                    log_fn(f" [skip] {rel_path} (binary or unreadable)")
                continue

            # Content-level secret redaction. Filename-level skipping
            # already happened during file selection, so a file reaching
            # this point is "indexable" — but the file might still contain
            # inline credentials (an AWS key in a config comment, a JWT in
            # a fixture). Redact those before they reach the chunker,
            # embedder, or vector store.
            if redact:
                content, fired = _redact_content(content)
                if fired:
                    log.info(
                        "indexer: redacted %d secret(s) in %s (kinds: %s)",
                        len(fired), rel_path, ",".join(sorted(set(fired))),
                    )
                    if log_fn:
                        log_fn(f" [redact] {rel_path} ({len(fired)} secret(s))")

            # Hash the redacted text: that is what gets indexed.
            content_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
            if not full and not manifest.has_changed(rel_path, content_hash):
                if log_fn:
                    log_fn(f" [skip] {rel_path} (unchanged)")
                continue

            language = _LANGUAGE_MAP.get(file_path.suffix, "plaintext")
            to_chunk.append((file_path, rel_path, content, content_hash, language))

        # Chunk all changed files in this batch in parallel. tree-sitter is
        # a C extension that releases the GIL during parsing, so threads do
        # give real concurrency for chunking.
        if to_chunk:
            chunk_tasks = [
                _chunk_file(rel_path, content, language)
                for (_, rel_path, content, _, language) in to_chunk
            ]
            chunk_results = await asyncio.gather(*chunk_tasks, return_exceptions=True)

            for (file_path, rel_path, content, content_hash, language), chunk_outcome in zip(
                to_chunk, chunk_results
            ):
                if isinstance(chunk_outcome, Exception):
                    result.errors.append(f"Chunking failed for {rel_path}: {chunk_outcome}")
                    log.warning("Chunking failed for %s", rel_path, exc_info=chunk_outcome)
                    continue
                chunks, imported_modules = chunk_outcome

                # Defer the actual store delete to a single batched call below.
                files_to_replace.append(rel_path)

                file_node = GraphNode(
                    id=f"file_{rel_path}",
                    node_type=NodeType.FILE,
                    name=file_path.name,
                    file_path=rel_path,
                )
                all_nodes.append(file_node)

                for module in imported_modules:
                    all_edges.append(
                        GraphEdge(
                            source_id=file_node.id,
                            target_id=f"module_{module}",
                            edge_type=EdgeType.IMPORTS,
                        )
                    )

                for chunk in chunks:
                    node_type = _CHUNK_TO_NODE_TYPE.get(
                        chunk.chunk_type, NodeType.MODULE
                    )
                    # Heuristic display name: the text before the first "(",
                    # after the last ":" in that prefix; chunks without a
                    # "(" are named by their id.
                    node_name = (
                        chunk.content.split("(")[0].split(":")[-1].strip()
                        if "(" in chunk.content
                        else chunk.id
                    )
                    all_nodes.append(
                        GraphNode(
                            id=chunk.id,
                            node_type=node_type,
                            name=node_name,
                            file_path=rel_path,
                        )
                    )
                    all_edges.append(
                        GraphEdge(
                            source_id=file_node.id,
                            target_id=chunk.id,
                            edge_type=EdgeType.DEFINES,
                        )
                    )
                all_chunks.extend(chunks)
                # In-memory only; reaches disk via manifest.save() at the end.
                manifest.update(rel_path, content_hash)
                result.indexed_files.append(rel_path)

        if progress_fn:
            progress_fn(min(batch_start + len(batch_paths), len(file_iter)), len(file_iter))

    # NOTE: replacement deletes for `files_to_replace` are deferred until
    # after embedding succeeds — see below. Deleting up front made the index
    # vulnerable to a transient embed/ingest failure wiping previously-good
    # data. The single batched delete still happens, just on the durable side
    # of the embedder call.

    # Index git history on full runs (skip for non-git projects). `.git` is a
    # directory in a normal clone but a plain file in linked worktrees and
    # submodules, so test for existence rather than is_dir().
    _is_git = (Path(project_dir) / ".git").exists()
    if full and not target_path and _is_git:
        try:
            git_chunks, git_nodes, git_edges = await index_commits(
                project_dir, since_sha=manifest.last_git_sha
            )
            all_chunks.extend(git_chunks)
            all_nodes.extend(git_nodes)
            all_edges.extend(git_edges)
            if git_chunks:
                head_result = await asyncio.to_thread(
                    subprocess.run,
                    ["git", "rev-parse", "HEAD"],
                    cwd=project_dir, capture_output=True, text=True, check=False,
                )
                if head_result.returncode == 0:
                    manifest.last_git_sha = head_result.stdout.strip()
                if log_fn:
                    log_fn(f" [git] {len(git_chunks)} commit(s) indexed")
        except Exception as exc:
            # Best effort: commit history is supplementary to file indexing.
            log.warning("Git history indexing failed: %s", exc)

    if all_chunks:
        # Embedding is where first-run model downloads happen; isolate failures
        # here so we don't write an index with empty vectors. Crucially, the
        # replacement deletes (files_to_replace) have NOT happened yet, so a
        # download or model failure leaves the previous index intact.
        cache = EmbeddingCache(
            storage_base / "embedding_cache.db",
            model_name=config.embedding_model,
        )
        try:
            embedder = Embedder(model_name=config.embedding_model, cache=cache)
            if phase_fn:
                phase_fn(
                    f"Embedding {len(all_chunks):,} chunks "
                    f"(CPU-bound, can take several minutes on large repos)…"
                )
            try:
                embedder.embed(all_chunks, progress_fn=embed_progress_fn)
            except Exception as exc:
                msg = f"Embedding failed: {exc}"
                result.errors.append(msg)
                log.warning(msg, exc_info=exc)
                # Manifest was updated in-memory in the loop but never reaches
                # disk because we return before manifest.save(); the previous
                # on-disk manifest + index data are still valid.
                return result
            result.cache_hits = cache.hits
            result.cache_misses = cache.misses

            # On a full re-index we know the complete set of live chunk
            # hashes — opportunistically drop any cached embeddings whose
            # source content is no longer present anywhere in the index.
            # Without this the cache grows monotonically forever.
            if full and not target_path:
                try:
                    live_hashes = {
                        cache.content_hash(c.content) for c in all_chunks
                    }
                    pruned = cache.prune_orphans(live_hashes)
                    if pruned and log_fn:
                        log_fn(f" [cache] pruned {pruned} orphan embedding(s)")
                except Exception as exc:
                    log.debug("Embedding cache prune skipped: %s", exc)
        finally:
            cache.close()

        # Embedding succeeded — now it's safe to drop the rows we're about to
        # replace. Still ordered before ingest so the new chunk IDs don't
        # collide with the old ones across the three stores.
        if files_to_replace:
            try:
                await backend.delete_by_files(files_to_replace)
            except Exception as exc:
                msg = f"Pre-ingest delete failed: {exc}"
                result.errors.append(msg)
                log.warning(msg, exc_info=exc)
                # Some rows may already be gone; don't let the old manifest
                # claim these files are still indexed.
                _invalidate_replaced_files()
                return result

        if phase_fn:
            phase_fn(f"Writing {len(all_chunks):,} chunks to vector + FTS + graph index…")
        try:
            await backend.ingest(all_chunks, all_nodes, all_edges)
        except Exception as exc:
            msg = f"Backend ingest failed: {exc}"
            result.errors.append(msg)
            log.warning(msg, exc_info=exc)
            # Old rows were deleted above and the new ones did not (fully)
            # land. Without this, after a failed full run the on-disk hashes
            # still match the files and later incremental runs would skip
            # them, leaving the index empty.
            _invalidate_replaced_files()
            return result

        result.total_chunks = len(all_chunks)
    elif files_to_replace:
        # No new chunks (e.g. all changed files chunked to nothing) but we
        # still need to drop their old rows.
        try:
            await backend.delete_by_files(files_to_replace)
        except Exception as exc:
            msg = f"Replacement delete failed: {exc}"
            result.errors.append(msg)
            log.warning(msg, exc_info=exc)
            return result

    # Prune chunks for files that were in the manifest but no longer on disk.
    # Only meaningful for project-wide runs; skip when a single path was targeted.
    if not target_path:
        previous_rel_paths = set(manifest._entries.keys())  # noqa: SLF001
        removed = list(previous_rel_paths - current_rel_paths)
        if removed:
            try:
                await backend.delete_by_files(removed)
            except Exception as exc:  # pragma: no cover - defensive
                result.errors.append(f"Failed to prune deleted files: {exc}")
                # Keep the manifest entries so the prune is retried next run.
                removed = []
        for deleted in removed:
            try:
                manifest.remove(deleted)
                result.deleted_files.append(deleted)
                if log_fn:
                    log_fn(f" [delete] {deleted} (no longer on disk)")
            except Exception as exc:  # pragma: no cover - defensive
                result.errors.append(f"Failed to prune {deleted}: {exc}")

    manifest.save()
    return result