codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,608 @@
1
+ """Index builder: orchestrates full and incremental builds of the code index."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import os
7
+ import time
8
+ from concurrent.futures import ProcessPoolExecutor, as_completed
9
+ from pathlib import Path
10
+ from typing import Any, Callable, Dict, List, Optional, Tuple
11
+
12
+ from corbell.core.gitignore import load_gitignore
13
+ from corbell.core.indexing.tracker import IndexTracker
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ # --------------------------------------------------------------------------- #
19
+ # Module-level worker function (must be picklable for multiprocessing) #
20
+ # --------------------------------------------------------------------------- #
21
+
22
def _extract_file_worker(
    args: Tuple,
) -> List[Any]:
    """Chunk one source file inside a worker process.

    Defined at module level so that ``ProcessPoolExecutor`` can pickle it.

    Args:
        args: An 8-tuple of
            (abs_path_str, rel_path, lang, service_id, repo_str,
             chunk_size, overlap, max_file_bytes).
            The last element is accepted for signature completeness only and
            is not used here.

    Returns:
        Whatever ``CodeChunkExtractor._extract_file`` returns for the file
        (per the original docstring: EmbeddingRecord objects whose embeddings
        are still None at this stage).
    """
    # Unpacking first means a wrongly sized tuple fails before any import work.
    (
        source_path,
        relative_path,
        language,
        owning_service,
        repo_root,
        size,
        chunk_overlap,
        _unused_max_file_bytes,
    ) = args

    from corbell.core.embeddings.extractor import CodeChunkExtractor

    return CodeChunkExtractor(chunk_size=size, overlap=chunk_overlap)._extract_file(
        Path(source_path), relative_path, language, owning_service, repo_root
    )
53
+
54
+
55
def _get_worker_count() -> int:
    """Return the number of parallel workers for indexing.

    Reads the ``CORBELL_INDEX_WORKERS`` environment variable. A valid integer
    is clamped to a minimum of 1. When the variable is unset, blank, or not an
    integer, the default ``min(cpu_count, 8)`` is used (with 4 standing in for
    an unknown CPU count).

    Returns:
        The worker count, always >= 1.
    """
    env_val = os.environ.get("CORBELL_INDEX_WORKERS", "").strip()
    if env_val:
        try:
            return max(1, int(env_val))
        except ValueError:
            # Fall back to the default, but tell the operator the override
            # was ignored instead of discarding it silently.
            logging.getLogger(__name__).warning(
                "Ignoring invalid CORBELL_INDEX_WORKERS=%r; using default worker count",
                env_val,
            )
    return min(os.cpu_count() or 4, 8)
67
+
68
+
69
+ # --------------------------------------------------------------------------- #
70
+ # Batch encoding helpers #
71
+ # --------------------------------------------------------------------------- #
72
+
73
_API_BATCH_SIZE = 100  # conservative limit for API-backed embedding models


def _encode_chunks(model: Any, chunks: List[Any]) -> List[Any]:
    """Encode chunks and attach embeddings in-place.

    For ``GoogleEmbeddingModel`` / ``VoyageEmbeddingModel`` instances the texts
    are sent in groups of ``_API_BATCH_SIZE``; any other model receives the
    full list in a single ``encode`` call. When a Google model reports
    ``uses_prefix_format``, each text is first passed through
    ``model.prepare_document`` with a title built from the chunk's file path
    and either its symbol or its line range.

    Args:
        model: An EmbeddingModel instance.
        chunks: Records exposing ``content``, ``file_path``, ``symbol``,
            ``start_line``, ``end_line`` and a writable ``embedding``.

    Returns:
        The same list with ``embedding`` fields populated.

    Raises:
        ValueError: If the model returns a different number of vectors than
            there are chunks. No chunk is modified in that case.
    """
    from corbell.core.embeddings.model import GoogleEmbeddingModel, VoyageEmbeddingModel

    if not chunks:
        return chunks

    if isinstance(model, GoogleEmbeddingModel) and model.uses_prefix_format:
        texts = [
            model.prepare_document(
                c.content,
                title=(
                    f"{c.file_path}:{c.symbol}"
                    if c.symbol
                    else f"{c.file_path}:L{c.start_line}-{c.end_line}"
                ),
            )
            for c in chunks
        ]
    else:
        texts = [c.content for c in chunks]

    is_api_model = isinstance(model, (GoogleEmbeddingModel, VoyageEmbeddingModel))

    if is_api_model:
        vectors: List[Any] = []
        for start in range(0, len(texts), _API_BATCH_SIZE):
            vectors.extend(model.encode(texts[start : start + _API_BATCH_SIZE]))
    else:
        # Materialise so the count can be checked regardless of the container
        # type the model returns; iteration order is what zip() would see.
        vectors = list(model.encode(texts))

    # zip() stops at the shorter input, so a short or long result would
    # otherwise go unnoticed and leave some chunks without an embedding.
    if len(vectors) != len(chunks):
        raise ValueError(
            f"Embedding model returned {len(vectors)} vectors for {len(chunks)} chunks"
        )

    for chunk, vec in zip(chunks, vectors):
        chunk.embedding = vec

    return chunks
124
+
125
+
126
+ # --------------------------------------------------------------------------- #
127
+ # Parallel file collection helpers #
128
+ # --------------------------------------------------------------------------- #
129
+
130
def _collect_repo_files(
    repo_path: Path,
    repo_id: str,
    max_file_bytes: int,
    gitignore_spec: Any,
) -> List[Tuple[str, str, str]]:
    """List the indexable files under ``repo_path``.

    A path is kept when it is a regular file, none of its path components is
    in ``SKIP_DIRS``, its suffix maps to a language in ``EXTENSION_LANG``, its
    size does not exceed ``max_file_bytes``, and ``gitignore_spec`` does not
    match its repo-relative path (with backslashes normalised to ``/``).
    Files whose size cannot be read are dropped.

    Args:
        repo_path: Root directory to walk recursively.
        repo_id: Repository identifier; not used by this function.
        max_file_bytes: Largest file size, in bytes, that is still collected.
        gitignore_spec: Object exposing ``match_file(rel_path)``.

    Returns:
        ``(abs_path_str, rel_path, lang)`` string tuples, suitable for
        pickling to worker processes.
    """
    from corbell.core.constants import EXTENSION_LANG, SKIP_DIRS

    def _exceeds_limit(candidate: Path) -> bool:
        # An unreadable size is treated the same as "too large": skip the file.
        try:
            return candidate.stat().st_size > max_file_bytes
        except OSError:
            return True

    collected: List[Tuple[str, str, str]] = []
    for candidate in repo_path.rglob("*"):
        # NOTE(review): ``candidate.parts`` includes the components of
        # ``repo_path`` itself, so a repo located under a directory whose name
        # is in SKIP_DIRS would yield nothing — confirm this is intended.
        if not candidate.is_file() or any(
            part in SKIP_DIRS for part in candidate.parts
        ):
            continue
        language = EXTENSION_LANG.get(candidate.suffix)
        if not language or _exceeds_limit(candidate):
            continue
        relative = str(candidate.relative_to(repo_path))
        if not gitignore_spec.match_file(relative.replace("\\", "/")):
            collected.append((str(candidate), relative, language))
    return collected
171
+
172
+
173
class IndexBuilder:
    """Orchestrates building and maintaining the code search index.

    Handles both full builds (--rebuild) and incremental builds (changed files only).
    Uses crash-safe ordering: meta is updated AFTER chunk commits so failed runs
    self-heal on the next invocation.

    Concurrent builds are serialised by an ``IndexLock`` (file-based lock). On
    acquiring the lock the builder re-checks the stale state so a second caller
    that waited for the lock can skip redundant work.
    """

    # A full rebuild is skipped when the tracker reports a build that finished
    # less than this many seconds ago.
    _RECENT_BUILD_WINDOW_SECONDS = 30

    def build(
        self,
        cfg: Any,  # WorkspaceConfig
        db_path: Path,
        rebuild: bool = False,
        repo_filter: Optional[str] = None,
        progress_fn: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """Build or incrementally update the code search index.

        Args:
            cfg: WorkspaceConfig (from build_config()).
            db_path: Path to the SQLite database file; the lock file is
                created next to it as ``index.lock``.
            rebuild: If True, clears the existing data and does a full rebuild.
            repo_filter: If set, only process the repo with this ID.
            progress_fn: Optional callback that receives human-readable
                progress messages.

        Returns:
            Summary dict with stats about the build. ``status`` is one of
            ``"clean"``, ``"already_built"``, ``"full_build"`` or
            ``"incremental"``.

        Raises:
            ValueError: If ``repo_filter`` matches no configured repo, or if
                the embedding model has changed and --rebuild is not set.
        """
        from corbell.core.embeddings.extractor import CodeChunkExtractor
        from corbell.core.embeddings.model import (
            SentenceTransformerModel, GoogleEmbeddingModel, VoyageEmbeddingModel, EmbeddingModel,
        )
        from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
        from corbell.core.graph.sqlite_store import SQLiteGraphStore

        emb_store = SQLiteEmbeddingStore(db_path)
        graph_store = SQLiteGraphStore(db_path)
        tracker = IndexTracker(db_path)

        # Filter repos if requested
        repos = cfg.repos
        if repo_filter:
            repos = [r for r in repos if r.id == repo_filter]
            if not repos:
                raise ValueError(f"Repo '{repo_filter}' not found in workspace config")

        model_name = cfg.storage.resolved_model()

        # Model safety check (skip on full rebuild)
        if not rebuild:
            stored_model = tracker.get_stored_model()
            if stored_model and stored_model != model_name:
                raise ValueError(
                    f"Model changed from '{stored_model}' to '{model_name}'. "
                    f"Run 'corbell index build --rebuild' to re-index."
                )

        indexing = cfg.indexing
        extractor = CodeChunkExtractor(
            chunk_size=indexing.chunk_size,
            overlap=indexing.chunk_overlap,
        )

        # The model backend is chosen purely from the model-name prefix.
        model: EmbeddingModel
        if model_name.startswith("gemini-"):
            model = GoogleEmbeddingModel(model_name)
        elif model_name.startswith("voyage-"):
            model = VoyageEmbeddingModel(model_name)
        else:
            model = SentenceTransformerModel(model_name)

        if rebuild:
            return self._full_build(
                repos, emb_store, graph_store, tracker, extractor, model,
                indexing, model_name, db_path, repo_filter=repo_filter,
                progress_fn=progress_fn,
            )

        stale = tracker.get_stale_files(repos, cfg)
        if not stale.has_changes:
            return {"status": "clean", "chunks_added": 0, "repos_rebuilt": 0}

        return self._incremental_build(
            repos, stale, emb_store, graph_store, tracker,
            extractor, model, cfg, indexing, model_name, db_path,
            progress_fn=progress_fn,
        )

    # ------------------------------------------------------------------ #
    # Shared private helpers                                              #
    # ------------------------------------------------------------------ #

    @staticmethod
    def _service_entry(repo_id: str, repo_path: Path, language: str) -> Dict[str, Any]:
        """Build the service dict handed to the graph builders."""
        return {
            "id": repo_id,
            "resolved_path": repo_path,
            "repo": str(repo_path),
            "language": language,
            "tags": [],
        }

    @staticmethod
    def _make_worker_args(
        files: List[Tuple[str, str, str]],
        repo_id: str,
        repo_path: Path,
        indexing: Any,
    ) -> List[Tuple]:
        """Turn (abs_path_str, rel_path, lang) triples into picklable worker tuples."""
        return [
            (
                abs_path_str,
                rel_path,
                lang,
                repo_id,
                str(repo_path),
                indexing.chunk_size,
                indexing.chunk_overlap,
                indexing.max_file_bytes,
            )
            for abs_path_str, rel_path, lang in files
        ]

    @staticmethod
    def _extract_parallel(worker_args: List[Tuple], workers: int) -> List[Any]:
        """Run ``_extract_file_worker`` over ``worker_args`` in a process pool.

        A failure on one file is logged as a warning and that file is skipped,
        so it does not abort the whole build.
        """
        chunks: List[Any] = []
        with ProcessPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_extract_file_worker, arg): arg for arg in worker_args}
            for future in as_completed(futures):
                try:
                    chunks.extend(future.result())
                except Exception as exc:
                    file_info = futures[future]
                    logger.warning("Failed to extract %s: %s", file_info[0], exc)
        return chunks

    @staticmethod
    def _write_build_meta(tracker: "IndexTracker", model_name: str, indexing: Any) -> None:
        """Record the model, build time and chunking settings in the tracker."""
        tracker.set_meta("embedding_model", model_name)
        tracker.set_meta("last_build_at", str(time.time()))
        tracker.set_meta("chunk_size", str(indexing.chunk_size))
        tracker.set_meta("overlap", str(indexing.chunk_overlap))

    @staticmethod
    def _safe_mtime(path: Path) -> float:
        """Return the file's mtime, or the current time if it cannot be read."""
        try:
            return path.stat().st_mtime
        except OSError:
            return time.time()

    # ------------------------------------------------------------------ #
    # Full build                                                          #
    # ------------------------------------------------------------------ #

    def _full_build(
        self,
        repos: List,
        emb_store: Any,
        graph_store: Any,
        tracker: "IndexTracker",
        extractor: Any,
        model: Any,
        indexing: Any,
        model_name: str,
        db_path: Path,
        repo_filter: Optional[str] = None,
        progress_fn: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """Run a full (re)build across all repos, serialised by IndexLock."""
        from corbell.core.indexing.lock import IndexLock

        lock = IndexLock(db_path.parent / "index.lock")
        with lock:
            return self._full_build_locked(
                repos, emb_store, graph_store, tracker, extractor, model,
                indexing, model_name, repo_filter=repo_filter, progress_fn=progress_fn,
            )

    def _full_build_locked(
        self,
        repos: List,
        emb_store: Any,
        graph_store: Any,
        tracker: "IndexTracker",
        extractor: Any,
        model: Any,
        indexing: Any,
        model_name: str,
        repo_filter: Optional[str] = None,
        progress_fn: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """Inner full build — called while holding IndexLock.

        ``extractor`` is accepted for signature compatibility; extraction is
        done by worker processes that build their own extractor.

        Returns:
            ``{"status": "already_built", ...}`` when the tracker reports a
            build within the last ``_RECENT_BUILD_WINDOW_SECONDS`` seconds,
            otherwise ``{"status": "full_build", "chunks_added": ...,
            "repos_rebuilt": ...}``.
        """
        # C2: Skip if another process already completed a build very recently.
        last_build = tracker.get_last_build_at()
        if (
            last_build is not None
            and (time.time() - last_build) < self._RECENT_BUILD_WINDOW_SECONDS
        ):
            return {"status": "already_built", "chunks_added": 0, "repos_rebuilt": 0}

        # C1: Clear the index inside the lock to eliminate the race window.
        if repo_filter:
            emb_store.clear(service_id=repo_filter)
            graph_store.delete_service_data(repo_filter)
            # NOTE(review): elsewhere in this class remove_tracked receives
            # (file_path, repo_id) pairs; here both elements are the repo id —
            # confirm this really clears the repo's tracked files.
            tracker.remove_tracked([(r.id, r.id) for r in repos])
        else:
            emb_store.clear()
            graph_store.clear()
            tracker.clear_all()

        total_chunks = 0
        total_repos = 0
        services_data = []
        workers = _get_worker_count()

        for repo in repos:
            repo_id = repo.id
            repo_path = repo.resolved_path
            if not repo_path or not repo_path.exists():
                continue

            language = repo.language or "python"
            services_data.append(self._service_entry(repo_id, repo_path, language))

            gitignore_spec = load_gitignore(repo_path)

            # Collect file list for parallel dispatch
            file_list = _collect_repo_files(
                repo_path, repo_id, indexing.max_file_bytes, gitignore_spec
            )

            if not file_list:
                # Nothing to embed, but the repo still counts as rebuilt.
                total_repos += 1
                continue

            if progress_fn:
                progress_fn(
                    f"Extracting {repo_id} ({len(file_list)} files, {workers} workers)..."
                )

            worker_args = self._make_worker_args(file_list, repo_id, repo_path, indexing)
            chunks = self._extract_parallel(worker_args, workers)

            if progress_fn:
                progress_fn(f"Indexing {repo_id} ({len(chunks)} chunks)...")

            if chunks:
                # Encode all chunks for this repo
                _encode_chunks(model, chunks)

                # CRASH-SAFE: commit chunks first, then update meta
                emb_store.upsert_batch(chunks)
                total_chunks += len(chunks)

                # Mark each file as indexed AFTER chunks are committed.
                # NOTE(review): only files that produced at least one chunk
                # are marked — confirm chunk-less files should stay untracked.
                file_mtimes = self._collect_file_mtimes(repo_path, chunks)
                for file_path, mtime in file_mtimes.items():
                    tracker.mark_indexed(file_path, repo_id, mtime)

            total_repos += 1

        # Build graph
        from corbell.core.graph.builder import ServiceGraphBuilder
        from corbell.core.graph.method_graph import MethodGraphBuilder
        if progress_fn:
            progress_fn("Building call graph...")
        sgb = ServiceGraphBuilder(graph_store)
        mgb = MethodGraphBuilder(graph_store)
        sgb.build_from_workspace(services_data, clear_existing=False, method_level=False)
        for svc in services_data:
            mgb.build_for_service(svc["id"], svc["resolved_path"])

        # Store global metadata LAST (after all commits)
        self._write_build_meta(tracker, model_name, indexing)

        return {
            "status": "full_build",
            "chunks_added": total_chunks,
            "repos_rebuilt": total_repos,
        }

    # ------------------------------------------------------------------ #
    # Incremental build                                                   #
    # ------------------------------------------------------------------ #

    def _incremental_build(
        self,
        repos: List,
        stale: Any,
        emb_store: Any,
        graph_store: Any,
        tracker: "IndexTracker",
        extractor: Any,
        model: Any,
        cfg: Any,
        indexing: Any,
        model_name: str,
        db_path: Path,
        progress_fn: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """Re-embed changed files and rebuild graph for affected repos, serialised by IndexLock.

        The ``stale`` argument is superseded by a fresh stale check taken
        after the lock is acquired.
        """
        from corbell.core.indexing.lock import IndexLock

        lock = IndexLock(db_path.parent / "index.lock")
        with lock:
            # Re-check after acquiring lock — another process may have built already
            fresh_stale = tracker.get_stale_files(repos, cfg)
            if not fresh_stale.has_changes:
                return {"status": "clean", "chunks_added": 0, "repos_rebuilt": 0}

            return self._incremental_build_locked(
                repos, fresh_stale, emb_store, graph_store, tracker,
                extractor, model, indexing, model_name, progress_fn=progress_fn,
            )

    def _incremental_build_locked(
        self,
        repos: List,
        stale: Any,
        emb_store: Any,
        graph_store: Any,
        tracker: "IndexTracker",
        extractor: Any,
        model: Any,
        indexing: Any,
        model_name: str,
        progress_fn: Optional[Callable[[str], None]] = None,
    ) -> Dict[str, Any]:
        """Inner incremental build — called while holding IndexLock.

        Deletes chunks for removed files, re-extracts and re-embeds added and
        modified files per repo, then rebuilds graph data for every repo in
        ``stale.changed_repo_ids`` and finally writes the build metadata.

        ``extractor`` is accepted for signature compatibility; extraction is
        done by worker processes that build their own extractor.
        """
        from corbell.core.constants import EXTENSION_LANG
        from corbell.core.graph.builder import ServiceGraphBuilder
        from corbell.core.graph.method_graph import MethodGraphBuilder

        total_chunks = 0
        changed_repo_ids = stale.changed_repo_ids
        workers = _get_worker_count()

        # Build a lookup of repo_id → repo
        repo_map = {r.id: r for r in repos}

        # Handle deleted files
        for file_path, repo_id in stale.deleted:
            emb_store.delete_by_file(file_path, repo_id)
        tracker.remove_tracked(stale.deleted)

        # Group modified + added files by repo
        files_to_reindex: Dict[str, List] = {}
        for file_path, repo_id in stale.added + stale.modified:
            files_to_reindex.setdefault(repo_id, []).append(file_path)

        for repo_id, file_paths in files_to_reindex.items():
            repo = repo_map.get(repo_id)
            if not repo or not repo.resolved_path:
                continue
            repo_path = repo.resolved_path
            if progress_fn:
                progress_fn(
                    f"Re-indexing {len(file_paths)} files in {repo_id} ({workers} workers)..."
                )

            # Delete old chunks for all files in this repo before re-extracting
            for rel_path in file_paths:
                emb_store.delete_by_file(rel_path, repo_id)

            # Only files still present on disk are dispatched; an unknown
            # suffix falls back to "python".
            existing_files: List[Tuple[str, str, str]] = []
            for rel_path in file_paths:
                abs_path = repo_path / rel_path
                if not abs_path.exists():
                    continue
                lang = EXTENSION_LANG.get(abs_path.suffix, "python")
                existing_files.append((str(abs_path), rel_path, lang))

            worker_args = self._make_worker_args(
                existing_files, repo_id, repo_path, indexing
            )
            if not worker_args:
                continue

            # Parallel extraction for this repo's changed files
            chunks = self._extract_parallel(worker_args, workers)

            if chunks:
                # Encode all chunks for this repo batch
                _encode_chunks(model, chunks)

                # CRASH-SAFE: commit chunks first
                emb_store.upsert_batch(chunks)
                total_chunks += len(chunks)

                # Mark all processed files as indexed AFTER commit.
                # NOTE(review): when no chunks are produced for a repo, none
                # of its changed files are marked — confirm they are meant to
                # stay stale.
                for rel_path in file_paths:
                    abs_path = repo_path / rel_path
                    if not abs_path.exists():
                        continue
                    tracker.mark_indexed(rel_path, repo_id, self._safe_mtime(abs_path))

        # Rebuild graph for affected repos
        sgb = ServiceGraphBuilder(graph_store)
        mgb = MethodGraphBuilder(graph_store)
        if progress_fn:
            progress_fn("Rebuilding call graph...")

        for repo_id in changed_repo_ids:
            repo = repo_map.get(repo_id)
            if not repo or not repo.resolved_path:
                continue
            repo_path = repo.resolved_path
            language = repo.language or "python"

            # Remove old graph data for this repo
            graph_store.delete_service_data(repo_id)

            # Rebuild graph for this repo
            svc_data = [self._service_entry(repo_id, repo_path, language)]
            sgb.build_from_workspace(svc_data, clear_existing=False, method_level=False)
            mgb.build_for_service(repo_id, repo_path)

        # Update metadata LAST
        self._write_build_meta(tracker, model_name, indexing)

        return {
            "status": "incremental",
            "chunks_added": total_chunks,
            "repos_rebuilt": len(changed_repo_ids),
            "files_added": len(stale.added),
            "files_modified": len(stale.modified),
            "files_deleted": len(stale.deleted),
        }

    def _collect_file_mtimes(self, repo_path: Path, chunks: List) -> Dict[str, float]:
        """Collect mtimes for all files represented in the chunks list.

        Each distinct ``chunk.file_path`` is looked up once, relative to
        ``repo_path``. If the file cannot be stat'ed, the current time is
        used instead.
        """
        result: Dict[str, float] = {}
        for chunk in chunks:
            rel_path = chunk.file_path
            if rel_path not in result:
                result[rel_path] = self._safe_mtime(repo_path / rel_path)
        return result
608
+