loki-mode 7.22.0 → 7.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,14 +10,27 @@ Usage:
10
10
  python tools/index-codebase.py # Index everything
11
11
  python tools/index-codebase.py --collection loki # Custom collection name
12
12
  python tools/index-codebase.py --reset # Clear and re-index
13
+ python tools/index-codebase.py --changed # Incremental: only changed files
13
14
  python tools/index-codebase.py --stats # Show index stats
14
15
 
15
16
  Requires:
16
17
  - ChromaDB running on localhost:8100 (docker)
17
18
  - pip install chromadb
19
+
20
+ Incremental freshness (--changed):
21
+ Maintains a manifest at .loki/state/code-index-manifest.json that records,
22
+ per indexed file, its mtime, sha1, and the chunk IDs it produced. The
23
+ --changed mode re-chunks only files whose mtime OR sha1 differ from the
24
+ manifest, upserts the new chunks, deletes chunk IDs that disappeared for a
25
+ changed file (the orphan-chunk fix), and drops all chunks for files removed
26
+ from disk. The --reset and default full-index paths are unchanged in their
27
+ indexing behavior; they additionally write the manifest at the end so a
28
+ later --changed run has an accurate baseline.
18
29
  """
19
30
 
20
31
  import argparse
32
+ import hashlib
33
+ import json
21
34
  import os
22
35
  import re
23
36
  import sys
@@ -30,6 +43,11 @@ import chromadb
30
43
  # Project root
31
44
  PROJECT_ROOT = Path(__file__).parent.parent.resolve()
32
45
 
46
+ # Manifest path (per-file freshness tracking for incremental indexing).
47
+ # Resolved relative to PROJECT_ROOT so the indexer and the MCP staleness
48
+ # check agree on a single location.
49
+ MANIFEST_PATH = PROJECT_ROOT / ".loki" / "state" / "code-index-manifest.json"
50
+
33
51
  # ChromaDB connection
34
52
  CHROMA_HOST = os.environ.get("LOKI_CHROMA_HOST", "localhost")
35
53
  CHROMA_PORT = int(os.environ.get("LOKI_CHROMA_PORT", "8100"))
@@ -333,6 +351,267 @@ def collect_files() -> list[tuple[Path, str]]:
333
351
  return files
334
352
 
335
353
 
354
+ # -----------------------------------------------------------------------------
355
+ # Manifest / incremental freshness (pure logic, no ChromaDB)
356
+ # -----------------------------------------------------------------------------
357
+
358
+
359
def file_sha1(filepath: Path) -> str:
    """Compute the SHA-1 digest of a file's contents.

    The file is read fully into memory as bytes; the digest is returned as a
    hexadecimal string.
    """
    payload = filepath.read_bytes()
    return hashlib.sha1(payload).hexdigest()
364
+
365
+
366
def chunk_file(filepath: Path, file_type: str) -> list[dict]:
    """Chunk one file with the chunker that matches its type.

    "shell", "python" and "markdown" are routed to chunk_shell_file,
    chunk_python_file and chunk_markdown_file respectively. Any other
    file_type yields an empty list.
    """
    if file_type == "shell":
        chunker = chunk_shell_file
    elif file_type == "python":
        chunker = chunk_python_file
    elif file_type == "markdown":
        chunker = chunk_markdown_file
    else:
        # Unknown types produce no chunks rather than raising.
        return []
    return chunker(filepath)
375
+
376
+
377
def load_manifest(manifest_path: Path = MANIFEST_PATH) -> dict:
    """Read the freshness manifest from disk, falling back to an empty one.

    Expected schema:
        {
          "version": 1,
          "collection": "<name>",
          "files": {
            "<rel_path>": {"mtime": <float>, "sha1": "<hex>", "chunk_ids": [...]}
          }
        }

    The parsed document is returned as-is when it is a dict whose "files"
    value is also a dict. Any read or parse failure, or a document of any
    other shape, yields a fresh empty manifest instead of raising.
    """
    empty = {"version": 1, "collection": None, "files": {}}
    try:
        parsed = json.loads(manifest_path.read_text())
    except Exception:
        # Best effort: a missing or corrupt manifest means "no baseline".
        return empty
    if not isinstance(parsed, dict):
        return empty
    if not isinstance(parsed.get("files"), dict):
        return empty
    return parsed
396
+
397
+
398
def save_manifest(manifest: dict, manifest_path: Path = MANIFEST_PATH) -> None:
    """Write the freshness manifest via a temp file followed by a rename.

    The parent directory is created if needed. The JSON is written to a
    sibling "<name>.tmp" file which is then moved over manifest_path, so
    readers never observe a partially written manifest file.
    """
    target_dir = manifest_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    scratch = manifest_path.with_suffix(f"{manifest_path.suffix}.tmp")
    serialized = json.dumps(manifest, indent=2, sort_keys=True)
    scratch.write_text(serialized)
    scratch.replace(manifest_path)
404
+
405
+
406
def build_manifest_entry(filepath: Path, chunk_ids: list[str]) -> dict:
    """Describe a freshly chunked file as a manifest entry.

    The entry holds the file's current mtime, its sha1 (via file_sha1), and a
    copy of the chunk IDs that were produced for it.
    """
    modified_at = os.path.getmtime(filepath)
    digest = file_sha1(filepath)
    # Copy the IDs so later mutation of the caller's list cannot leak in.
    return dict(mtime=modified_at, sha1=digest, chunk_ids=[*chunk_ids])
413
+
414
+
415
def compute_manifest_diff(old_manifest: dict,
                          current_files: list[tuple[str, dict]],
                          present_rel_paths: set) -> dict:
    """Work out which chunk IDs to upsert and delete (no ChromaDB, no disk).

    Args:
        old_manifest: the manifest dict saved by the previous run.
        current_files: (rel_path, entry) pairs for the files re-chunked in
            this run, each entry shaped {"mtime", "sha1", "chunk_ids"}.
            Callers are expected to pass only changed or new files.
        present_rel_paths: rel_paths that currently exist on disk and are in
            scope for indexing. A tracked file missing from both this set and
            current_files is considered removed.

    Returns:
        {
          "upsert_ids": [...],          # new chunk IDs, in input order
          "delete_ids": [...],          # sorted, de-duplicated IDs to remove
          "changed_files": [rel_path],  # files re-chunked this run
          "removed_files": [rel_path],  # tracked files no longer present
        }
    """
    previous = old_manifest.get("files", {})
    rechunked = {rel for rel, _ in current_files}

    to_upsert: list[str] = []
    to_delete: set = set()

    # Re-chunked files: every new ID is upserted; old IDs that the file no
    # longer produces are orphans.
    for rel, entry in current_files:
        fresh_ids = list(entry.get("chunk_ids", []))
        to_upsert += fresh_ids
        stale_ids = set(previous.get(rel, {}).get("chunk_ids", []))
        to_delete |= stale_ids - set(fresh_ids)

    # Tracked files that were neither re-chunked nor found on disk are gone;
    # all of their chunks must be dropped.
    removed = [
        rel for rel in previous
        if rel not in rechunked and rel not in present_rel_paths
    ]
    for rel in removed:
        to_delete.update(previous[rel].get("chunk_ids", []))

    return {
        "upsert_ids": to_upsert,
        "delete_ids": sorted(to_delete),
        "changed_files": [rel for rel, _ in current_files],
        "removed_files": removed,
    }
470
+
471
+
472
def file_is_changed(filepath: Path, rel: str, old_manifest: dict) -> bool:
    """Return True if a file differs from its manifest entry (mtime OR sha1).

    Decision order:
      1. No manifest entry for ``rel``: the file is new, so it is changed.
      2. The current mtime differs from the recorded one: changed. The sha1
         is not consulted, so a bare ``touch`` also counts as changed.
      3. The mtime matches: the sha1 decides, which catches an edit that
         left the mtime untouched.

    Any OSError while stat-ing or reading the file (for example the file
    vanished or is unreadable) is reported as changed rather than raised, so
    one bad file cannot abort a whole incremental run.
    """
    entry = old_manifest.get("files", {}).get(rel)
    if not entry:
        return True
    try:
        if os.path.getmtime(filepath) != entry.get("mtime"):
            return True
        # Hashing reads the file, so it can fail with OSError just like the
        # stat above; keep it inside the same guard.
        return file_sha1(filepath) != entry.get("sha1")
    except OSError:
        return True
488
+
489
+
490
def check_staleness(manifest_path: Path = MANIFEST_PATH) -> dict:
    """Report how many manifest-tracked files no longer match the disk.

    Only the manifest and file mtimes are consulted (no ChromaDB, no
    chunking). A tracked file counts as stale when it is missing from disk,
    when its mtime differs from the recorded one, or when it cannot be
    stat-ed.

    Returns {"stale": bool, "stale_files": int, "manifest_present": bool}.
    When the manifest has no tracked files (including when it cannot be
    loaded) the result is not-stale with "manifest_present" set to False.

    Files that were never indexed do not appear in the manifest and are
    therefore not counted.
    """
    tracked = load_manifest(manifest_path).get("files", {})
    if not tracked:
        return {"stale": False, "stale_files": 0, "manifest_present": False}

    def _is_stale(rel, entry) -> bool:
        target = PROJECT_ROOT / rel
        if not target.exists():
            # Deleted from disk: its chunks are orphans in the index.
            return True
        try:
            return os.path.getmtime(target) != entry.get("mtime")
        except OSError:
            return True

    stale_count = sum(1 for rel, entry in tracked.items() if _is_stale(rel, entry))
    return {
        "stale": stale_count > 0,
        "stale_files": stale_count,
        "manifest_present": True,
    }
519
+
520
+
521
def index_changed(collection):
    """Incremental index: re-chunk only changed files, fix orphan chunks.

    Steps:
      1. Load the manifest. If it was recorded for a different collection it
         says nothing about this one, so it is ignored and every file is
         treated as new.
      2. Re-chunk the files that file_is_changed reports as changed.
      3. Delete orphan chunk IDs and the chunks of removed files, then upsert
         the new chunks file by file.
      4. Save the manifest. An entry is only refreshed when the index really
         reflects it: a file whose upsert failed keeps its previous entry,
         and when the delete call failed, removed files and files with
         pending orphans keep their previous entries too. Those files are
         detected again and retried on the next run.

    Args:
        collection: the ChromaDB collection to update. Its ``name``,
            ``delete(ids=...)`` and ``upsert(ids=, documents=, metadatas=)``
            members are used.

    Returns:
        (changed_count, removed_count, upserted_chunks, deleted_chunks).
        The first two count the files detected as changed / removed; the last
        two count the chunks actually upserted / deleted (0 deleted if the
        delete call failed).
    """
    old_manifest = load_manifest(MANIFEST_PATH)

    # The manifest is shared by all collections. A baseline recorded for
    # another collection would make every file look "unchanged" and leave
    # this collection empty, so discard it.
    recorded = old_manifest.get("collection")
    if recorded is not None and recorded != collection.name:
        print(f" WARNING: manifest was written for collection {recorded!r}, "
              f"not {collection.name!r}; treating every file as new",
              file=sys.stderr)
        old_manifest = {"version": 1, "collection": None, "files": {}}
    old_files = old_manifest.get("files", {})

    files = collect_files()
    present_rel_paths = {
        str(fp.relative_to(PROJECT_ROOT)) for fp, _ in files
    }

    # Re-chunk only files whose mtime or sha1 differs from the manifest.
    current_entries: list[tuple[str, dict]] = []
    chunks_by_rel: dict = {}
    for filepath, file_type in files:
        rel = str(filepath.relative_to(PROJECT_ROOT))
        if not file_is_changed(filepath, rel, old_manifest):
            continue
        try:
            chunks = chunk_file(filepath, file_type)
            # Building the entry stats and hashes the file, which can fail
            # like chunking can; a failure skips just this file.
            entry = build_manifest_entry(filepath, [c["id"] for c in chunks])
        except Exception as e:
            print(f" ERROR chunking {filepath}: {e}", file=sys.stderr)
            continue
        current_entries.append((rel, entry))
        chunks_by_rel[rel] = chunks

    diff = compute_manifest_diff(old_manifest, current_entries, present_rel_paths)

    # Apply deletes first (orphans + removed files), then upserts.
    deleted = 0
    delete_ok = True
    if diff["delete_ids"]:
        try:
            collection.delete(ids=diff["delete_ids"])
            deleted = len(diff["delete_ids"])
        except Exception as e:
            delete_ok = False
            print(f" ERROR deleting orphan chunks: {e}", file=sys.stderr)

    upserted = 0
    failed_upserts: set = set()
    for rel in diff["changed_files"]:
        chunks = chunks_by_rel.get(rel, [])
        if not chunks:
            continue
        try:
            collection.upsert(
                ids=[c["id"] for c in chunks],
                documents=[c["content"] for c in chunks],
                metadatas=[c["metadata"] for c in chunks],
            )
            upserted += len(chunks)
            print(f" upsert {rel}: {len(chunks)} chunks")
        except Exception as e:
            failed_upserts.add(rel)
            print(f" ERROR upserting {rel}: {e}", file=sys.stderr)

    # Update the manifest: keep unchanged entries, refresh changed ones, drop
    # removed files -- but only where the index was really brought up to date.
    new_files = dict(old_files)
    if delete_ok:
        for rel in diff["removed_files"]:
            new_files.pop(rel, None)
    for rel, entry in current_entries:
        if rel in failed_upserts:
            # New chunks never reached the index; keep the old entry (or
            # none) so the file is picked up again next run.
            continue
        if not delete_ok:
            pending = (set(old_files.get(rel, {}).get("chunk_ids", []))
                       - set(entry["chunk_ids"]))
            if pending:
                # Orphans are still in the index; keep the old chunk IDs on
                # record so the next run can delete them.
                continue
        new_files[rel] = entry
    new_manifest = {
        "version": 1,
        "collection": collection.name,
        "files": new_files,
    }
    save_manifest(new_manifest, MANIFEST_PATH)

    return (len(diff["changed_files"]), len(diff["removed_files"]),
            upserted, deleted)
589
+
590
+
591
def write_manifest_for_full_index(collection):
    """Rebuild the manifest from a full pass over all in-scope files.

    Called at the end of --reset and default full-index so a later --changed
    run has an accurate baseline. This only records state: every file from
    collect_files() is chunked again to obtain its chunk IDs, and nothing is
    written to the collection (only ``collection.name`` is read).

    A file that cannot be chunked, stat-ed or hashed is left out of the
    manifest and reported on stderr; it then looks new to the next --changed
    run. One bad file never prevents the manifest from being written.

    Returns:
        The number of files recorded in the manifest.
    """
    files = collect_files()
    new_files: dict = {}
    for filepath, file_type in files:
        try:
            chunks = chunk_file(filepath, file_type)
            # Stat + hash can fail too (e.g. the file vanished mid-pass), so
            # keep them under the same guard as chunking.
            entry = build_manifest_entry(filepath, [c["id"] for c in chunks])
        except Exception as e:
            print(f" WARNING: not tracking {filepath} in manifest: {e}",
                  file=sys.stderr)
            continue
        rel = str(filepath.relative_to(PROJECT_ROOT))
        new_files[rel] = entry
    save_manifest({
        "version": 1,
        "collection": collection.name,
        "files": new_files,
    }, MANIFEST_PATH)
    return len(new_files)
613
+
614
+
336
615
  def index_all(collection, reset: bool = False):
337
616
  """Index the entire codebase."""
338
617
  files = collect_files()
@@ -429,6 +708,8 @@ def main():
429
708
  parser = argparse.ArgumentParser(description="Index loki-mode codebase into ChromaDB")
430
709
  parser.add_argument("--collection", default=COLLECTION_NAME, help="Collection name")
431
710
  parser.add_argument("--reset", action="store_true", help="Clear and re-index")
711
+ parser.add_argument("--changed", action="store_true",
712
+ help="Incremental: re-index only changed files (uses manifest)")
432
713
  parser.add_argument("--stats", action="store_true", help="Show index stats")
433
714
  parser.add_argument("--search", type=str, help="Run a test search query")
434
715
  parser.add_argument("--host", default=CHROMA_HOST, help="ChromaDB host")
@@ -457,11 +738,26 @@ def main():
457
738
  test_search(collection, args.search)
458
739
  return
459
740
 
741
+ if args.changed:
742
+ start = time.time()
743
+ changed, removed, upserted, deleted = index_changed(collection)
744
+ elapsed = time.time() - start
745
+ print(f"\nIncremental done: {changed} changed file(s), "
746
+ f"{removed} removed file(s), {upserted} chunks upserted, "
747
+ f"{deleted} orphan chunks deleted in {elapsed:.1f}s")
748
+ show_stats(collection)
749
+ return
750
+
460
751
  start = time.time()
461
752
  file_count, total_chunks = index_all(collection)
462
753
  elapsed = time.time() - start
463
754
 
755
+ # Additive persistence: record the manifest so a later --changed run has an
756
+ # accurate baseline. Does not change what was indexed above.
757
+ manifest_files = write_manifest_for_full_index(collection)
464
758
  print(f"\nDone: {total_chunks} chunks from {file_count} files in {elapsed:.1f}s")
759
+ print(f"Manifest: {manifest_files} files tracked at "
760
+ f"{MANIFEST_PATH.relative_to(PROJECT_ROOT)}")
465
761
  show_stats(collection)
466
762
 
467
763
  # Run a few test searches