codespine 0.5.10__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.5.10 → codespine-0.6.1}/PKG-INFO +1 -1
- {codespine-0.5.10 → codespine-0.6.1}/codespine/__init__.py +1 -1
- {codespine-0.5.10 → codespine-0.6.1}/codespine/cli.py +151 -22
- {codespine-0.5.10 → codespine-0.6.1}/codespine/config.py +2 -2
- {codespine-0.5.10 → codespine-0.6.1}/codespine/db/store.py +84 -11
- {codespine-0.5.10 → codespine-0.6.1}/codespine/indexer/engine.py +117 -8
- {codespine-0.5.10 → codespine-0.6.1}/codespine/mcp/server.py +156 -53
- {codespine-0.5.10 → codespine-0.6.1}/codespine/watch/watcher.py +106 -59
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.5.10 → codespine-0.6.1}/pyproject.toml +1 -1
- {codespine-0.5.10 → codespine-0.6.1}/LICENSE +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/README.md +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/community.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/context.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/coupling.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/flow.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/analysis/impact.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/db/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/db/schema.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/diff/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/indexer/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/mcp/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/noise/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/noise/blocklist.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/overlay/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/overlay/git_state.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/overlay/merge.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/overlay/store.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/bm25.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/fuzzy.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/hybrid.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/rrf.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/search/vector.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine/watch/__init__.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/SOURCES.txt +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/gindex.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/setup.cfg +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_call_resolver.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_community_detection.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_deadcode.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_java_parser.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_multimodule_index.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_overlay.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_search_ranking.py +0 -0
- {codespine-0.5.10 → codespine-0.6.1}/tests/test_store_recovery.py +0 -0
|
@@ -101,16 +101,17 @@ def main() -> None:
|
|
|
101
101
|
@click.option("--deep/--no-deep", default=False, show_default=True, help="Run expensive global analyses.")
|
|
102
102
|
@click.option(
|
|
103
103
|
"--embed/--no-embed",
|
|
104
|
-
default=
|
|
104
|
+
default=True,
|
|
105
105
|
show_default=True,
|
|
106
|
-
help="Generate vector embeddings
|
|
106
|
+
help="Generate vector embeddings. Uses sentence-transformers if installed (pip install codespine[ml]), otherwise falls back to hash-based vectors.",
|
|
107
107
|
)
|
|
108
108
|
@click.option("--allow-running", is_flag=True, hidden=True, help="Skip MCP running check (used by MCP analyse_project tool).")
|
|
109
109
|
def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool) -> None:
|
|
110
110
|
"""Index a local Java project (auto-detects workspace / Maven / Gradle layout).
|
|
111
111
|
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
Embeddings are generated by default. If sentence-transformers is installed
|
|
113
|
+
(pip install codespine[ml]), high-quality semantic vectors are used; otherwise
|
|
114
|
+
a fast hash-based fallback provides basic vector search.
|
|
114
115
|
"""
|
|
115
116
|
if not allow_running and _is_running():
|
|
116
117
|
click.secho("Stop MCP first ('codespine stop') to index.", fg="yellow")
|
|
@@ -118,6 +119,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
118
119
|
|
|
119
120
|
started = time.perf_counter()
|
|
120
121
|
abs_path = os.path.abspath(path)
|
|
122
|
+
|
|
123
|
+
# Warn about hash fallback early so users know to install [ml]
|
|
124
|
+
if embed:
|
|
125
|
+
from codespine.search.vector import _load_model
|
|
126
|
+
if _load_model() is None:
|
|
127
|
+
click.secho(
|
|
128
|
+
"⚠ sentence-transformers not found — using hash-based embeddings.\n"
|
|
129
|
+
" For better semantic search: pip install codespine[ml]\n",
|
|
130
|
+
fg="yellow",
|
|
131
|
+
)
|
|
132
|
+
|
|
121
133
|
store = GraphStore(read_only=False)
|
|
122
134
|
indexer = JavaIndexer(store)
|
|
123
135
|
|
|
@@ -309,9 +321,27 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
309
321
|
)
|
|
310
322
|
_finish_phase(coup_label, f"{len(coupling_pairs)} coupled file pairs")
|
|
311
323
|
else:
|
|
324
|
+
# Run lightweight versions of flow tracing and dead code from the call
|
|
325
|
+
# graph already built — no community detection or coupling (those are
|
|
326
|
+
# genuinely expensive). This gives partial results without --deep.
|
|
312
327
|
_phase("Detecting communities...", "skipped (large repo; rerun with --deep)")
|
|
313
|
-
|
|
314
|
-
|
|
328
|
+
|
|
329
|
+
flow_label = "Detecting execution flows..."
|
|
330
|
+
_live_phase(flow_label, "running (lightweight)")
|
|
331
|
+
try:
|
|
332
|
+
flows = trace_execution_flows(store, max_depth=3)
|
|
333
|
+
except Exception:
|
|
334
|
+
flows = []
|
|
335
|
+
_finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
|
|
336
|
+
|
|
337
|
+
dead_label = "Finding dead code..."
|
|
338
|
+
_live_phase(dead_label, "running (lightweight)")
|
|
339
|
+
try:
|
|
340
|
+
dead = detect_dead_code(store, limit=100)
|
|
341
|
+
except Exception:
|
|
342
|
+
dead = []
|
|
343
|
+
_finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
|
|
344
|
+
|
|
315
345
|
_phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
|
|
316
346
|
|
|
317
347
|
vector_count = store.query_records(
|
|
@@ -331,7 +361,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
331
361
|
edges = int(edge_count[0]["count"]) if edge_count else 0
|
|
332
362
|
elapsed = time.perf_counter() - started
|
|
333
363
|
|
|
334
|
-
|
|
364
|
+
if not embed:
|
|
365
|
+
embed_note = " (no embeddings; rerun with --embed for semantic search)"
|
|
366
|
+
elif _load_model() is None:
|
|
367
|
+
embed_note = " (hash embeddings; pip install codespine[ml] for better search)"
|
|
368
|
+
else:
|
|
369
|
+
embed_note = ""
|
|
335
370
|
module_info = f"{len(modules_with_ids)} modules/projects, " if is_multi else ""
|
|
336
371
|
click.echo()
|
|
337
372
|
click.secho(
|
|
@@ -339,6 +374,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
339
374
|
fg="green",
|
|
340
375
|
)
|
|
341
376
|
|
|
377
|
+
# Detect unresolved imports → hint about unindexed sibling projects
|
|
378
|
+
try:
|
|
379
|
+
unresolved = JavaIndexer.detect_unresolved_imports(store)
|
|
380
|
+
if unresolved:
|
|
381
|
+
click.echo()
|
|
382
|
+
click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
|
|
383
|
+
for pkg, samples in sorted(unresolved.items())[:8]:
|
|
384
|
+
click.echo(f" {pkg} (e.g. {samples[0]})")
|
|
385
|
+
except Exception:
|
|
386
|
+
pass # best-effort
|
|
387
|
+
|
|
342
388
|
# Publish a read replica so MCP and read-only CLI commands (search, stats…)
|
|
343
389
|
# run against an isolated snapshot rather than competing with the write
|
|
344
390
|
# process's buffer pool. The MCP daemon detects the sentinel file and
|
|
@@ -579,7 +625,14 @@ def list_projects(as_json: bool) -> None:
|
|
|
579
625
|
@main.command()
|
|
580
626
|
@click.option("--json", "as_json", is_flag=True)
|
|
581
627
|
def status(as_json: bool) -> None:
|
|
582
|
-
"""Show service and database status.
|
|
628
|
+
"""Show service and database status.
|
|
629
|
+
|
|
630
|
+
Quick reference for MCP server management:
|
|
631
|
+
codespine start – launch background MCP server
|
|
632
|
+
codespine stop – stop background MCP server
|
|
633
|
+
codespine status – this command
|
|
634
|
+
codespine mcp – run MCP in foreground (stdio, for IDE integration)
|
|
635
|
+
"""
|
|
583
636
|
running = _is_running()
|
|
584
637
|
pid = None
|
|
585
638
|
if os.path.exists(SETTINGS.pid_file):
|
|
@@ -590,17 +643,35 @@ def status(as_json: bool) -> None:
|
|
|
590
643
|
pid = None
|
|
591
644
|
store = GraphStore(read_only=True)
|
|
592
645
|
overlay = get_overlay_status(store)
|
|
646
|
+
|
|
647
|
+
# Check for stale PID file
|
|
648
|
+
stale_pid = pid is not None and not running
|
|
649
|
+
has_snapshot = os.path.exists(SETTINGS.db_snapshot_path)
|
|
650
|
+
|
|
593
651
|
payload = {
|
|
594
652
|
"running": running,
|
|
595
653
|
"pid": pid,
|
|
654
|
+
"stale_pid": stale_pid,
|
|
596
655
|
"pid_file": SETTINGS.pid_file,
|
|
597
656
|
"db_path": SETTINGS.db_path,
|
|
598
657
|
"db_size_bytes": _db_size_bytes(SETTINGS.db_path),
|
|
658
|
+
"read_replica": SETTINGS.db_snapshot_path if has_snapshot else None,
|
|
659
|
+
"read_replica_size_bytes": _db_size_bytes(SETTINGS.db_snapshot_path) if has_snapshot else 0,
|
|
599
660
|
"log_file": SETTINGS.log_file,
|
|
600
661
|
"overlay_dir": SETTINGS.overlay_dir,
|
|
601
662
|
"overlay_projects": overlay,
|
|
602
663
|
}
|
|
603
|
-
|
|
664
|
+
if as_json:
|
|
665
|
+
_echo_json(payload, True)
|
|
666
|
+
else:
|
|
667
|
+
_echo_json(payload, True)
|
|
668
|
+
if stale_pid:
|
|
669
|
+
click.secho(f"\n⚠ Stale PID file found (PID {pid} not running). Run 'codespine stop' to clean up.", fg="yellow")
|
|
670
|
+
if not running:
|
|
671
|
+
click.echo("\nTo start: codespine start")
|
|
672
|
+
click.echo("For IDE: codespine mcp (stdio mode)")
|
|
673
|
+
else:
|
|
674
|
+
click.echo(f"\nMCP server running (PID {pid}). Stop with: codespine stop")
|
|
604
675
|
|
|
605
676
|
|
|
606
677
|
@main.command("overlay-status")
|
|
@@ -680,17 +751,25 @@ def clear_project_cmd(project_id: str, allow_running: bool) -> None:
|
|
|
680
751
|
if not allow_running and _is_running():
|
|
681
752
|
click.secho("Stop MCP first ('codespine stop') to modify index.", fg="yellow")
|
|
682
753
|
return
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
754
|
+
try:
|
|
755
|
+
store = GraphStore(read_only=False)
|
|
756
|
+
recs = store.query_records(
|
|
757
|
+
"MATCH (p:Project) WHERE p.id = $pid RETURN p.id as id, p.path as path",
|
|
758
|
+
{"pid": project_id},
|
|
759
|
+
)
|
|
760
|
+
except Exception as exc:
|
|
761
|
+
click.secho(f"DB is corrupted ({exc}). Use 'codespine force-reset' to wipe all data.", fg="red")
|
|
762
|
+
return
|
|
688
763
|
if not recs:
|
|
689
764
|
click.secho(f"Project '{project_id}' not found in index.", fg="yellow")
|
|
690
765
|
return
|
|
691
766
|
project_path = recs[0].get("path", "")
|
|
692
|
-
|
|
693
|
-
|
|
767
|
+
try:
|
|
768
|
+
store.clear_analysis_artifacts()
|
|
769
|
+
store.clear_project(project_id)
|
|
770
|
+
except Exception as exc:
|
|
771
|
+
click.secho(f"DB write failed ({exc}). Use 'codespine force-reset' to recover.", fg="red")
|
|
772
|
+
return
|
|
694
773
|
store.overlay_store.clear_project(project_id)
|
|
695
774
|
meta_path = JavaIndexer._meta_cache_path(project_id)
|
|
696
775
|
if os.path.exists(meta_path):
|
|
@@ -713,9 +792,23 @@ def clear_index_cmd(allow_running: bool) -> None:
|
|
|
713
792
|
if not allow_running and _is_running():
|
|
714
793
|
click.secho("Stop MCP first ('codespine stop') to modify index.", fg="yellow")
|
|
715
794
|
return
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
795
|
+
try:
|
|
796
|
+
store = GraphStore(read_only=False)
|
|
797
|
+
projects = store.query_records("MATCH (p:Project) RETURN p.id as id")
|
|
798
|
+
except Exception:
|
|
799
|
+
# DB is corrupted — can't even open it. Force-delete everything.
|
|
800
|
+
click.secho("DB is corrupted. Running force-reset instead...", fg="yellow")
|
|
801
|
+
removed = GraphStore.force_delete_all_data()
|
|
802
|
+
click.secho(f"Force-reset complete. {len(removed)} path(s) removed. Index is now empty.", fg="green")
|
|
803
|
+
return
|
|
804
|
+
try:
|
|
805
|
+
store.rebuild_empty_db()
|
|
806
|
+
except Exception as exc:
|
|
807
|
+
# rebuild_empty_db failed even with fallbacks — force-delete.
|
|
808
|
+
click.secho(f"rebuild failed ({exc}). Running force-reset...", fg="yellow")
|
|
809
|
+
GraphStore.force_delete_all_data()
|
|
810
|
+
click.secho("Force-reset complete. Index is now empty.", fg="green")
|
|
811
|
+
return
|
|
719
812
|
store.overlay_store.clear_all()
|
|
720
813
|
for p in projects:
|
|
721
814
|
meta_path = JavaIndexer._meta_cache_path(p["id"])
|
|
@@ -727,6 +820,33 @@ def clear_index_cmd(allow_running: bool) -> None:
|
|
|
727
820
|
click.secho(f"Cleared {len(projects)} project(s). Index is now empty.", fg="green")
|
|
728
821
|
|
|
729
822
|
|
|
823
|
+
@main.command("force-reset")
@click.option("--force", is_flag=True, help="Skip confirmation prompt.")
def force_reset_cmd(force: bool) -> None:
    """Emergency reset: delete ALL CodeSpine data files without touching the DB engine.

    Use this when the buffer pool is exhausted and normal reset/clear commands
    also fail with OOM. This bypasses Kuzu entirely by removing data files
    from disk, including the DB, read replica, overlay, meta cache, and
    embedding cache.

    After running this, restart the MCP server and re-index your projects.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # user-facing output and must not be reworded casually.

    # --force skips the prompt entirely; otherwise the user must say yes.
    confirmed = force or click.confirm(
        "This will DELETE all CodeSpine data (DB, overlay, caches). Continue?"
    )
    if not confirmed:
        click.echo("Aborted.")
        return

    # The deletion works purely on the filesystem and reports what it removed.
    deleted_paths = GraphStore.force_delete_all_data()
    if not deleted_paths:
        click.secho("Nothing to remove — already clean.", fg="yellow")
        return

    for deleted in deleted_paths:
        click.echo(f" removed: {deleted}")
    click.secho(f"\nForce-reset complete. {len(deleted_paths)} path(s) removed.", fg="green")
    click.echo("Next: restart MCP ('codespine stop && codespine start') and re-index.")
|
|
848
|
+
|
|
849
|
+
|
|
730
850
|
@main.command()
|
|
731
851
|
def setup() -> None:
|
|
732
852
|
"""Print local setup checks and next steps."""
|
|
@@ -743,12 +863,21 @@ def setup() -> None:
|
|
|
743
863
|
checks[mod] = True
|
|
744
864
|
except Exception:
|
|
745
865
|
checks[mod] = False
|
|
746
|
-
click.echo("
|
|
866
|
+
click.echo("Core dependencies:")
|
|
747
867
|
for mod, ok in checks.items():
|
|
748
868
|
click.echo(f" - {mod}: {'OK' if ok else 'MISSING'}")
|
|
749
|
-
|
|
750
|
-
|
|
869
|
+
# Check optional ML dependencies
|
|
870
|
+
try:
|
|
871
|
+
from sentence_transformers import SentenceTransformer
|
|
872
|
+
click.echo(" - sentence-transformers: OK (semantic embeddings active)")
|
|
873
|
+
except ImportError:
|
|
874
|
+
click.secho(" - sentence-transformers: NOT INSTALLED (hash fallback; install for better search)", fg="yellow")
|
|
875
|
+
click.echo("\nRecommended setup:")
|
|
876
|
+
click.echo(" pip install -e '.[full]' # core + ML + community detection")
|
|
877
|
+
click.echo(" pip install -e '.[ml]' # just ML embeddings")
|
|
878
|
+
click.echo("\nQuick start:")
|
|
751
879
|
click.echo(" codespine analyse /path/to/java-project --full")
|
|
880
|
+
click.echo(" codespine start # launch MCP server")
|
|
752
881
|
click.echo(" codespine search payment --json")
|
|
753
882
|
|
|
754
883
|
|
|
@@ -16,8 +16,8 @@ class Settings:
|
|
|
16
16
|
rrf_k: int = 60
|
|
17
17
|
semantic_candidate_pool: int = 2000
|
|
18
18
|
write_batch_size: int = 500
|
|
19
|
-
index_file_batch_size: int =
|
|
20
|
-
edge_write_batch_size: int =
|
|
19
|
+
index_file_batch_size: int = 20
|
|
20
|
+
edge_write_batch_size: int = 500
|
|
21
21
|
default_coupling_months: int = 6
|
|
22
22
|
default_min_coupling_strength: float = 0.3
|
|
23
23
|
default_min_cochanges: int = 3
|
|
@@ -147,10 +147,13 @@ class GraphStore:
|
|
|
147
147
|
|
|
148
148
|
def clear_project(self, project_id: str) -> None:
|
|
149
149
|
file_recs = self.query_records("MATCH (f:File) WHERE f.project_id = $pid RETURN f.id as id", {"pid": project_id})
|
|
150
|
+
# Small batches (10 files per tx) prevent buffer pool OOM on large projects.
|
|
150
151
|
for idx, rec in enumerate(file_recs, start=1):
|
|
151
|
-
self.
|
|
152
|
-
|
|
152
|
+
with self.transaction():
|
|
153
|
+
self.clear_file(rec["id"])
|
|
154
|
+
if idx % 10 == 0:
|
|
153
155
|
self._recycle_conn()
|
|
156
|
+
self._recycle_conn()
|
|
154
157
|
self.execute("MATCH (p:Project) WHERE p.id = $pid DETACH DELETE p", {"pid": project_id})
|
|
155
158
|
self._recycle_conn()
|
|
156
159
|
|
|
@@ -502,24 +505,94 @@ class GraphStore:
|
|
|
502
505
|
self.clear_flows()
|
|
503
506
|
self.clear_coupling()
|
|
504
507
|
|
|
508
|
+
@staticmethod
def force_delete_all_data() -> list[str]:
    """Delete all CodeSpine data files without touching the Kuzu engine.

    This is the nuclear option for OOM recovery: it issues no database
    calls at all and simply removes the configured data paths from disk so
    the next start begins from an empty state.

    The paths considered are the primary DB, the read-replica snapshot (and
    its ``.updated`` / ``.tmp`` companions), the embedding cache, the overlay
    directory, the index meta directory, and any ``.wal`` / ``.lock``
    sidecar files next to the primary DB.

    Deletion is best-effort: a path that cannot be removed is skipped and
    the remaining paths are still processed.

    Returns:
        The paths that existed beforehand and are confirmed gone afterwards,
        in the order they were processed.
    """
    candidates = [
        SETTINGS.db_path,
        SETTINGS.db_snapshot_path,
        SETTINGS.db_snapshot_path + ".updated",
        SETTINGS.db_snapshot_path + ".tmp",
        SETTINGS.embedding_cache_path,
        SETTINGS.overlay_dir,
        SETTINGS.index_meta_dir,
        # Stale WAL / lock sidecars that live next to the DB.
        SETTINGS.db_path + ".wal",
        SETTINGS.db_path + ".lock",
    ]

    removed: list[str] = []
    for path in candidates:
        if not os.path.exists(path):
            continue
        try:
            if os.path.isdir(path):
                # ignore_errors keeps the tree removal best-effort: as much
                # as possible is deleted even if individual entries fail.
                shutil.rmtree(path, ignore_errors=True)
            else:
                os.remove(path)
        except OSError:
            continue
        # rmtree(ignore_errors=True) never raises, so success has to be
        # verified explicitly; otherwise a directory that could not be
        # deleted would still be reported to the user as removed.
        if not os.path.exists(path):
            removed.append(path)
    return removed
|
|
549
|
+
|
|
505
550
|
def rebuild_empty_db(self) -> None:
|
|
506
551
|
self._recycle_conn()
|
|
507
552
|
path = SETTINGS.db_path
|
|
553
|
+
# Remove the DB directory AND any stale WAL / lock files
|
|
508
554
|
self._remove_db_path(path)
|
|
555
|
+
for suffix in (".wal", ".lock"):
|
|
556
|
+
sidecar = path + suffix
|
|
557
|
+
if os.path.exists(sidecar):
|
|
558
|
+
try:
|
|
559
|
+
os.remove(sidecar)
|
|
560
|
+
except OSError:
|
|
561
|
+
pass
|
|
562
|
+
|
|
509
563
|
# Kuzu may retain stale internal state from a previous failed open of
|
|
510
|
-
# this path (e.g. after Ctrl+C mid-write).
|
|
511
|
-
#
|
|
512
|
-
#
|
|
564
|
+
# this path (e.g. after Ctrl+C mid-write). The in-process C++ state
|
|
565
|
+
# is poisoned and will raise "unordered_map::at: key not found" on any
|
|
566
|
+
# new kuzu.Database() call — even for a freshly deleted path.
|
|
567
|
+
#
|
|
568
|
+
# Strategy: try primary → try /tmp fallback → force-delete everything
|
|
569
|
+
# and re-import kuzu to get a clean C++ state.
|
|
513
570
|
try:
|
|
514
571
|
self.db = self._open_db(path)
|
|
515
|
-
except Exception as
|
|
572
|
+
except Exception as exc1:
|
|
573
|
+
LOGGER.warning("rebuild_empty_db: primary path failed (%s)", exc1)
|
|
516
574
|
fallback = os.path.join("/tmp", ".codespine_db")
|
|
517
|
-
LOGGER.warning(
|
|
518
|
-
"Could not open fresh DB at %s after rebuild (%s); falling back to %s",
|
|
519
|
-
path, exc, fallback,
|
|
520
|
-
)
|
|
521
575
|
self._remove_db_path(fallback)
|
|
522
|
-
|
|
576
|
+
for suffix in (".wal", ".lock"):
|
|
577
|
+
sidecar = fallback + suffix
|
|
578
|
+
if os.path.exists(sidecar):
|
|
579
|
+
try:
|
|
580
|
+
os.remove(sidecar)
|
|
581
|
+
except OSError:
|
|
582
|
+
pass
|
|
583
|
+
try:
|
|
584
|
+
self.db = self._open_db(fallback)
|
|
585
|
+
except Exception as exc2:
|
|
586
|
+
# Nuclear option: force-delete all files and reimport kuzu
|
|
587
|
+
# so the C++ runtime starts from a completely clean state.
|
|
588
|
+
LOGGER.warning("rebuild_empty_db: fallback also failed (%s); force-resetting", exc2)
|
|
589
|
+
self.force_delete_all_data()
|
|
590
|
+
import importlib
|
|
591
|
+
importlib.reload(kuzu)
|
|
592
|
+
try:
|
|
593
|
+
self.db = kuzu.Database(path, buffer_pool_size=_WRITE_BUFFER_POOL_SIZE)
|
|
594
|
+
except TypeError:
|
|
595
|
+
self.db = kuzu.Database(path)
|
|
523
596
|
self._tls = threading.local()
|
|
524
597
|
ensure_schema(self._conn())
|
|
525
598
|
|
|
@@ -253,6 +253,20 @@ class JavaIndexer:
|
|
|
253
253
|
for fid in delete_chunk:
|
|
254
254
|
self.store.clear_file(fid)
|
|
255
255
|
self.store._recycle_conn()
|
|
256
|
+
|
|
257
|
+
# Clean up stale project entries that point to the same path under a
|
|
258
|
+
# different ID (e.g. re-indexing "vision-server" directly after it was
|
|
259
|
+
# previously indexed as "vision::vision-server" from a workspace root).
|
|
260
|
+
try:
|
|
261
|
+
stale = self.store.query_records(
|
|
262
|
+
"MATCH (p:Project) WHERE p.path = $path AND p.id <> $pid RETURN p.id as id",
|
|
263
|
+
{"path": root_path, "pid": project_id},
|
|
264
|
+
)
|
|
265
|
+
for old in stale:
|
|
266
|
+
self.store.clear_project(old["id"])
|
|
267
|
+
except Exception:
|
|
268
|
+
pass # best-effort cleanup
|
|
269
|
+
|
|
256
270
|
self.store.upsert_project(project_id, root_path)
|
|
257
271
|
|
|
258
272
|
for parse_chunk in self._chunked(parse_results, file_batch_size):
|
|
@@ -279,7 +293,7 @@ class JavaIndexer:
|
|
|
279
293
|
"hash": file_digest,
|
|
280
294
|
}
|
|
281
295
|
)
|
|
282
|
-
self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source))
|
|
296
|
+
self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source), imports=parsed.imports)
|
|
283
297
|
|
|
284
298
|
for cls in parsed.classes:
|
|
285
299
|
c_id = class_id(cls.fqcn, scope)
|
|
@@ -372,15 +386,31 @@ class JavaIndexer:
|
|
|
372
386
|
class_methods[c_id][method.signature] = m_id
|
|
373
387
|
files_indexed += 1
|
|
374
388
|
|
|
389
|
+
# Split writes into smaller transactions and recycle between each
|
|
390
|
+
# to prevent Kuzu WAL from exhausting the buffer pool on large
|
|
391
|
+
# incremental re-indexes (GH feedback: 1,604-file OOM).
|
|
392
|
+
if not full:
|
|
393
|
+
for clear_sub in self._chunked(file_rows, 10):
|
|
394
|
+
with self.store.transaction():
|
|
395
|
+
for row in clear_sub:
|
|
396
|
+
self.store.clear_file(row["id"])
|
|
397
|
+
self.store._recycle_conn()
|
|
375
398
|
with self.store.transaction():
|
|
376
|
-
for row in file_rows:
|
|
377
|
-
if not full:
|
|
378
|
-
self.store.clear_file(row["id"])
|
|
379
399
|
self.store.upsert_files_batch(file_rows)
|
|
400
|
+
self.store._recycle_conn()
|
|
401
|
+
with self.store.transaction():
|
|
380
402
|
self.store.upsert_classes_batch(class_rows)
|
|
381
|
-
self.store.upsert_methods_batch(method_rows)
|
|
382
|
-
self.store.upsert_symbols_batch(symbol_rows)
|
|
383
403
|
self.store._recycle_conn()
|
|
404
|
+
_METHOD_SUB_BATCH = 200
|
|
405
|
+
for method_sub in self._chunked(method_rows, _METHOD_SUB_BATCH):
|
|
406
|
+
with self.store.transaction():
|
|
407
|
+
self.store.upsert_methods_batch(method_sub)
|
|
408
|
+
self.store._recycle_conn()
|
|
409
|
+
_SYMBOL_SUB_BATCH = 200
|
|
410
|
+
for symbol_sub in self._chunked(symbol_rows, _SYMBOL_SUB_BATCH):
|
|
411
|
+
with self.store.transaction():
|
|
412
|
+
self.store.upsert_symbols_batch(symbol_sub)
|
|
413
|
+
self.store._recycle_conn()
|
|
384
414
|
|
|
385
415
|
self._emit(progress, "resolve_calls_start")
|
|
386
416
|
call_rows: list[dict] = []
|
|
@@ -697,7 +727,10 @@ class JavaIndexer:
|
|
|
697
727
|
return
|
|
698
728
|
|
|
699
729
|
@staticmethod
|
|
700
|
-
def _update_meta_cache_entry(
|
|
730
|
+
def _update_meta_cache_entry(
|
|
731
|
+
meta_cache: dict[str, dict], fid: str, file_path: str, digest: str, size_hint: int,
|
|
732
|
+
imports: list[str] | None = None,
|
|
733
|
+
) -> None:
|
|
701
734
|
try:
|
|
702
735
|
st = os.stat(file_path)
|
|
703
736
|
mtime_ns = int(getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000)))
|
|
@@ -705,7 +738,10 @@ class JavaIndexer:
|
|
|
705
738
|
except OSError:
|
|
706
739
|
mtime_ns = -1
|
|
707
740
|
size = size_hint
|
|
708
|
-
|
|
741
|
+
entry: dict = {"mtime_ns": mtime_ns, "size": size, "hash": digest}
|
|
742
|
+
if imports is not None:
|
|
743
|
+
entry["imports"] = imports
|
|
744
|
+
meta_cache[fid] = entry
|
|
709
745
|
|
|
710
746
|
@staticmethod
|
|
711
747
|
def _prune_meta_cache(meta_cache: dict[str, dict], current_file_ids: set[str]) -> None:
|
|
@@ -728,3 +764,76 @@ class JavaIndexer:
|
|
|
728
764
|
return normalized.split("/src/", 1)[0]
|
|
729
765
|
scope = os.path.dirname(normalized).strip()
|
|
730
766
|
return scope or "."
|
|
767
|
+
|
|
768
|
+
@staticmethod
def detect_unresolved_imports(store) -> dict[str, list[str]]:
    """Detect imports that reference packages not covered by any indexed project.

    Imports are read from the ``"imports"`` lists stored in the per-project
    JSON meta-cache files under ``SETTINGS.index_meta_dir`` and compared
    against the class FQCNs currently present in the graph.

    An import is considered resolved (and is not reported) when:

    * it starts with one of the well-known standard-library / third-party
      prefixes listed below,
    * it is itself an indexed class FQCN,
    * its parent package contains at least one indexed class, or
    * its parent is itself an indexed class (e.g. a static-member or
      nested-type import of an indexed class).

    Args:
        store: Object exposing ``query_records(cypher)`` that returns an
            iterable of dict-like records.

    Returns:
        A mapping from an unresolved base package (the first three dotted
        segments of the import, e.g. ``"com.foo.bar"``) to at most five
        sample imports. Imports are processed in sorted order, so the
        samples are stable across runs. An empty dict is returned when the
        class query fails.
    """
    skip_prefixes = (
        "java.", "javax.", "jakarta.",
        "org.apache.", "org.springframework.", "org.hibernate.",
        "org.slf4j.", "org.junit.", "org.mockito.",
        "com.google.", "com.fasterxml.", "com.sun.",
        "io.micrometer.", "io.netty.", "io.lettuce.",
        "lombok.", "reactor.", "rx.",
    )
    max_samples = 5

    # 1. Collect every indexed class FQCN and the packages they live in.
    try:
        class_recs = store.query_records("MATCH (c:Class) RETURN c.fqcn as fqcn")
    except Exception:
        # Best-effort hinting: an unreadable index simply yields no hints.
        return {}
    indexed_fqcns = {rec["fqcn"] for rec in class_recs if rec.get("fqcn")}
    indexed_packages = {
        fqcn.rsplit(".", 1)[0] for fqcn in indexed_fqcns if "." in fqcn
    }

    # 2. Collect the imports recorded in the meta-cache files.
    all_imports: set[str] = set()
    meta_dir = SETTINGS.index_meta_dir
    if os.path.isdir(meta_dir):
        for fname in os.listdir(meta_dir):
            if not fname.endswith(".json"):
                continue
            try:
                with open(os.path.join(meta_dir, fname), "r", encoding="utf-8") as fh:
                    data = json.load(fh)
                for fmeta in data.values():
                    # Entries without an "imports" key contribute nothing;
                    # non-string items are dropped so the prefix checks
                    # below cannot fail on them.
                    all_imports.update(
                        imp
                        for imp in (fmeta.get("imports") or ())
                        if isinstance(imp, str)
                    )
            except (OSError, ValueError, TypeError, AttributeError):
                # Unreadable or malformed cache file: skip it, keep going.
                continue

    # 3. Filter out resolved imports and group the rest by base package.
    #    Sorting makes the retained samples deterministic (set iteration
    #    order for strings varies between interpreter runs).
    unresolved: dict[str, list[str]] = {}
    for imp in sorted(all_imports):
        if imp.startswith(skip_prefixes):
            continue
        if imp in indexed_fqcns:
            continue
        parent = imp.rsplit(".", 1)[0] if "." in imp else ""
        if parent in indexed_packages:
            continue  # same package, just not this specific class
        if parent in indexed_fqcns:
            continue  # member / nested type of a class that is indexed
        base_pkg = ".".join(imp.split(".")[:3])
        samples = unresolved.setdefault(base_pkg, [])
        if len(samples) < max_samples:
            samples.append(imp)
    return unresolved
|