codespine 0.5.9__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.5.9 → codespine-0.6.0}/PKG-INFO +1 -1
- {codespine-0.5.9 → codespine-0.6.0}/codespine/__init__.py +1 -1
- {codespine-0.5.9 → codespine-0.6.0}/codespine/cli.py +147 -20
- {codespine-0.5.9 → codespine-0.6.0}/codespine/config.py +2 -2
- {codespine-0.5.9 → codespine-0.6.0}/codespine/db/store.py +47 -2
- {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/engine.py +117 -8
- {codespine-0.5.9 → codespine-0.6.0}/codespine/mcp/server.py +146 -51
- {codespine-0.5.9 → codespine-0.6.0}/codespine/watch/watcher.py +106 -59
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.5.9 → codespine-0.6.0}/pyproject.toml +1 -1
- {codespine-0.5.9 → codespine-0.6.0}/LICENSE +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/README.md +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/community.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/context.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/coupling.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/flow.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/impact.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/db/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/db/schema.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/diff/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/mcp/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/noise/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/noise/blocklist.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/git_state.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/merge.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/store.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/bm25.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/fuzzy.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/hybrid.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/rrf.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/search/vector.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine/watch/__init__.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/SOURCES.txt +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/gindex.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/setup.cfg +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_call_resolver.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_community_detection.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_deadcode.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_java_parser.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_multimodule_index.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_overlay.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_search_ranking.py +0 -0
- {codespine-0.5.9 → codespine-0.6.0}/tests/test_store_recovery.py +0 -0
|
@@ -77,6 +77,19 @@ def _dead_result_count(dead_result: list[dict] | None) -> int:
|
|
|
77
77
|
return sum(1 for item in dead_result if isinstance(item, dict) and "_stats" not in item)
|
|
78
78
|
|
|
79
79
|
|
|
80
|
+
def _bar(done: int, total: int, width: int = 20) -> str:
|
|
81
|
+
"""Return an ASCII progress bar like [████████░░░░] 40%."""
|
|
82
|
+
if total <= 0:
|
|
83
|
+
return f"[{'░' * width}] ---%"
|
|
84
|
+
frac = min(done / total, 1.0)
|
|
85
|
+
filled = int(width * frac)
|
|
86
|
+
return f"[{'█' * filled}{'░' * (width - filled)}] {int(frac * 100):3d}%"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _spinner_char() -> str:
|
|
90
|
+
return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
|
|
91
|
+
|
|
92
|
+
|
|
80
93
|
@click.group()
|
|
81
94
|
def main() -> None:
|
|
82
95
|
"""CodeSpine CLI."""
|
|
@@ -88,16 +101,17 @@ def main() -> None:
|
|
|
88
101
|
@click.option("--deep/--no-deep", default=False, show_default=True, help="Run expensive global analyses.")
|
|
89
102
|
@click.option(
|
|
90
103
|
"--embed/--no-embed",
|
|
91
|
-
default=
|
|
104
|
+
default=True,
|
|
92
105
|
show_default=True,
|
|
93
|
-
help="Generate vector embeddings
|
|
106
|
+
help="Generate vector embeddings. Uses sentence-transformers if installed (pip install codespine[ml]), otherwise falls back to hash-based vectors.",
|
|
94
107
|
)
|
|
95
108
|
@click.option("--allow-running", is_flag=True, hidden=True, help="Skip MCP running check (used by MCP analyse_project tool).")
|
|
96
109
|
def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool) -> None:
|
|
97
110
|
"""Index a local Java project (auto-detects workspace / Maven / Gradle layout).
|
|
98
111
|
|
|
99
|
-
|
|
100
|
-
|
|
112
|
+
Embeddings are generated by default. If sentence-transformers is installed
|
|
113
|
+
(pip install codespine[ml]), high-quality semantic vectors are used; otherwise
|
|
114
|
+
a fast hash-based fallback provides basic vector search.
|
|
101
115
|
"""
|
|
102
116
|
if not allow_running and _is_running():
|
|
103
117
|
click.secho("Stop MCP first ('codespine stop') to index.", fg="yellow")
|
|
@@ -105,6 +119,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
105
119
|
|
|
106
120
|
started = time.perf_counter()
|
|
107
121
|
abs_path = os.path.abspath(path)
|
|
122
|
+
|
|
123
|
+
# Warn about hash fallback early so users know to install [ml]
|
|
124
|
+
if embed:
|
|
125
|
+
from codespine.search.vector import _load_model
|
|
126
|
+
if _load_model() is None:
|
|
127
|
+
click.secho(
|
|
128
|
+
"⚠ sentence-transformers not found — using hash-based embeddings.\n"
|
|
129
|
+
" For better semantic search: pip install codespine[ml]\n",
|
|
130
|
+
fg="yellow",
|
|
131
|
+
)
|
|
132
|
+
|
|
108
133
|
store = GraphStore(read_only=False)
|
|
109
134
|
indexer = JavaIndexer(store)
|
|
110
135
|
|
|
@@ -141,7 +166,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
141
166
|
|
|
142
167
|
# Shared progress state (reset per module)
|
|
143
168
|
parse_state = {"shown": False, "indexed": 0, "total": 0, "last_ts": 0.0, "printed_zero": False}
|
|
144
|
-
call_state = {"shown": False, "count": 0, "last_ts": 0.0}
|
|
169
|
+
call_state = {"shown": False, "count": 0, "last_ts": 0.0, "started_at": 0.0}
|
|
145
170
|
|
|
146
171
|
def _reset_state() -> None:
|
|
147
172
|
for k in list(parse_state):
|
|
@@ -171,22 +196,28 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
171
196
|
if total == 0:
|
|
172
197
|
return
|
|
173
198
|
if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
|
|
174
|
-
click.echo(f"\rParsing code...
|
|
199
|
+
click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
|
|
175
200
|
parse_state["shown"] = True
|
|
176
201
|
parse_state["last_ts"] = now
|
|
177
202
|
return
|
|
178
203
|
if event == "resolve_calls_start" and parse_state["shown"]:
|
|
179
204
|
click.echo()
|
|
180
205
|
parse_state["shown"] = False
|
|
181
|
-
|
|
206
|
+
call_state["started_at"] = now
|
|
207
|
+
_phase("Tracing calls...", "starting...")
|
|
182
208
|
return
|
|
183
209
|
if event == "resolve_calls_start":
|
|
184
|
-
|
|
210
|
+
call_state["started_at"] = now
|
|
211
|
+
_phase("Tracing calls...", "starting...")
|
|
185
212
|
return
|
|
186
213
|
if event == "resolve_calls_progress":
|
|
187
214
|
call_state["count"] = int(payload.get("calls_resolved", 0))
|
|
188
215
|
if (now - call_state["last_ts"]) >= 0.25:
|
|
189
|
-
|
|
216
|
+
elapsed_s = now - call_state["started_at"]
|
|
217
|
+
click.echo(
|
|
218
|
+
f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
|
|
219
|
+
nl=False,
|
|
220
|
+
)
|
|
190
221
|
call_state["shown"] = True
|
|
191
222
|
call_state["last_ts"] = now
|
|
192
223
|
return
|
|
@@ -194,7 +225,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
194
225
|
if call_state["shown"]:
|
|
195
226
|
click.echo()
|
|
196
227
|
call_state["shown"] = False
|
|
197
|
-
|
|
228
|
+
elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
|
|
229
|
+
_phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
|
|
198
230
|
return
|
|
199
231
|
if event == "resolve_types_start":
|
|
200
232
|
_phase("Analyzing types...", "running")
|
|
@@ -226,11 +258,11 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
226
258
|
# ── Helper for in-place progress updates ────────────────────────────
|
|
227
259
|
def _live_phase(label: str, status: str) -> None:
|
|
228
260
|
"""Overwrite the current line with a status update."""
|
|
229
|
-
click.echo(f"\r{label:<
|
|
261
|
+
click.echo(f"\r{_spinner_char()} {label:<28} {status:<48}", nl=False)
|
|
230
262
|
|
|
231
263
|
def _finish_phase(label: str, result: str) -> None:
|
|
232
264
|
"""Finalise an in-place phase line and move to the next line."""
|
|
233
|
-
click.echo(f"\r{label:<
|
|
265
|
+
click.echo(f"\r✓ {label:<28} {result:<48}")
|
|
234
266
|
|
|
235
267
|
# ── Cross-module call linking ──────────────────────────────────────
|
|
236
268
|
if is_multi and len(modules_with_ids) > 1:
|
|
@@ -289,9 +321,27 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
289
321
|
)
|
|
290
322
|
_finish_phase(coup_label, f"{len(coupling_pairs)} coupled file pairs")
|
|
291
323
|
else:
|
|
324
|
+
# Run lightweight versions of flow tracing and dead code from the call
|
|
325
|
+
# graph already built — no community detection or coupling (those are
|
|
326
|
+
# genuinely expensive). This gives partial results without --deep.
|
|
292
327
|
_phase("Detecting communities...", "skipped (large repo; rerun with --deep)")
|
|
293
|
-
|
|
294
|
-
|
|
328
|
+
|
|
329
|
+
flow_label = "Detecting execution flows..."
|
|
330
|
+
_live_phase(flow_label, "running (lightweight)")
|
|
331
|
+
try:
|
|
332
|
+
flows = trace_execution_flows(store, max_depth=3)
|
|
333
|
+
except Exception:
|
|
334
|
+
flows = []
|
|
335
|
+
_finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
|
|
336
|
+
|
|
337
|
+
dead_label = "Finding dead code..."
|
|
338
|
+
_live_phase(dead_label, "running (lightweight)")
|
|
339
|
+
try:
|
|
340
|
+
dead = detect_dead_code(store, limit=100)
|
|
341
|
+
except Exception:
|
|
342
|
+
dead = []
|
|
343
|
+
_finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
|
|
344
|
+
|
|
295
345
|
_phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
|
|
296
346
|
|
|
297
347
|
vector_count = store.query_records(
|
|
@@ -311,7 +361,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
311
361
|
edges = int(edge_count[0]["count"]) if edge_count else 0
|
|
312
362
|
elapsed = time.perf_counter() - started
|
|
313
363
|
|
|
314
|
-
|
|
364
|
+
if not embed:
|
|
365
|
+
embed_note = " (no embeddings; rerun with --embed for semantic search)"
|
|
366
|
+
elif _load_model() is None:
|
|
367
|
+
embed_note = " (hash embeddings; pip install codespine[ml] for better search)"
|
|
368
|
+
else:
|
|
369
|
+
embed_note = ""
|
|
315
370
|
module_info = f"{len(modules_with_ids)} modules/projects, " if is_multi else ""
|
|
316
371
|
click.echo()
|
|
317
372
|
click.secho(
|
|
@@ -319,6 +374,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
319
374
|
fg="green",
|
|
320
375
|
)
|
|
321
376
|
|
|
377
|
+
# Detect unresolved imports → hint about unindexed sibling projects
|
|
378
|
+
try:
|
|
379
|
+
unresolved = JavaIndexer.detect_unresolved_imports(store)
|
|
380
|
+
if unresolved:
|
|
381
|
+
click.echo()
|
|
382
|
+
click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
|
|
383
|
+
for pkg, samples in sorted(unresolved.items())[:8]:
|
|
384
|
+
click.echo(f" {pkg} (e.g. {samples[0]})")
|
|
385
|
+
except Exception:
|
|
386
|
+
pass # best-effort
|
|
387
|
+
|
|
322
388
|
# Publish a read replica so MCP and read-only CLI commands (search, stats…)
|
|
323
389
|
# run against an isolated snapshot rather than competing with the write
|
|
324
390
|
# process's buffer pool. The MCP daemon detects the sentinel file and
|
|
@@ -559,7 +625,14 @@ def list_projects(as_json: bool) -> None:
|
|
|
559
625
|
@main.command()
|
|
560
626
|
@click.option("--json", "as_json", is_flag=True)
|
|
561
627
|
def status(as_json: bool) -> None:
|
|
562
|
-
"""Show service and database status.
|
|
628
|
+
"""Show service and database status.
|
|
629
|
+
|
|
630
|
+
Quick reference for MCP server management:
|
|
631
|
+
codespine start – launch background MCP server
|
|
632
|
+
codespine stop – stop background MCP server
|
|
633
|
+
codespine status – this command
|
|
634
|
+
codespine mcp – run MCP in foreground (stdio, for IDE integration)
|
|
635
|
+
"""
|
|
563
636
|
running = _is_running()
|
|
564
637
|
pid = None
|
|
565
638
|
if os.path.exists(SETTINGS.pid_file):
|
|
@@ -570,17 +643,35 @@ def status(as_json: bool) -> None:
|
|
|
570
643
|
pid = None
|
|
571
644
|
store = GraphStore(read_only=True)
|
|
572
645
|
overlay = get_overlay_status(store)
|
|
646
|
+
|
|
647
|
+
# Check for stale PID file
|
|
648
|
+
stale_pid = pid is not None and not running
|
|
649
|
+
has_snapshot = os.path.exists(SETTINGS.db_snapshot_path)
|
|
650
|
+
|
|
573
651
|
payload = {
|
|
574
652
|
"running": running,
|
|
575
653
|
"pid": pid,
|
|
654
|
+
"stale_pid": stale_pid,
|
|
576
655
|
"pid_file": SETTINGS.pid_file,
|
|
577
656
|
"db_path": SETTINGS.db_path,
|
|
578
657
|
"db_size_bytes": _db_size_bytes(SETTINGS.db_path),
|
|
658
|
+
"read_replica": SETTINGS.db_snapshot_path if has_snapshot else None,
|
|
659
|
+
"read_replica_size_bytes": _db_size_bytes(SETTINGS.db_snapshot_path) if has_snapshot else 0,
|
|
579
660
|
"log_file": SETTINGS.log_file,
|
|
580
661
|
"overlay_dir": SETTINGS.overlay_dir,
|
|
581
662
|
"overlay_projects": overlay,
|
|
582
663
|
}
|
|
583
|
-
|
|
664
|
+
if as_json:
|
|
665
|
+
_echo_json(payload, True)
|
|
666
|
+
else:
|
|
667
|
+
_echo_json(payload, True)
|
|
668
|
+
if stale_pid:
|
|
669
|
+
click.secho(f"\n⚠ Stale PID file found (PID {pid} not running). Run 'codespine stop' to clean up.", fg="yellow")
|
|
670
|
+
if not running:
|
|
671
|
+
click.echo("\nTo start: codespine start")
|
|
672
|
+
click.echo("For IDE: codespine mcp (stdio mode)")
|
|
673
|
+
else:
|
|
674
|
+
click.echo(f"\nMCP server running (PID {pid}). Stop with: codespine stop")
|
|
584
675
|
|
|
585
676
|
|
|
586
677
|
@main.command("overlay-status")
|
|
@@ -707,6 +798,33 @@ def clear_index_cmd(allow_running: bool) -> None:
|
|
|
707
798
|
click.secho(f"Cleared {len(projects)} project(s). Index is now empty.", fg="green")
|
|
708
799
|
|
|
709
800
|
|
|
801
|
+
@main.command("force-reset")
@click.option("--force", is_flag=True, help="Skip confirmation prompt.")
def force_reset_cmd(force: bool) -> None:
    """Emergency reset: delete ALL CodeSpine data files without touching the DB engine.

    Use this when the buffer pool is exhausted and normal reset/clear commands
    also fail with OOM. This bypasses Kuzu entirely by removing data files
    from disk, including the DB, read replica, overlay, meta cache, and
    embedding cache.

    After running this, restart the MCP server and re-index your projects.
    """
    # NOTE: the docstring above doubles as the command's --help text, so it is
    # kept verbatim.
    if not force:
        # Destructive operation: ask first unless --force was given.
        confirmed = click.confirm(
            "This will DELETE all CodeSpine data (DB, overlay, caches). Continue?"
        )
        if not confirmed:
            click.echo("Aborted.")
            return

    deleted = GraphStore.force_delete_all_data()
    if not deleted:
        click.secho("Nothing to remove — already clean.", fg="yellow")
        return

    # Report every path that was deleted, then tell the user what to do next.
    for path in deleted:
        click.echo(f" removed: {path}")
    click.secho(f"\nForce-reset complete. {len(deleted)} path(s) removed.", fg="green")
    click.echo("Next: restart MCP ('codespine stop && codespine start') and re-index.")
|
|
826
|
+
|
|
827
|
+
|
|
710
828
|
@main.command()
|
|
711
829
|
def setup() -> None:
|
|
712
830
|
"""Print local setup checks and next steps."""
|
|
@@ -723,12 +841,21 @@ def setup() -> None:
|
|
|
723
841
|
checks[mod] = True
|
|
724
842
|
except Exception:
|
|
725
843
|
checks[mod] = False
|
|
726
|
-
click.echo("
|
|
844
|
+
click.echo("Core dependencies:")
|
|
727
845
|
for mod, ok in checks.items():
|
|
728
846
|
click.echo(f" - {mod}: {'OK' if ok else 'MISSING'}")
|
|
729
|
-
|
|
730
|
-
|
|
847
|
+
# Check optional ML dependencies
|
|
848
|
+
try:
|
|
849
|
+
from sentence_transformers import SentenceTransformer
|
|
850
|
+
click.echo(" - sentence-transformers: OK (semantic embeddings active)")
|
|
851
|
+
except ImportError:
|
|
852
|
+
click.secho(" - sentence-transformers: NOT INSTALLED (hash fallback; install for better search)", fg="yellow")
|
|
853
|
+
click.echo("\nRecommended setup:")
|
|
854
|
+
click.echo(" pip install -e '.[full]' # core + ML + community detection")
|
|
855
|
+
click.echo(" pip install -e '.[ml]' # just ML embeddings")
|
|
856
|
+
click.echo("\nQuick start:")
|
|
731
857
|
click.echo(" codespine analyse /path/to/java-project --full")
|
|
858
|
+
click.echo(" codespine start # launch MCP server")
|
|
732
859
|
click.echo(" codespine search payment --json")
|
|
733
860
|
|
|
734
861
|
|
|
@@ -16,8 +16,8 @@ class Settings:
|
|
|
16
16
|
rrf_k: int = 60
|
|
17
17
|
semantic_candidate_pool: int = 2000
|
|
18
18
|
write_batch_size: int = 500
|
|
19
|
-
index_file_batch_size: int =
|
|
20
|
-
edge_write_batch_size: int =
|
|
19
|
+
index_file_batch_size: int = 20
|
|
20
|
+
edge_write_batch_size: int = 500
|
|
21
21
|
default_coupling_months: int = 6
|
|
22
22
|
default_min_coupling_strength: float = 0.3
|
|
23
23
|
default_min_cochanges: int = 3
|
|
@@ -147,10 +147,13 @@ class GraphStore:
|
|
|
147
147
|
|
|
148
148
|
def clear_project(self, project_id: str) -> None:
|
|
149
149
|
file_recs = self.query_records("MATCH (f:File) WHERE f.project_id = $pid RETURN f.id as id", {"pid": project_id})
|
|
150
|
+
# Small batches (10 files per tx) prevent buffer pool OOM on large projects.
|
|
150
151
|
for idx, rec in enumerate(file_recs, start=1):
|
|
151
|
-
self.
|
|
152
|
-
|
|
152
|
+
with self.transaction():
|
|
153
|
+
self.clear_file(rec["id"])
|
|
154
|
+
if idx % 10 == 0:
|
|
153
155
|
self._recycle_conn()
|
|
156
|
+
self._recycle_conn()
|
|
154
157
|
self.execute("MATCH (p:Project) WHERE p.id = $pid DETACH DELETE p", {"pid": project_id})
|
|
155
158
|
self._recycle_conn()
|
|
156
159
|
|
|
@@ -502,6 +505,48 @@ class GraphStore:
|
|
|
502
505
|
self.clear_flows()
|
|
503
506
|
self.clear_coupling()
|
|
504
507
|
|
|
508
|
+
@staticmethod
def force_delete_all_data() -> list[str]:
    """Delete all CodeSpine data files without touching the Kuzu engine.

    This is the nuclear option for OOM recovery: when the buffer pool is
    exhausted, normal DB writes (including reset_project / clear_project)
    also fail. This bypasses Kuzu entirely by removing the data files
    from disk, allowing a fresh start.

    Deletion is best-effort: a path that cannot be removed is skipped
    rather than raising, and is left out of the result.

    Returns:
        The paths that no longer exist after the call, in the order they
        were processed.
    """
    candidates = [
        SETTINGS.db_path,
        SETTINGS.db_snapshot_path,
        SETTINGS.db_snapshot_path + ".updated",
        SETTINGS.db_snapshot_path + ".tmp",
        SETTINGS.embedding_cache_path,
        SETTINGS.overlay_dir,
        SETTINGS.index_meta_dir,
        # Stale WAL / lock files that sit next to the DB.
        SETTINGS.db_path + ".wal",
        SETTINGS.db_path + ".lock",
    ]
    removed: list[str] = []
    for path in candidates:
        # lexists (not exists) so that dangling symlinks are cleaned up too.
        if not os.path.lexists(path):
            continue
        try:
            if os.path.isdir(path) and not os.path.islink(path):
                # ignore_errors keeps going past individual failures so as
                # much data as possible is removed.
                shutil.rmtree(path, ignore_errors=True)
            else:
                # Regular files and symlinks (rmtree refuses symlinks).
                os.remove(path)
        except OSError:
            continue
        # rmtree(ignore_errors=True) never raises, so confirm the path is
        # really gone before reporting it as removed.
        if not os.path.lexists(path):
            removed.append(path)
    return removed
|
|
549
|
+
|
|
505
550
|
def rebuild_empty_db(self) -> None:
|
|
506
551
|
self._recycle_conn()
|
|
507
552
|
path = SETTINGS.db_path
|
|
@@ -253,6 +253,20 @@ class JavaIndexer:
|
|
|
253
253
|
for fid in delete_chunk:
|
|
254
254
|
self.store.clear_file(fid)
|
|
255
255
|
self.store._recycle_conn()
|
|
256
|
+
|
|
257
|
+
# Clean up stale project entries that point to the same path under a
|
|
258
|
+
# different ID (e.g. re-indexing "vision-server" directly after it was
|
|
259
|
+
# previously indexed as "vision::vision-server" from a workspace root).
|
|
260
|
+
try:
|
|
261
|
+
stale = self.store.query_records(
|
|
262
|
+
"MATCH (p:Project) WHERE p.path = $path AND p.id <> $pid RETURN p.id as id",
|
|
263
|
+
{"path": root_path, "pid": project_id},
|
|
264
|
+
)
|
|
265
|
+
for old in stale:
|
|
266
|
+
self.store.clear_project(old["id"])
|
|
267
|
+
except Exception:
|
|
268
|
+
pass # best-effort cleanup
|
|
269
|
+
|
|
256
270
|
self.store.upsert_project(project_id, root_path)
|
|
257
271
|
|
|
258
272
|
for parse_chunk in self._chunked(parse_results, file_batch_size):
|
|
@@ -279,7 +293,7 @@ class JavaIndexer:
|
|
|
279
293
|
"hash": file_digest,
|
|
280
294
|
}
|
|
281
295
|
)
|
|
282
|
-
self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source))
|
|
296
|
+
self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source), imports=parsed.imports)
|
|
283
297
|
|
|
284
298
|
for cls in parsed.classes:
|
|
285
299
|
c_id = class_id(cls.fqcn, scope)
|
|
@@ -372,15 +386,31 @@ class JavaIndexer:
|
|
|
372
386
|
class_methods[c_id][method.signature] = m_id
|
|
373
387
|
files_indexed += 1
|
|
374
388
|
|
|
389
|
+
# Split writes into smaller transactions and recycle between each
|
|
390
|
+
# to prevent Kuzu WAL from exhausting the buffer pool on large
|
|
391
|
+
# incremental re-indexes (GH feedback: 1,604-file OOM).
|
|
392
|
+
if not full:
|
|
393
|
+
for clear_sub in self._chunked(file_rows, 10):
|
|
394
|
+
with self.store.transaction():
|
|
395
|
+
for row in clear_sub:
|
|
396
|
+
self.store.clear_file(row["id"])
|
|
397
|
+
self.store._recycle_conn()
|
|
375
398
|
with self.store.transaction():
|
|
376
|
-
for row in file_rows:
|
|
377
|
-
if not full:
|
|
378
|
-
self.store.clear_file(row["id"])
|
|
379
399
|
self.store.upsert_files_batch(file_rows)
|
|
400
|
+
self.store._recycle_conn()
|
|
401
|
+
with self.store.transaction():
|
|
380
402
|
self.store.upsert_classes_batch(class_rows)
|
|
381
|
-
self.store.upsert_methods_batch(method_rows)
|
|
382
|
-
self.store.upsert_symbols_batch(symbol_rows)
|
|
383
403
|
self.store._recycle_conn()
|
|
404
|
+
_METHOD_SUB_BATCH = 200
|
|
405
|
+
for method_sub in self._chunked(method_rows, _METHOD_SUB_BATCH):
|
|
406
|
+
with self.store.transaction():
|
|
407
|
+
self.store.upsert_methods_batch(method_sub)
|
|
408
|
+
self.store._recycle_conn()
|
|
409
|
+
_SYMBOL_SUB_BATCH = 200
|
|
410
|
+
for symbol_sub in self._chunked(symbol_rows, _SYMBOL_SUB_BATCH):
|
|
411
|
+
with self.store.transaction():
|
|
412
|
+
self.store.upsert_symbols_batch(symbol_sub)
|
|
413
|
+
self.store._recycle_conn()
|
|
384
414
|
|
|
385
415
|
self._emit(progress, "resolve_calls_start")
|
|
386
416
|
call_rows: list[dict] = []
|
|
@@ -697,7 +727,10 @@ class JavaIndexer:
|
|
|
697
727
|
return
|
|
698
728
|
|
|
699
729
|
@staticmethod
|
|
700
|
-
def _update_meta_cache_entry(
|
|
730
|
+
def _update_meta_cache_entry(
|
|
731
|
+
meta_cache: dict[str, dict], fid: str, file_path: str, digest: str, size_hint: int,
|
|
732
|
+
imports: list[str] | None = None,
|
|
733
|
+
) -> None:
|
|
701
734
|
try:
|
|
702
735
|
st = os.stat(file_path)
|
|
703
736
|
mtime_ns = int(getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000)))
|
|
@@ -705,7 +738,10 @@ class JavaIndexer:
|
|
|
705
738
|
except OSError:
|
|
706
739
|
mtime_ns = -1
|
|
707
740
|
size = size_hint
|
|
708
|
-
|
|
741
|
+
entry: dict = {"mtime_ns": mtime_ns, "size": size, "hash": digest}
|
|
742
|
+
if imports is not None:
|
|
743
|
+
entry["imports"] = imports
|
|
744
|
+
meta_cache[fid] = entry
|
|
709
745
|
|
|
710
746
|
@staticmethod
|
|
711
747
|
def _prune_meta_cache(meta_cache: dict[str, dict], current_file_ids: set[str]) -> None:
|
|
@@ -728,3 +764,76 @@ class JavaIndexer:
|
|
|
728
764
|
return normalized.split("/src/", 1)[0]
|
|
729
765
|
scope = os.path.dirname(normalized).strip()
|
|
730
766
|
return scope or "."
|
|
767
|
+
|
|
768
|
+
@staticmethod
|
|
769
|
+
def detect_unresolved_imports(store) -> dict[str, list[str]]:
|
|
770
|
+
"""Detect imports that reference packages not covered by any indexed project.
|
|
771
|
+
|
|
772
|
+
Returns a dict mapping unresolved base packages (e.g. "com.foo.bar")
|
|
773
|
+
to a list of sample import FQCNs. Useful for suggesting which sibling
|
|
774
|
+
projects to index.
|
|
775
|
+
|
|
776
|
+
Only reports project-internal packages (not java.*, javax.*, org.apache.*
|
|
777
|
+
etc.).
|
|
778
|
+
"""
|
|
779
|
+
# 1. Collect all indexed class FQCNs
|
|
780
|
+
try:
|
|
781
|
+
recs = store.query_records("MATCH (c:Class) RETURN c.fqcn as fqcn")
|
|
782
|
+
except Exception:
|
|
783
|
+
return {}
|
|
784
|
+
indexed_fqcns = {r["fqcn"] for r in recs if r.get("fqcn")}
|
|
785
|
+
indexed_packages = set()
|
|
786
|
+
for fqcn in indexed_fqcns:
|
|
787
|
+
parts = fqcn.rsplit(".", 1)
|
|
788
|
+
if len(parts) == 2:
|
|
789
|
+
indexed_packages.add(parts[0])
|
|
790
|
+
|
|
791
|
+
# 2. Collect all imports from overlay + any stored file data
|
|
792
|
+
# Parse imports from the parsed file metadata if available
|
|
793
|
+
meta_dir = SETTINGS.index_meta_dir
|
|
794
|
+
all_imports: set[str] = set()
|
|
795
|
+
if os.path.isdir(meta_dir):
|
|
796
|
+
for fname in os.listdir(meta_dir):
|
|
797
|
+
if not fname.endswith(".json"):
|
|
798
|
+
continue
|
|
799
|
+
try:
|
|
800
|
+
with open(os.path.join(meta_dir, fname), "r") as f:
|
|
801
|
+
data = json.load(f)
|
|
802
|
+
for fid, fmeta in data.items():
|
|
803
|
+
for imp in fmeta.get("imports", []):
|
|
804
|
+
all_imports.add(imp)
|
|
805
|
+
except Exception:
|
|
806
|
+
pass
|
|
807
|
+
|
|
808
|
+
# 3. Also scan the DB for CALLS edges that reference unknown targets
|
|
809
|
+
# (lightweight — just check which classes were resolved vs not)
|
|
810
|
+
|
|
811
|
+
# 4. Filter: skip standard library / well-known third-party packages
|
|
812
|
+
_SKIP_PREFIXES = (
|
|
813
|
+
"java.", "javax.", "jakarta.",
|
|
814
|
+
"org.apache.", "org.springframework.", "org.hibernate.",
|
|
815
|
+
"org.slf4j.", "org.junit.", "org.mockito.",
|
|
816
|
+
"com.google.", "com.fasterxml.", "com.sun.",
|
|
817
|
+
"io.micrometer.", "io.netty.", "io.lettuce.",
|
|
818
|
+
"lombok.", "reactor.", "rx.",
|
|
819
|
+
)
|
|
820
|
+
|
|
821
|
+
unresolved: dict[str, list[str]] = {}
|
|
822
|
+
for imp in all_imports:
|
|
823
|
+
if any(imp.startswith(prefix) for prefix in _SKIP_PREFIXES):
|
|
824
|
+
continue
|
|
825
|
+
# Check if this import's class exists in the index
|
|
826
|
+
simple_name = imp.rsplit(".", 1)[-1]
|
|
827
|
+
pkg = imp.rsplit(".", 1)[0] if "." in imp else ""
|
|
828
|
+
if imp in indexed_fqcns:
|
|
829
|
+
continue
|
|
830
|
+
if pkg in indexed_packages:
|
|
831
|
+
continue # same package, just not this specific class
|
|
832
|
+
# Group by top 3 package segments
|
|
833
|
+
parts = imp.split(".")
|
|
834
|
+
base_pkg = ".".join(parts[:min(3, len(parts))])
|
|
835
|
+
if base_pkg not in unresolved:
|
|
836
|
+
unresolved[base_pkg] = []
|
|
837
|
+
if len(unresolved[base_pkg]) < 5:
|
|
838
|
+
unresolved[base_pkg].append(imp)
|
|
839
|
+
return unresolved
|