codespine 0.5.9__tar.gz → 0.6.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {codespine-0.5.9 → codespine-0.6.0}/PKG-INFO +1 -1
  2. {codespine-0.5.9 → codespine-0.6.0}/codespine/__init__.py +1 -1
  3. {codespine-0.5.9 → codespine-0.6.0}/codespine/cli.py +147 -20
  4. {codespine-0.5.9 → codespine-0.6.0}/codespine/config.py +2 -2
  5. {codespine-0.5.9 → codespine-0.6.0}/codespine/db/store.py +47 -2
  6. {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/engine.py +117 -8
  7. {codespine-0.5.9 → codespine-0.6.0}/codespine/mcp/server.py +146 -51
  8. {codespine-0.5.9 → codespine-0.6.0}/codespine/watch/watcher.py +106 -59
  9. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/PKG-INFO +1 -1
  10. {codespine-0.5.9 → codespine-0.6.0}/pyproject.toml +1 -1
  11. {codespine-0.5.9 → codespine-0.6.0}/LICENSE +0 -0
  12. {codespine-0.5.9 → codespine-0.6.0}/README.md +0 -0
  13. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/__init__.py +0 -0
  14. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/community.py +0 -0
  15. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/context.py +0 -0
  16. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/coupling.py +0 -0
  17. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/crossmodule.py +0 -0
  18. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/deadcode.py +0 -0
  19. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/flow.py +0 -0
  20. {codespine-0.5.9 → codespine-0.6.0}/codespine/analysis/impact.py +0 -0
  21. {codespine-0.5.9 → codespine-0.6.0}/codespine/db/__init__.py +0 -0
  22. {codespine-0.5.9 → codespine-0.6.0}/codespine/db/schema.py +0 -0
  23. {codespine-0.5.9 → codespine-0.6.0}/codespine/diff/__init__.py +0 -0
  24. {codespine-0.5.9 → codespine-0.6.0}/codespine/diff/branch_diff.py +0 -0
  25. {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/__init__.py +0 -0
  26. {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/call_resolver.py +0 -0
  27. {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/java_parser.py +0 -0
  28. {codespine-0.5.9 → codespine-0.6.0}/codespine/indexer/symbol_builder.py +0 -0
  29. {codespine-0.5.9 → codespine-0.6.0}/codespine/mcp/__init__.py +0 -0
  30. {codespine-0.5.9 → codespine-0.6.0}/codespine/noise/__init__.py +0 -0
  31. {codespine-0.5.9 → codespine-0.6.0}/codespine/noise/blocklist.py +0 -0
  32. {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/__init__.py +0 -0
  33. {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/git_state.py +0 -0
  34. {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/merge.py +0 -0
  35. {codespine-0.5.9 → codespine-0.6.0}/codespine/overlay/store.py +0 -0
  36. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/__init__.py +0 -0
  37. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/bm25.py +0 -0
  38. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/fuzzy.py +0 -0
  39. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/hybrid.py +0 -0
  40. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/rrf.py +0 -0
  41. {codespine-0.5.9 → codespine-0.6.0}/codespine/search/vector.py +0 -0
  42. {codespine-0.5.9 → codespine-0.6.0}/codespine/watch/__init__.py +0 -0
  43. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/SOURCES.txt +0 -0
  44. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/dependency_links.txt +0 -0
  45. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/entry_points.txt +0 -0
  46. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/requires.txt +0 -0
  47. {codespine-0.5.9 → codespine-0.6.0}/codespine.egg-info/top_level.txt +0 -0
  48. {codespine-0.5.9 → codespine-0.6.0}/gindex.py +0 -0
  49. {codespine-0.5.9 → codespine-0.6.0}/setup.cfg +0 -0
  50. {codespine-0.5.9 → codespine-0.6.0}/tests/test_branch_diff_normalize.py +0 -0
  51. {codespine-0.5.9 → codespine-0.6.0}/tests/test_call_resolver.py +0 -0
  52. {codespine-0.5.9 → codespine-0.6.0}/tests/test_community_detection.py +0 -0
  53. {codespine-0.5.9 → codespine-0.6.0}/tests/test_deadcode.py +0 -0
  54. {codespine-0.5.9 → codespine-0.6.0}/tests/test_index_and_hybrid.py +0 -0
  55. {codespine-0.5.9 → codespine-0.6.0}/tests/test_java_parser.py +0 -0
  56. {codespine-0.5.9 → codespine-0.6.0}/tests/test_multimodule_index.py +0 -0
  57. {codespine-0.5.9 → codespine-0.6.0}/tests/test_overlay.py +0 -0
  58. {codespine-0.5.9 → codespine-0.6.0}/tests/test_search_ranking.py +0 -0
  59. {codespine-0.5.9 → codespine-0.6.0}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.5.9
3
+ Version: 0.6.0
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.5.9"
4
+ __version__ = "0.6.0"
@@ -77,6 +77,19 @@ def _dead_result_count(dead_result: list[dict] | None) -> int:
77
77
  return sum(1 for item in dead_result if isinstance(item, dict) and "_stats" not in item)
78
78
 
79
79
 
80
+ def _bar(done: int, total: int, width: int = 20) -> str:
81
+ """Return an ASCII progress bar like [████████░░░░] 40%."""
82
+ if total <= 0:
83
+ return f"[{'░' * width}] ---%"
84
+ frac = min(done / total, 1.0)
85
+ filled = int(width * frac)
86
+ return f"[{'█' * filled}{'░' * (width - filled)}] {int(frac * 100):3d}%"
87
+
88
+
89
+ def _spinner_char() -> str:
90
+ return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
91
+
92
+
80
93
  @click.group()
81
94
  def main() -> None:
82
95
  """CodeSpine CLI."""
@@ -88,16 +101,17 @@ def main() -> None:
88
101
  @click.option("--deep/--no-deep", default=False, show_default=True, help="Run expensive global analyses.")
89
102
  @click.option(
90
103
  "--embed/--no-embed",
91
- default=False,
104
+ default=True,
92
105
  show_default=True,
93
- help="Generate vector embeddings (slow if sentence-transformers installed; enables semantic search).",
106
+ help="Generate vector embeddings. Uses sentence-transformers if installed (pip install codespine[ml]), otherwise falls back to hash-based vectors.",
94
107
  )
95
108
  @click.option("--allow-running", is_flag=True, hidden=True, help="Skip MCP running check (used by MCP analyse_project tool).")
96
109
  def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool) -> None:
97
110
  """Index a local Java project (auto-detects workspace / Maven / Gradle layout).
98
111
 
99
- By default embeddings are skipped for speed. Pass --embed to generate
100
- vector embeddings for semantic search (requires sentence-transformers).
112
+ Embeddings are generated by default. If sentence-transformers is installed
113
+ (pip install codespine[ml]), high-quality semantic vectors are used; otherwise
114
+ a fast hash-based fallback provides basic vector search.
101
115
  """
102
116
  if not allow_running and _is_running():
103
117
  click.secho("Stop MCP first ('codespine stop') to index.", fg="yellow")
@@ -105,6 +119,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
105
119
 
106
120
  started = time.perf_counter()
107
121
  abs_path = os.path.abspath(path)
122
+
123
+ # Warn about hash fallback early so users know to install [ml]
124
+ if embed:
125
+ from codespine.search.vector import _load_model
126
+ if _load_model() is None:
127
+ click.secho(
128
+ "⚠ sentence-transformers not found — using hash-based embeddings.\n"
129
+ " For better semantic search: pip install codespine[ml]\n",
130
+ fg="yellow",
131
+ )
132
+
108
133
  store = GraphStore(read_only=False)
109
134
  indexer = JavaIndexer(store)
110
135
 
@@ -141,7 +166,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
141
166
 
142
167
  # Shared progress state (reset per module)
143
168
  parse_state = {"shown": False, "indexed": 0, "total": 0, "last_ts": 0.0, "printed_zero": False}
144
- call_state = {"shown": False, "count": 0, "last_ts": 0.0}
169
+ call_state = {"shown": False, "count": 0, "last_ts": 0.0, "started_at": 0.0}
145
170
 
146
171
  def _reset_state() -> None:
147
172
  for k in list(parse_state):
@@ -171,22 +196,28 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
171
196
  if total == 0:
172
197
  return
173
198
  if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
174
- click.echo(f"\rParsing code... {indexed}/{total}", nl=False)
199
+ click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
175
200
  parse_state["shown"] = True
176
201
  parse_state["last_ts"] = now
177
202
  return
178
203
  if event == "resolve_calls_start" and parse_state["shown"]:
179
204
  click.echo()
180
205
  parse_state["shown"] = False
181
- _phase("Tracing calls...", "running")
206
+ call_state["started_at"] = now
207
+ _phase("Tracing calls...", "starting...")
182
208
  return
183
209
  if event == "resolve_calls_start":
184
- _phase("Tracing calls...", "running")
210
+ call_state["started_at"] = now
211
+ _phase("Tracing calls...", "starting...")
185
212
  return
186
213
  if event == "resolve_calls_progress":
187
214
  call_state["count"] = int(payload.get("calls_resolved", 0))
188
215
  if (now - call_state["last_ts"]) >= 0.25:
189
- click.echo(f"\rTracing calls... {call_state['count']} resolved", nl=False)
216
+ elapsed_s = now - call_state["started_at"]
217
+ click.echo(
218
+ f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
219
+ nl=False,
220
+ )
190
221
  call_state["shown"] = True
191
222
  call_state["last_ts"] = now
192
223
  return
@@ -194,7 +225,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
194
225
  if call_state["shown"]:
195
226
  click.echo()
196
227
  call_state["shown"] = False
197
- _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved")
228
+ elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
229
+ _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
198
230
  return
199
231
  if event == "resolve_types_start":
200
232
  _phase("Analyzing types...", "running")
@@ -226,11 +258,11 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
226
258
  # ── Helper for in-place progress updates ────────────────────────────
227
259
  def _live_phase(label: str, status: str) -> None:
228
260
  """Overwrite the current line with a status update."""
229
- click.echo(f"\r{label:<30} {status:<50}", nl=False)
261
+ click.echo(f"\r{_spinner_char()} {label:<28} {status:<48}", nl=False)
230
262
 
231
263
  def _finish_phase(label: str, result: str) -> None:
232
264
  """Finalise an in-place phase line and move to the next line."""
233
- click.echo(f"\r{label:<30} {result:<50}")
265
+ click.echo(f"\r{label:<28} {result:<48}")
234
266
 
235
267
  # ── Cross-module call linking ──────────────────────────────────────
236
268
  if is_multi and len(modules_with_ids) > 1:
@@ -289,9 +321,27 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
289
321
  )
290
322
  _finish_phase(coup_label, f"{len(coupling_pairs)} coupled file pairs")
291
323
  else:
324
+ # Run lightweight versions of flow tracing and dead code from the call
325
+ # graph already built — no community detection or coupling (those are
326
+ # genuinely expensive). This gives partial results without --deep.
292
327
  _phase("Detecting communities...", "skipped (large repo; rerun with --deep)")
293
- _phase("Detecting execution flows...", "skipped (large repo; rerun with --deep)")
294
- _phase("Finding dead code...", "skipped (large repo; rerun with --deep)")
328
+
329
+ flow_label = "Detecting execution flows..."
330
+ _live_phase(flow_label, "running (lightweight)")
331
+ try:
332
+ flows = trace_execution_flows(store, max_depth=3)
333
+ except Exception:
334
+ flows = []
335
+ _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
336
+
337
+ dead_label = "Finding dead code..."
338
+ _live_phase(dead_label, "running (lightweight)")
339
+ try:
340
+ dead = detect_dead_code(store, limit=100)
341
+ except Exception:
342
+ dead = []
343
+ _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
344
+
295
345
  _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
296
346
 
297
347
  vector_count = store.query_records(
@@ -311,7 +361,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
311
361
  edges = int(edge_count[0]["count"]) if edge_count else 0
312
362
  elapsed = time.perf_counter() - started
313
363
 
314
- embed_note = "" if embed else " (no embeddings; rerun with --embed for semantic search)"
364
+ if not embed:
365
+ embed_note = " (no embeddings; rerun with --embed for semantic search)"
366
+ elif _load_model() is None:
367
+ embed_note = " (hash embeddings; pip install codespine[ml] for better search)"
368
+ else:
369
+ embed_note = ""
315
370
  module_info = f"{len(modules_with_ids)} modules/projects, " if is_multi else ""
316
371
  click.echo()
317
372
  click.secho(
@@ -319,6 +374,17 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
319
374
  fg="green",
320
375
  )
321
376
 
377
+ # Detect unresolved imports → hint about unindexed sibling projects
378
+ try:
379
+ unresolved = JavaIndexer.detect_unresolved_imports(store)
380
+ if unresolved:
381
+ click.echo()
382
+ click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
383
+ for pkg, samples in sorted(unresolved.items())[:8]:
384
+ click.echo(f" {pkg} (e.g. {samples[0]})")
385
+ except Exception:
386
+ pass # best-effort
387
+
322
388
  # Publish a read replica so MCP and read-only CLI commands (search, stats…)
323
389
  # run against an isolated snapshot rather than competing with the write
324
390
  # process's buffer pool. The MCP daemon detects the sentinel file and
@@ -559,7 +625,14 @@ def list_projects(as_json: bool) -> None:
559
625
  @main.command()
560
626
  @click.option("--json", "as_json", is_flag=True)
561
627
  def status(as_json: bool) -> None:
562
- """Show service and database status."""
628
+ """Show service and database status.
629
+
630
+ Quick reference for MCP server management:
631
+ codespine start – launch background MCP server
632
+ codespine stop – stop background MCP server
633
+ codespine status – this command
634
+ codespine mcp – run MCP in foreground (stdio, for IDE integration)
635
+ """
563
636
  running = _is_running()
564
637
  pid = None
565
638
  if os.path.exists(SETTINGS.pid_file):
@@ -570,17 +643,35 @@ def status(as_json: bool) -> None:
570
643
  pid = None
571
644
  store = GraphStore(read_only=True)
572
645
  overlay = get_overlay_status(store)
646
+
647
+ # Check for stale PID file
648
+ stale_pid = pid is not None and not running
649
+ has_snapshot = os.path.exists(SETTINGS.db_snapshot_path)
650
+
573
651
  payload = {
574
652
  "running": running,
575
653
  "pid": pid,
654
+ "stale_pid": stale_pid,
576
655
  "pid_file": SETTINGS.pid_file,
577
656
  "db_path": SETTINGS.db_path,
578
657
  "db_size_bytes": _db_size_bytes(SETTINGS.db_path),
658
+ "read_replica": SETTINGS.db_snapshot_path if has_snapshot else None,
659
+ "read_replica_size_bytes": _db_size_bytes(SETTINGS.db_snapshot_path) if has_snapshot else 0,
579
660
  "log_file": SETTINGS.log_file,
580
661
  "overlay_dir": SETTINGS.overlay_dir,
581
662
  "overlay_projects": overlay,
582
663
  }
583
- _echo_json(payload, as_json)
664
+ if as_json:
665
+ _echo_json(payload, True)
666
+ else:
667
+ _echo_json(payload, True)
668
+ if stale_pid:
669
+ click.secho(f"\n⚠ Stale PID file found (PID {pid} not running). Run 'codespine stop' to clean up.", fg="yellow")
670
+ if not running:
671
+ click.echo("\nTo start: codespine start")
672
+ click.echo("For IDE: codespine mcp (stdio mode)")
673
+ else:
674
+ click.echo(f"\nMCP server running (PID {pid}). Stop with: codespine stop")
584
675
 
585
676
 
586
677
  @main.command("overlay-status")
@@ -707,6 +798,33 @@ def clear_index_cmd(allow_running: bool) -> None:
707
798
  click.secho(f"Cleared {len(projects)} project(s). Index is now empty.", fg="green")
708
799
 
709
800
 
801
+ @main.command("force-reset")
802
+ @click.option("--force", is_flag=True, help="Skip confirmation prompt.")
803
+ def force_reset_cmd(force: bool) -> None:
804
+ """Emergency reset: delete ALL CodeSpine data files without touching the DB engine.
805
+
806
+ Use this when the buffer pool is exhausted and normal reset/clear commands
807
+ also fail with OOM. This bypasses Kuzu entirely by removing data files
808
+ from disk, including the DB, read replica, overlay, meta cache, and
809
+ embedding cache.
810
+
811
+ After running this, restart the MCP server and re-index your projects.
812
+ """
813
+ if not force and not click.confirm(
814
+ "This will DELETE all CodeSpine data (DB, overlay, caches). Continue?"
815
+ ):
816
+ click.echo("Aborted.")
817
+ return
818
+ removed = GraphStore.force_delete_all_data()
819
+ if removed:
820
+ for p in removed:
821
+ click.echo(f" removed: {p}")
822
+ click.secho(f"\nForce-reset complete. {len(removed)} path(s) removed.", fg="green")
823
+ click.echo("Next: restart MCP ('codespine stop && codespine start') and re-index.")
824
+ else:
825
+ click.secho("Nothing to remove — already clean.", fg="yellow")
826
+
827
+
710
828
  @main.command()
711
829
  def setup() -> None:
712
830
  """Print local setup checks and next steps."""
@@ -723,12 +841,21 @@ def setup() -> None:
723
841
  checks[mod] = True
724
842
  except Exception:
725
843
  checks[mod] = False
726
- click.echo("Dependency check:")
844
+ click.echo("Core dependencies:")
727
845
  for mod, ok in checks.items():
728
846
  click.echo(f" - {mod}: {'OK' if ok else 'MISSING'}")
729
- click.echo("\\nRecommended:")
730
- click.echo(" pip install -e .")
847
+ # Check optional ML dependencies
848
+ try:
849
+ from sentence_transformers import SentenceTransformer
850
+ click.echo(" - sentence-transformers: OK (semantic embeddings active)")
851
+ except ImportError:
852
+ click.secho(" - sentence-transformers: NOT INSTALLED (hash fallback; install for better search)", fg="yellow")
853
+ click.echo("\nRecommended setup:")
854
+ click.echo(" pip install -e '.[full]' # core + ML + community detection")
855
+ click.echo(" pip install -e '.[ml]' # just ML embeddings")
856
+ click.echo("\nQuick start:")
731
857
  click.echo(" codespine analyse /path/to/java-project --full")
858
+ click.echo(" codespine start # launch MCP server")
732
859
  click.echo(" codespine search payment --json")
733
860
 
734
861
 
@@ -16,8 +16,8 @@ class Settings:
16
16
  rrf_k: int = 60
17
17
  semantic_candidate_pool: int = 2000
18
18
  write_batch_size: int = 500
19
- index_file_batch_size: int = 64
20
- edge_write_batch_size: int = 2000
19
+ index_file_batch_size: int = 20
20
+ edge_write_batch_size: int = 500
21
21
  default_coupling_months: int = 6
22
22
  default_min_coupling_strength: float = 0.3
23
23
  default_min_cochanges: int = 3
@@ -147,10 +147,13 @@ class GraphStore:
147
147
 
148
148
  def clear_project(self, project_id: str) -> None:
149
149
  file_recs = self.query_records("MATCH (f:File) WHERE f.project_id = $pid RETURN f.id as id", {"pid": project_id})
150
+ # Small batches (10 files per tx) prevent buffer pool OOM on large projects.
150
151
  for idx, rec in enumerate(file_recs, start=1):
151
- self.clear_file(rec["id"])
152
- if idx % 50 == 0:
152
+ with self.transaction():
153
+ self.clear_file(rec["id"])
154
+ if idx % 10 == 0:
153
155
  self._recycle_conn()
156
+ self._recycle_conn()
154
157
  self.execute("MATCH (p:Project) WHERE p.id = $pid DETACH DELETE p", {"pid": project_id})
155
158
  self._recycle_conn()
156
159
 
@@ -502,6 +505,48 @@ class GraphStore:
502
505
  self.clear_flows()
503
506
  self.clear_coupling()
504
507
 
508
+ @staticmethod
509
+ def force_delete_all_data() -> list[str]:
510
+ """Delete all CodeSpine data files without touching the Kuzu engine.
511
+
512
+ This is the nuclear option for OOM recovery: when the buffer pool is
513
+ exhausted, normal DB writes (including reset_project / clear_project)
514
+ also fail. This bypasses Kuzu entirely by removing the data files
515
+ from disk, allowing a fresh start.
516
+
517
+ Returns the list of paths that were removed.
518
+ """
519
+ removed: list[str] = []
520
+ for path in [
521
+ SETTINGS.db_path,
522
+ SETTINGS.db_snapshot_path,
523
+ SETTINGS.db_snapshot_path + ".updated",
524
+ SETTINGS.db_snapshot_path + ".tmp",
525
+ SETTINGS.embedding_cache_path,
526
+ SETTINGS.overlay_dir,
527
+ SETTINGS.index_meta_dir,
528
+ ]:
529
+ if not os.path.exists(path):
530
+ continue
531
+ try:
532
+ if os.path.isdir(path):
533
+ shutil.rmtree(path, ignore_errors=True)
534
+ else:
535
+ os.remove(path)
536
+ removed.append(path)
537
+ except OSError:
538
+ pass
539
+ # Also remove any stale WAL files next to the DB
540
+ for suffix in (".wal", ".lock"):
541
+ wal_path = SETTINGS.db_path + suffix
542
+ if os.path.exists(wal_path):
543
+ try:
544
+ os.remove(wal_path)
545
+ removed.append(wal_path)
546
+ except OSError:
547
+ pass
548
+ return removed
549
+
505
550
  def rebuild_empty_db(self) -> None:
506
551
  self._recycle_conn()
507
552
  path = SETTINGS.db_path
@@ -253,6 +253,20 @@ class JavaIndexer:
253
253
  for fid in delete_chunk:
254
254
  self.store.clear_file(fid)
255
255
  self.store._recycle_conn()
256
+
257
+ # Clean up stale project entries that point to the same path under a
258
+ # different ID (e.g. re-indexing "vision-server" directly after it was
259
+ # previously indexed as "vision::vision-server" from a workspace root).
260
+ try:
261
+ stale = self.store.query_records(
262
+ "MATCH (p:Project) WHERE p.path = $path AND p.id <> $pid RETURN p.id as id",
263
+ {"path": root_path, "pid": project_id},
264
+ )
265
+ for old in stale:
266
+ self.store.clear_project(old["id"])
267
+ except Exception:
268
+ pass # best-effort cleanup
269
+
256
270
  self.store.upsert_project(project_id, root_path)
257
271
 
258
272
  for parse_chunk in self._chunked(parse_results, file_batch_size):
@@ -279,7 +293,7 @@ class JavaIndexer:
279
293
  "hash": file_digest,
280
294
  }
281
295
  )
282
- self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source))
296
+ self._update_meta_cache_entry(meta_cache, f_id, file_path, file_digest, len(source), imports=parsed.imports)
283
297
 
284
298
  for cls in parsed.classes:
285
299
  c_id = class_id(cls.fqcn, scope)
@@ -372,15 +386,31 @@ class JavaIndexer:
372
386
  class_methods[c_id][method.signature] = m_id
373
387
  files_indexed += 1
374
388
 
389
+ # Split writes into smaller transactions and recycle between each
390
+ # to prevent Kuzu WAL from exhausting the buffer pool on large
391
+ # incremental re-indexes (GH feedback: 1,604-file OOM).
392
+ if not full:
393
+ for clear_sub in self._chunked(file_rows, 10):
394
+ with self.store.transaction():
395
+ for row in clear_sub:
396
+ self.store.clear_file(row["id"])
397
+ self.store._recycle_conn()
375
398
  with self.store.transaction():
376
- for row in file_rows:
377
- if not full:
378
- self.store.clear_file(row["id"])
379
399
  self.store.upsert_files_batch(file_rows)
400
+ self.store._recycle_conn()
401
+ with self.store.transaction():
380
402
  self.store.upsert_classes_batch(class_rows)
381
- self.store.upsert_methods_batch(method_rows)
382
- self.store.upsert_symbols_batch(symbol_rows)
383
403
  self.store._recycle_conn()
404
+ _METHOD_SUB_BATCH = 200
405
+ for method_sub in self._chunked(method_rows, _METHOD_SUB_BATCH):
406
+ with self.store.transaction():
407
+ self.store.upsert_methods_batch(method_sub)
408
+ self.store._recycle_conn()
409
+ _SYMBOL_SUB_BATCH = 200
410
+ for symbol_sub in self._chunked(symbol_rows, _SYMBOL_SUB_BATCH):
411
+ with self.store.transaction():
412
+ self.store.upsert_symbols_batch(symbol_sub)
413
+ self.store._recycle_conn()
384
414
 
385
415
  self._emit(progress, "resolve_calls_start")
386
416
  call_rows: list[dict] = []
@@ -697,7 +727,10 @@ class JavaIndexer:
697
727
  return
698
728
 
699
729
  @staticmethod
700
- def _update_meta_cache_entry(meta_cache: dict[str, dict], fid: str, file_path: str, digest: str, size_hint: int) -> None:
730
+ def _update_meta_cache_entry(
731
+ meta_cache: dict[str, dict], fid: str, file_path: str, digest: str, size_hint: int,
732
+ imports: list[str] | None = None,
733
+ ) -> None:
701
734
  try:
702
735
  st = os.stat(file_path)
703
736
  mtime_ns = int(getattr(st, "st_mtime_ns", int(st.st_mtime * 1_000_000_000)))
@@ -705,7 +738,10 @@ class JavaIndexer:
705
738
  except OSError:
706
739
  mtime_ns = -1
707
740
  size = size_hint
708
- meta_cache[fid] = {"mtime_ns": mtime_ns, "size": size, "hash": digest}
741
+ entry: dict = {"mtime_ns": mtime_ns, "size": size, "hash": digest}
742
+ if imports is not None:
743
+ entry["imports"] = imports
744
+ meta_cache[fid] = entry
709
745
 
710
746
  @staticmethod
711
747
  def _prune_meta_cache(meta_cache: dict[str, dict], current_file_ids: set[str]) -> None:
@@ -728,3 +764,76 @@ class JavaIndexer:
728
764
  return normalized.split("/src/", 1)[0]
729
765
  scope = os.path.dirname(normalized).strip()
730
766
  return scope or "."
767
+
768
+ @staticmethod
769
+ def detect_unresolved_imports(store) -> dict[str, list[str]]:
770
+ """Detect imports that reference packages not covered by any indexed project.
771
+
772
+ Returns a dict mapping unresolved base packages (e.g. "com.foo.bar")
773
+ to a list of sample import FQCNs. Useful for suggesting which sibling
774
+ projects to index.
775
+
776
+ Only reports project-internal packages (not java.*, javax.*, org.apache.*
777
+ etc.).
778
+ """
779
+ # 1. Collect all indexed class FQCNs
780
+ try:
781
+ recs = store.query_records("MATCH (c:Class) RETURN c.fqcn as fqcn")
782
+ except Exception:
783
+ return {}
784
+ indexed_fqcns = {r["fqcn"] for r in recs if r.get("fqcn")}
785
+ indexed_packages = set()
786
+ for fqcn in indexed_fqcns:
787
+ parts = fqcn.rsplit(".", 1)
788
+ if len(parts) == 2:
789
+ indexed_packages.add(parts[0])
790
+
791
+ # 2. Collect all imports from overlay + any stored file data
792
+ # Parse imports from the parsed file metadata if available
793
+ meta_dir = SETTINGS.index_meta_dir
794
+ all_imports: set[str] = set()
795
+ if os.path.isdir(meta_dir):
796
+ for fname in os.listdir(meta_dir):
797
+ if not fname.endswith(".json"):
798
+ continue
799
+ try:
800
+ with open(os.path.join(meta_dir, fname), "r") as f:
801
+ data = json.load(f)
802
+ for fid, fmeta in data.items():
803
+ for imp in fmeta.get("imports", []):
804
+ all_imports.add(imp)
805
+ except Exception:
806
+ pass
807
+
808
+ # 3. Also scan the DB for CALLS edges that reference unknown targets
809
+ # (lightweight — just check which classes were resolved vs not)
810
+
811
+ # 4. Filter: skip standard library / well-known third-party packages
812
+ _SKIP_PREFIXES = (
813
+ "java.", "javax.", "jakarta.",
814
+ "org.apache.", "org.springframework.", "org.hibernate.",
815
+ "org.slf4j.", "org.junit.", "org.mockito.",
816
+ "com.google.", "com.fasterxml.", "com.sun.",
817
+ "io.micrometer.", "io.netty.", "io.lettuce.",
818
+ "lombok.", "reactor.", "rx.",
819
+ )
820
+
821
+ unresolved: dict[str, list[str]] = {}
822
+ for imp in all_imports:
823
+ if any(imp.startswith(prefix) for prefix in _SKIP_PREFIXES):
824
+ continue
825
+ # Check if this import's class exists in the index
826
+ simple_name = imp.rsplit(".", 1)[-1]
827
+ pkg = imp.rsplit(".", 1)[0] if "." in imp else ""
828
+ if imp in indexed_fqcns:
829
+ continue
830
+ if pkg in indexed_packages:
831
+ continue # same package, just not this specific class
832
+ # Group by top 3 package segments
833
+ parts = imp.split(".")
834
+ base_pkg = ".".join(parts[:min(3, len(parts))])
835
+ if base_pkg not in unresolved:
836
+ unresolved[base_pkg] = []
837
+ if len(unresolved[base_pkg]) < 5:
838
+ unresolved[base_pkg].append(imp)
839
+ return unresolved