codespine 0.9.4__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {codespine-0.9.4 → codespine-0.9.6}/PKG-INFO +1 -1
  2. {codespine-0.9.4 → codespine-0.9.6}/codespine/__init__.py +1 -1
  3. {codespine-0.9.4 → codespine-0.9.6}/codespine/cli.py +109 -31
  4. {codespine-0.9.4 → codespine-0.9.6}/codespine/config.py +9 -0
  5. {codespine-0.9.4 → codespine-0.9.6}/codespine/db/store.py +76 -50
  6. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/call_resolver.py +1 -1
  7. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/engine.py +28 -14
  8. codespine-0.9.6/codespine/sharding/__init__.py +9 -0
  9. codespine-0.9.6/codespine/sharding/router.py +123 -0
  10. codespine-0.9.6/codespine/sharding/store.py +312 -0
  11. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/PKG-INFO +1 -1
  12. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/SOURCES.txt +3 -0
  13. {codespine-0.9.4 → codespine-0.9.6}/pyproject.toml +1 -1
  14. {codespine-0.9.4 → codespine-0.9.6}/LICENSE +0 -0
  15. {codespine-0.9.4 → codespine-0.9.6}/README.md +0 -0
  16. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/__init__.py +0 -0
  17. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/community.py +0 -0
  18. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/context.py +0 -0
  19. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/coupling.py +0 -0
  20. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/crossmodule.py +0 -0
  21. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/deadcode.py +0 -0
  22. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/flow.py +0 -0
  23. {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/impact.py +0 -0
  24. {codespine-0.9.4 → codespine-0.9.6}/codespine/db/__init__.py +0 -0
  25. {codespine-0.9.4 → codespine-0.9.6}/codespine/db/schema.py +0 -0
  26. {codespine-0.9.4 → codespine-0.9.6}/codespine/diff/__init__.py +0 -0
  27. {codespine-0.9.4 → codespine-0.9.6}/codespine/diff/branch_diff.py +0 -0
  28. {codespine-0.9.4 → codespine-0.9.6}/codespine/guide.py +0 -0
  29. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/__init__.py +0 -0
  30. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/di_resolver.py +0 -0
  31. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/java_parser.py +0 -0
  32. {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/symbol_builder.py +0 -0
  33. {codespine-0.9.4 → codespine-0.9.6}/codespine/mcp/__init__.py +0 -0
  34. {codespine-0.9.4 → codespine-0.9.6}/codespine/mcp/server.py +0 -0
  35. {codespine-0.9.4 → codespine-0.9.6}/codespine/noise/__init__.py +0 -0
  36. {codespine-0.9.4 → codespine-0.9.6}/codespine/noise/blocklist.py +0 -0
  37. {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/__init__.py +0 -0
  38. {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/git_state.py +0 -0
  39. {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/merge.py +0 -0
  40. {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/store.py +0 -0
  41. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/__init__.py +0 -0
  42. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/bm25.py +0 -0
  43. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/fuzzy.py +0 -0
  44. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/hybrid.py +0 -0
  45. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/rrf.py +0 -0
  46. {codespine-0.9.4 → codespine-0.9.6}/codespine/search/vector.py +0 -0
  47. {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/__init__.py +0 -0
  48. {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/git_hook.py +0 -0
  49. {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/watcher.py +0 -0
  50. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/dependency_links.txt +0 -0
  51. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/entry_points.txt +0 -0
  52. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/requires.txt +0 -0
  53. {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/top_level.txt +0 -0
  54. {codespine-0.9.4 → codespine-0.9.6}/gindex.py +0 -0
  55. {codespine-0.9.4 → codespine-0.9.6}/setup.cfg +0 -0
  56. {codespine-0.9.4 → codespine-0.9.6}/tests/test_branch_diff_normalize.py +0 -0
  57. {codespine-0.9.4 → codespine-0.9.6}/tests/test_call_resolver.py +0 -0
  58. {codespine-0.9.4 → codespine-0.9.6}/tests/test_community_detection.py +0 -0
  59. {codespine-0.9.4 → codespine-0.9.6}/tests/test_deadcode.py +0 -0
  60. {codespine-0.9.4 → codespine-0.9.6}/tests/test_index_and_hybrid.py +0 -0
  61. {codespine-0.9.4 → codespine-0.9.6}/tests/test_java_parser.py +0 -0
  62. {codespine-0.9.4 → codespine-0.9.6}/tests/test_multimodule_index.py +0 -0
  63. {codespine-0.9.4 → codespine-0.9.6}/tests/test_overlay.py +0 -0
  64. {codespine-0.9.4 → codespine-0.9.6}/tests/test_search_ranking.py +0 -0
  65. {codespine-0.9.4 → codespine-0.9.6}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.4
3
+ Version: 0.9.6
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.9.4"
4
+ __version__ = "0.9.6"
@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
20
20
  from codespine.analysis.impact import analyze_impact
21
21
  from codespine.config import SETTINGS
22
22
  from codespine.db.store import GraphStore
23
+ from codespine.sharding import ShardedGraphStore, ShardRouter
23
24
  from codespine.diff.branch_diff import compare_branches
24
25
  from codespine.indexer.engine import JavaIndexer
25
26
  from codespine.mcp.server import build_mcp_server
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
90
91
  return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
91
92
 
92
93
 
94
def _show_shard_topology(as_json: bool) -> None:
    """Display the current shard routing topology and imbalance metrics.

    Projects are grouped by the shard the store's own router assigns them to,
    so the report matches where ``analyse`` and ``stats`` actually route data.

    The imbalance ratio is the busiest shard's project count divided by a
    baseline load. The baseline is the (upper) median shard load; when that
    median is zero but projects exist, the baseline falls back to the best
    achievable maximum load, ``ceil(total / num_shards)``, so that a layout
    with every project on one shard is reported as imbalanced rather than
    as a perfect 1.0.

    Args:
        as_json: When true, emit the topology, the per-shard project lists and
            the rounded imbalance ratio as JSON and print nothing else.
    """
    sg = ShardedGraphStore(read_only=True)
    # Use the store's router (as analyse/stats do) rather than building a
    # second one, so the displayed mapping cannot diverge from real routing.
    router = sg.router
    topology = sg.describe()

    # Gather project → shard mapping from all shards.
    projects_by_shard: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
    for meta in sg.list_project_metadata():
        pid = meta.get("id", "")
        # setdefault guards against a shard index outside range(num_shards).
        projects_by_shard.setdefault(router.shard_for(pid), []).append(pid)

    counts = [len(pids) for pids in projects_by_shard.values()]
    total = sum(counts)
    max_count = max(counts, default=0)
    median = sorted(counts)[len(counts) // 2] if counts else 0
    if median:
        baseline = median
    elif counts:
        # Median shard is empty: compare against the ideal max load instead.
        baseline = (total + len(counts) - 1) // len(counts)
    else:
        baseline = 0
    imbalance = (max_count / baseline) if baseline else 1.0

    if as_json:
        _echo_json({
            "topology": topology,
            "project_distribution": {str(k): v for k, v in projects_by_shard.items()},
            "imbalance_ratio": round(imbalance, 2),
        }, as_json=True)
        return

    # Derive virtual nodes per shard from the ring itself instead of assuming
    # a fixed count, so the two numbers on this line always agree.
    ring_size = len(router._ring)
    vnodes_per_shard = ring_size // router.num_shards if router.num_shards else 0

    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
    click.echo(f" Directory : {router.shards_dir}")
    click.echo(f" Ring size : {ring_size} virtual nodes ({router.num_shards} × {vnodes_per_shard})")
    click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
    click.echo()
    header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
    click.secho(header, fg="cyan")
    click.echo("-" * 60)
    for i, info in enumerate(topology.get("shards", [])):
        plist = projects_by_shard.get(i, [])
        exists_str = "yes" if info.get("exists") else "no"
        click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
        for pid in plist:
            click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
    if imbalance > 2.0:
        click.secho(
            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
            fg="yellow",
        )
140
+
141
+
93
142
  @click.group()
94
143
  def main() -> None:
95
144
  """CodeSpine CLI."""
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
130
179
  fg="yellow",
131
180
  )
132
181
 
133
- store = GraphStore(read_only=False)
134
- indexer = JavaIndexer(store)
182
+ # ShardedGraphStore routes each project to its dedicated DB shard.
183
+ # For single-project analysis this is transparent — shard() always
184
+ # returns a GraphStore pointing to the correct shard path.
185
+ sg = ShardedGraphStore(read_only=False)
186
+ # The indexer is initialised per-module below with the right shard store.
187
+ # We keep a single ShardedGraphStore to fan-out cross-module linking later.
135
188
 
136
189
  # --- Workspace → project → module detection ---
137
190
  # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
241
294
  last_result = None
242
295
  for idx, (module_path, project_id) in enumerate(modules_with_ids):
243
296
  if is_multi:
297
+ shard_idx = sg.router.shard_for(project_id)
244
298
  click.echo()
245
- click.secho(f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}", fg="cyan")
299
+ click.secho(
300
+ f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
301
+ fg="cyan",
302
+ )
246
303
  _reset_state()
304
+ # Use the shard store for this project so data lands in the right DB.
305
+ shard_store = sg.shard(project_id)
306
+ indexer = JavaIndexer(shard_store)
247
307
  last_result = indexer.index_project(
248
308
  module_path, full=full, progress=_progress, project_id=project_id, embed=embed
249
309
  )
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
264
324
  """Finalise an in-place phase line and move to the next line."""
265
325
  click.echo(f"\r✓ {label:<28} {result:<48}")
266
326
 
327
+ # For cross-module operations (cross-module linking, deep analysis, stats)
328
+ # we use the shard store for the root project (all modules share one shard).
329
+ root_project_id = last_result.project_id if last_result else root_basename
330
+ root_shard_store = sg.shard(root_project_id)
331
+
267
332
  # ── Cross-module call linking ──────────────────────────────────────
268
333
  if is_multi and len(modules_with_ids) > 1:
269
334
  xmod_label = "Cross-module linking..."
270
335
  _live_phase(xmod_label, "running")
271
336
  xmod_pids = [pid for _, pid in modules_with_ids]
272
337
  xmod_edges = link_cross_module_calls(
273
- store, project_ids=xmod_pids,
338
+ root_shard_store, project_ids=xmod_pids,
274
339
  progress=lambda s: _live_phase(xmod_label, s),
275
340
  )
276
341
  _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
287
352
  comm_label = "Detecting communities..."
288
353
  _live_phase(comm_label, "running")
289
354
  communities = detect_communities(
290
- store,
355
+ root_shard_store,
291
356
  progress=lambda s: _live_phase(comm_label, s),
292
357
  )
293
358
  _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
295
360
  flow_label = "Detecting execution flows..."
296
361
  _live_phase(flow_label, "running")
297
362
  flows = trace_execution_flows(
298
- store,
363
+ root_shard_store,
299
364
  progress=lambda s: _live_phase(flow_label, s),
300
365
  )
301
366
  _finish_phase(flow_label, f"{len(flows)} processes found")
302
367
 
303
368
  dead_label = "Finding dead code..."
304
369
  _live_phase(dead_label, "running")
305
- dead = detect_dead_code(store, limit=500)
370
+ dead = detect_dead_code(root_shard_store, limit=500)
306
371
  _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
307
372
 
308
373
  coup_label = "Analyzing git history..."
309
374
  _live_phase(coup_label, "running")
310
- store.clear_coupling()
375
+ root_shard_store.clear_coupling()
311
376
  coupling_root = abs_path
312
377
  coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
313
378
  coupling_pairs = compute_coupling(
314
- store,
379
+ root_shard_store,
315
380
  coupling_root,
316
381
  coupling_project,
317
382
  days=SETTINGS.default_coupling_days,
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
329
394
  flow_label = "Detecting execution flows..."
330
395
  _live_phase(flow_label, "running (lightweight)")
331
396
  try:
332
- flows = trace_execution_flows(store, max_depth=3)
397
+ flows = trace_execution_flows(root_shard_store, max_depth=3)
333
398
  except Exception:
334
399
  flows = []
335
400
  _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
337
402
  dead_label = "Finding dead code..."
338
403
  _live_phase(dead_label, "running (lightweight)")
339
404
  try:
340
- dead = detect_dead_code(store, limit=100)
405
+ dead = detect_dead_code(root_shard_store, limit=100)
341
406
  except Exception:
342
407
  dead = []
343
408
  _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
344
409
 
345
410
  _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
346
411
 
347
- vector_count = store.query_records(
412
+ vector_count = root_shard_store.query_records(
348
413
  """
349
414
  MATCH (s:Symbol)
350
415
  WHERE s.embedding IS NOT NULL
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
355
420
  vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
356
421
  _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
357
422
 
358
- symbol_count = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
359
- edge_count = store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
423
+ symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
424
+ edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
360
425
  symbols = int(symbol_count[0]["count"]) if symbol_count else 0
361
426
  edges = int(edge_count[0]["count"]) if edge_count else 0
362
427
  elapsed = time.perf_counter() - started
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
376
441
 
377
442
  # Detect unresolved imports → hint about unindexed sibling projects
378
443
  try:
379
- unresolved = JavaIndexer.detect_unresolved_imports(store)
444
+ unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
380
445
  if unresolved:
381
446
  click.echo()
382
447
  click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
387
452
 
388
453
  # Publish a read replica so MCP and read-only CLI commands (search, stats…)
389
454
  # run against an isolated snapshot rather than competing with the write
390
- # process's buffer pool. The MCP daemon detects the sentinel file and
391
- # hot-reloads without restarting.
455
+ # process's buffer pool. Snapshot all open shards concurrently.
392
456
  snap_label = "Publishing read replica..."
393
457
  _live_phase(snap_label, "copying")
394
- store._recycle_conn()
395
- snapped = GraphStore.snapshot_to_read_replica()
396
- _finish_phase(snap_label, "MCP will reload automatically" if snapped else "skipped (source DB not found)")
458
+ root_shard_store._recycle_conn()
459
+ sg.snapshot_all(background=False)
460
+ _finish_phase(snap_label, "MCP will reload automatically")
397
461
 
398
462
 
399
463
  @main.command()
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
523
587
 
524
588
  @main.command()
525
589
  @click.option("--json", "as_json", is_flag=True)
526
- def stats(as_json: bool) -> None:
590
+ @click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
591
+ def stats(as_json: bool, show_shards: bool) -> None:
527
592
  """Show per-project and aggregate graph statistics."""
528
- store = GraphStore(read_only=True)
529
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path ORDER BY p.id")
593
+ if show_shards:
594
+ _show_shard_topology(as_json)
595
+ return
596
+
597
+ # Fan-out across all shards so stats covers every project in the cluster.
598
+ sg = ShardedGraphStore(read_only=True)
599
+ all_projects_meta = sg.list_project_metadata()
600
+
601
+ # For detailed stats we need the per-project shard store.
602
+ def _project_store(pid: str):
603
+ return sg.shard(pid)
604
+
530
605
  if not projects:
531
606
  click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
532
607
  return
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
534
609
  rows = []
535
610
  for p in projects:
536
611
  pid = p["id"]
537
- files = store.query_records(
612
+ # Route each query to the project's owning shard.
613
+ ps = _project_store(pid)
614
+ files = ps.query_records(
538
615
  "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
539
616
  )
540
- classes = store.query_records(
617
+ classes = ps.query_records(
541
618
  """
542
619
  MATCH (f:File) WHERE f.project_id = $pid
543
620
  WITH f
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
546
623
  """,
547
624
  {"pid": pid},
548
625
  )
549
- methods = store.query_records(
626
+ methods = ps.query_records(
550
627
  """
551
628
  MATCH (f:File) WHERE f.project_id = $pid
552
629
  WITH f
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
557
634
  """,
558
635
  {"pid": pid},
559
636
  )
560
- calls = store.query_records(
637
+ calls = ps.query_records(
561
638
  """
562
639
  MATCH (f:File) WHERE f.project_id = $pid
563
640
  WITH f
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
568
645
  """,
569
646
  {"pid": pid},
570
647
  )
571
- emb = store.query_records(
648
+ emb = ps.query_records(
572
649
  """
573
650
  MATCH (f:File) WHERE f.project_id = $pid
574
651
  WITH f
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
580
657
  rows.append({
581
658
  "project": pid,
582
659
  "path": p["path"],
660
+ "shard": sg.router.shard_for(pid),
583
661
  "files": files[0]["n"] if files else 0,
584
662
  "classes": classes[0]["n"] if classes else 0,
585
663
  "methods": methods[0]["n"] if methods else 0,
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
592
670
  return
593
671
 
594
672
  col_w = max(len(r["project"]) for r in rows)
595
- header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
673
+ header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
596
674
  click.secho(header, fg="cyan")
597
675
  click.echo("-" * len(header))
598
676
  total_files = total_classes = total_methods = total_calls = total_emb = 0
599
677
  for r in rows:
600
678
  click.echo(
601
- f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
679
+ f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
602
680
  )
603
681
  total_files += r["files"]
604
682
  total_classes += r["classes"]
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
608
686
  if len(rows) > 1:
609
687
  click.echo("-" * len(header))
610
688
  click.secho(
611
- f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
689
+ f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
612
690
  fg="green",
613
691
  )
614
692
 
@@ -4,8 +4,17 @@ from dataclasses import dataclass
4
4
 
5
5
  @dataclass(frozen=True)
6
6
  class Settings:
7
+ # Legacy single-DB paths — kept for backward compat and as defaults when
8
+ # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
7
9
  db_path: str = os.path.expanduser("~/.codespine_db")
8
10
  db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
11
+
12
+ # Sharding — new layout stores each shard under shards_dir/{N}/db
13
+ # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
14
+ # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
15
+ num_shards: int = 4
16
+ shards_dir: str = os.path.expanduser("~/.codespine/shards")
17
+
9
18
  pid_file: str = os.path.expanduser("~/.codespine.pid")
10
19
  log_file: str = os.path.expanduser("~/.codespine.log")
11
20
  embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
@@ -8,7 +8,7 @@ import shutil
8
8
  import threading
9
9
  import time
10
10
  from contextlib import contextmanager
11
- from dataclasses import dataclass
11
+ from dataclasses import InitVar, dataclass
12
12
  from typing import Any
13
13
 
14
14
  import kuzu
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
39
39
  @dataclass
40
40
  class GraphStore:
41
41
  read_only: bool = False
42
+ # Optional path overrides — when provided, the store uses these paths
43
+ # instead of the global SETTINGS values. The ShardedGraphStore uses
44
+ # this to give each shard its own isolated KùzuDB directory.
45
+ db_path_override: InitVar[str | None] = None
46
+ snapshot_path_override: InitVar[str | None] = None
47
+
48
+ def __post_init__(
49
+ self,
50
+ db_path_override: str | None,
51
+ snapshot_path_override: str | None,
52
+ ) -> None:
53
+ # Resolve effective paths — per-shard overrides win over global SETTINGS.
54
+ self._db_path: str = db_path_override or SETTINGS.db_path
55
+ self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
56
+
57
+ # Per-instance snapshot synchronisation (not class-level) so that
58
+ # multiple shards can snapshot concurrently without a shared bottleneck.
59
+ self._inst_snapshot_lock: threading.Lock = threading.Lock()
60
+ self._inst_snapshot_pending: threading.Event = threading.Event()
42
61
 
43
- def __post_init__(self) -> None:
44
62
  self._tls: threading.local = threading.local()
45
63
  from codespine.overlay.store import OverlayStore
46
64
 
@@ -48,10 +66,10 @@ class GraphStore:
48
66
 
49
67
  # Read-only callers (MCP, CLI reads) use the read replica when available.
50
68
  # This isolates them from the write process's buffer pool and WAL churn.
51
- if self.read_only and os.path.exists(SETTINGS.db_snapshot_path):
52
- db_path = SETTINGS.db_snapshot_path
69
+ if self.read_only and os.path.exists(self._snapshot_path):
70
+ db_path = self._snapshot_path
53
71
  else:
54
- db_path = SETTINGS.db_path
72
+ db_path = self._db_path
55
73
 
56
74
  try:
57
75
  self.db = self._open_with_recovery(db_path)
@@ -97,7 +115,7 @@ class GraphStore:
97
115
  try:
98
116
  ensure_schema(self._conn())
99
117
  except Exception as exc:
100
- path = getattr(self.db, "database_path", SETTINGS.db_path)
118
+ path = getattr(self.db, "database_path", self._db_path)
101
119
  if not self._is_recoverable_db_error(exc):
102
120
  raise
103
121
  LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
@@ -527,15 +545,27 @@ class GraphStore:
527
545
  rows = [{"source_id": r["source_id"], "target_id": r["target_id"],
528
546
  "confidence": float(r["confidence"]), "reason": r["reason"]}
529
547
  for r in records]
530
- op = "CREATE" if create_mode else "MERGE"
531
- self.execute(
532
- f"""
533
- UNWIND $rows AS row
534
- MATCH (src:Method {{id: row.source_id}}), (dst:Method {{id: row.target_id}})
535
- {op} (src)-[:CALLS {{confidence: row.confidence, reason: row.reason}}]->(dst)
536
- """,
537
- {"rows": rows},
538
- )
548
+ if create_mode:
549
+ self.execute(
550
+ """
551
+ UNWIND $rows AS row
552
+ MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
553
+ CREATE (src)-[:CALLS {confidence: row.confidence, reason: row.reason}]->(dst)
554
+ """,
555
+ {"rows": rows},
556
+ )
557
+ else:
558
+ # Properties are SET, not part of the MERGE pattern — ensures at most
559
+ # one CALLS edge per (src, dst) pair regardless of confidence value.
560
+ self.execute(
561
+ """
562
+ UNWIND $rows AS row
563
+ MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
564
+ MERGE (src)-[r:CALLS]->(dst)
565
+ SET r.confidence = row.confidence, r.reason = row.reason
566
+ """,
567
+ {"rows": rows},
568
+ )
539
569
 
540
570
  def add_reference(self, rel: str, src_label: str, src_id: str, dst_label: str, dst_id: str, confidence: float) -> None:
541
571
  if rel not in {"REFERENCES_TYPE", "IMPLEMENTS", "OVERRIDES"}:
@@ -756,8 +786,7 @@ class GraphStore:
756
786
  self.clear_flows()
757
787
  self.clear_coupling()
758
788
 
759
- @staticmethod
760
- def force_delete_all_data() -> list[str]:
789
+ def force_delete_all_data(self) -> list[str]:
761
790
  """Delete all CodeSpine data files without touching the Kuzu engine.
762
791
 
763
792
  This is the nuclear option for OOM recovery: when the buffer pool is
@@ -767,12 +796,14 @@ class GraphStore:
767
796
 
768
797
  Returns the list of paths that were removed.
769
798
  """
799
+ db_path = self._db_path
800
+ snapshot_path = self._snapshot_path
770
801
  removed: list[str] = []
771
802
  for path in [
772
- SETTINGS.db_path,
773
- SETTINGS.db_snapshot_path,
774
- SETTINGS.db_snapshot_path + ".updated",
775
- SETTINGS.db_snapshot_path + ".tmp",
803
+ db_path,
804
+ snapshot_path,
805
+ snapshot_path + ".updated",
806
+ snapshot_path + ".tmp",
776
807
  SETTINGS.embedding_cache_path,
777
808
  SETTINGS.overlay_dir,
778
809
  SETTINGS.index_meta_dir,
@@ -789,7 +820,7 @@ class GraphStore:
789
820
  pass
790
821
  # Also remove any stale WAL files next to the DB
791
822
  for suffix in (".wal", ".lock"):
792
- wal_path = SETTINGS.db_path + suffix
823
+ wal_path = db_path + suffix
793
824
  if os.path.exists(wal_path):
794
825
  try:
795
826
  os.remove(wal_path)
@@ -800,7 +831,7 @@ class GraphStore:
800
831
 
801
832
  def rebuild_empty_db(self) -> None:
802
833
  self._recycle_conn()
803
- path = SETTINGS.db_path
834
+ path = self._db_path
804
835
  # Remove the DB directory AND any stale WAL / lock files
805
836
  self._remove_db_path(path)
806
837
  for suffix in (".wal", ".lock"):
@@ -813,11 +844,8 @@ class GraphStore:
813
844
 
814
845
  # Also remove the read replica so that read-only callers (stats, MCP)
815
846
  # don't continue to see stale data from before the wipe.
816
- for stale in [
817
- SETTINGS.db_snapshot_path,
818
- SETTINGS.db_snapshot_path + ".tmp",
819
- SETTINGS.db_snapshot_path + ".updated",
820
- ]:
847
+ snap = self._snapshot_path
848
+ for stale in [snap, snap + ".tmp", snap + ".updated"]:
821
849
  self._remove_db_path(stale)
822
850
 
823
851
  # Kuzu may retain stale internal state from a previous failed open of
@@ -914,18 +942,15 @@ class GraphStore:
914
942
  },
915
943
  )
916
944
 
917
- # Lock and flag for background snapshot coalescing.
918
- # Only one snapshot runs at a time; a pending request supersedes queued ones.
919
- _snapshot_lock: threading.Lock = threading.Lock()
920
- _snapshot_pending: threading.Event = threading.Event()
921
-
922
- @staticmethod
923
- def snapshot_to_read_replica(background: bool = False) -> bool:
945
+ def snapshot_to_read_replica(self, background: bool = False) -> bool:
924
946
  """Atomically copy the write DB to the read-replica path.
925
947
 
926
948
  The read replica is used by the MCP daemon and all read-only CLI
927
949
  commands so they never contend with the write process's buffer pool.
928
950
 
951
+ Each GraphStore instance has its own snapshot lock so that multiple
952
+ shards can snapshot concurrently without serialising on a class lock.
953
+
929
954
  Parameters
930
955
  ----------
931
956
  background:
@@ -938,36 +963,38 @@ class GraphStore:
938
963
  Returns True on success (or when dispatched to background), False if
939
964
  the source DB does not exist.
940
965
  """
941
- src = SETTINGS.db_path
966
+ src = self._db_path
942
967
  if not os.path.exists(src):
943
968
  return False
944
969
 
945
970
  if background:
946
971
  # Signal that a snapshot is wanted, then ensure a worker is running.
947
- GraphStore._snapshot_pending.set()
972
+ self._inst_snapshot_pending.set()
973
+ inst = self # capture for closure
948
974
 
949
975
  def _worker() -> None:
950
- while GraphStore._snapshot_pending.is_set():
951
- GraphStore._snapshot_pending.clear()
952
- with GraphStore._snapshot_lock:
953
- GraphStore._do_snapshot()
976
+ while inst._inst_snapshot_pending.is_set():
977
+ inst._inst_snapshot_pending.clear()
978
+ with inst._inst_snapshot_lock:
979
+ inst._do_snapshot()
954
980
 
955
- if not GraphStore._snapshot_lock.locked():
981
+ if not self._inst_snapshot_lock.locked():
956
982
  t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
957
983
  t.start()
958
984
  return True
959
985
 
960
986
  # Foreground (blocking) path — used by CLI analyse and tests.
961
- with GraphStore._snapshot_lock:
962
- return GraphStore._do_snapshot()
987
+ with self._inst_snapshot_lock:
988
+ return self._do_snapshot()
963
989
 
964
- @staticmethod
965
- def _do_snapshot() -> bool:
966
- """Perform the actual copy. Must be called with _snapshot_lock held."""
967
- src = SETTINGS.db_path
968
- dst = SETTINGS.db_snapshot_path
990
+ def _do_snapshot(self) -> bool:
991
+ """Perform the actual copy. Must be called with the instance snapshot lock held."""
992
+ src = self._db_path
993
+ dst = self._snapshot_path
969
994
  if not os.path.exists(src):
970
995
  return False
996
+ # Ensure the parent directory for the replica exists (shards layout).
997
+ os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
971
998
  tmp = dst + ".tmp"
972
999
  try:
973
1000
  if os.path.exists(tmp):
@@ -975,7 +1002,6 @@ class GraphStore:
975
1002
  if os.path.isdir(src):
976
1003
  shutil.copytree(src, tmp)
977
1004
  else:
978
- os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
979
1005
  shutil.copy2(src, tmp)
980
1006
  if os.path.exists(dst):
981
1007
  shutil.rmtree(dst, ignore_errors=True)
@@ -5,7 +5,7 @@ from typing import Iterator
5
5
 
6
6
  from codespine.noise.blocklist import MIN_FUZZY_NAME_LEN, NOISE_METHOD_NAMES
7
7
 
8
- MAX_FUZZY_TARGETS = 12
8
+ MAX_FUZZY_TARGETS = 6 # reduced from 12 — keeps precision, halves low-confidence edge fan-out
9
9
 
10
10
 
11
11
  def _simple_type_name(type_name: str | None) -> str:
@@ -221,7 +221,7 @@ class JavaIndexer:
221
221
  calls_resolved = 0
222
222
  type_relationships = 0
223
223
  file_batch_size = max(1, int(getattr(SETTINGS, "index_file_batch_size", 64)))
224
- edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size", 2000)))
224
+ edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size", 5000)))
225
225
 
226
226
  if not full:
227
227
  method_catalog, class_catalog, fqcn_to_class_ids, class_methods = (
@@ -480,22 +480,36 @@ class JavaIndexer:
480
480
  self.store._recycle_conn()
481
481
 
482
482
  self._emit(progress, "resolve_calls_start")
483
- call_rows: list[dict] = []
484
- for src, dst, confidence, reason in resolve_calls(method_catalog, method_calls, method_context, class_catalog):
485
- call_rows.append(
486
- {
487
- "source_id": src,
488
- "target_id": dst,
489
- "confidence": confidence,
490
- "reason": reason,
491
- }
483
+ # Deduplicate (src, dst) pairs — the same pair can appear many times when
484
+ # a method calls another method multiple times at different call sites.
485
+ # Keep the highest-confidence resolution to avoid N writes per pair.
486
+ best_calls: dict[tuple[str, str], tuple[float, str]] = {}
487
+ for src, dst, confidence, reason in resolve_calls(
488
+ method_catalog, method_calls, method_context, class_catalog
489
+ ):
490
+ key = (src, dst)
491
+ if key not in best_calls or confidence > best_calls[key][0]:
492
+ best_calls[key] = (confidence, reason)
493
+
494
+ # Stream writes in batches — never hold the full set in RAM.
495
+ call_buf: list[dict] = []
496
+ for (src, dst), (confidence, reason) in best_calls.items():
497
+ call_buf.append(
498
+ {"source_id": src, "target_id": dst,
499
+ "confidence": confidence, "reason": reason}
492
500
  )
493
- for call_chunk in self._chunked(call_rows, edge_batch_size):
501
+ if len(call_buf) >= edge_batch_size:
502
+ with self.store.transaction():
503
+ self.store.add_calls_batch(call_buf)
504
+ calls_resolved += len(call_buf)
505
+ self.store._recycle_conn()
506
+ self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
507
+ call_buf = []
508
+ if call_buf:
494
509
  with self.store.transaction():
495
- self.store.add_calls_batch(call_chunk)
496
- calls_resolved += len(call_chunk)
510
+ self.store.add_calls_batch(call_buf)
511
+ calls_resolved += len(call_buf)
497
512
  self.store._recycle_conn()
498
- self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
499
513
  self._emit(progress, "resolve_calls_done", calls_resolved=calls_resolved)
500
514
 
501
515
  self._emit(progress, "resolve_types_start")
@@ -0,0 +1,9 @@
1
+ """CodeSpine sharding package.
2
+
3
+ Exposes the consistent-hash router and the ShardedGraphStore facade.
4
+ """
5
+
6
+ from codespine.sharding.router import ShardRouter
7
+ from codespine.sharding.store import ShardedGraphStore
8
+
9
+ __all__ = ["ShardRouter", "ShardedGraphStore"]