codespine 0.9.5__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. {codespine-0.9.5 → codespine-0.9.6}/PKG-INFO +1 -1
  2. {codespine-0.9.5 → codespine-0.9.6}/codespine/__init__.py +1 -1
  3. {codespine-0.9.5 → codespine-0.9.6}/codespine/cli.py +109 -31
  4. {codespine-0.9.5 → codespine-0.9.6}/codespine/config.py +9 -0
  5. {codespine-0.9.5 → codespine-0.9.6}/codespine/db/store.py +55 -41
  6. codespine-0.9.6/codespine/sharding/__init__.py +9 -0
  7. codespine-0.9.6/codespine/sharding/router.py +123 -0
  8. codespine-0.9.6/codespine/sharding/store.py +312 -0
  9. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/PKG-INFO +1 -1
  10. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/SOURCES.txt +3 -0
  11. {codespine-0.9.5 → codespine-0.9.6}/pyproject.toml +1 -1
  12. {codespine-0.9.5 → codespine-0.9.6}/LICENSE +0 -0
  13. {codespine-0.9.5 → codespine-0.9.6}/README.md +0 -0
  14. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/__init__.py +0 -0
  15. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/community.py +0 -0
  16. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/context.py +0 -0
  17. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/coupling.py +0 -0
  18. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/crossmodule.py +0 -0
  19. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/deadcode.py +0 -0
  20. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/flow.py +0 -0
  21. {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/impact.py +0 -0
  22. {codespine-0.9.5 → codespine-0.9.6}/codespine/db/__init__.py +0 -0
  23. {codespine-0.9.5 → codespine-0.9.6}/codespine/db/schema.py +0 -0
  24. {codespine-0.9.5 → codespine-0.9.6}/codespine/diff/__init__.py +0 -0
  25. {codespine-0.9.5 → codespine-0.9.6}/codespine/diff/branch_diff.py +0 -0
  26. {codespine-0.9.5 → codespine-0.9.6}/codespine/guide.py +0 -0
  27. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/__init__.py +0 -0
  28. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/call_resolver.py +0 -0
  29. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/di_resolver.py +0 -0
  30. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/engine.py +0 -0
  31. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/java_parser.py +0 -0
  32. {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/symbol_builder.py +0 -0
  33. {codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/__init__.py +0 -0
  34. {codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/server.py +0 -0
  35. {codespine-0.9.5 → codespine-0.9.6}/codespine/noise/__init__.py +0 -0
  36. {codespine-0.9.5 → codespine-0.9.6}/codespine/noise/blocklist.py +0 -0
  37. {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/__init__.py +0 -0
  38. {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/git_state.py +0 -0
  39. {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/merge.py +0 -0
  40. {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/store.py +0 -0
  41. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/__init__.py +0 -0
  42. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/bm25.py +0 -0
  43. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/fuzzy.py +0 -0
  44. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/hybrid.py +0 -0
  45. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/rrf.py +0 -0
  46. {codespine-0.9.5 → codespine-0.9.6}/codespine/search/vector.py +0 -0
  47. {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/__init__.py +0 -0
  48. {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/git_hook.py +0 -0
  49. {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/watcher.py +0 -0
  50. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/dependency_links.txt +0 -0
  51. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/entry_points.txt +0 -0
  52. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/requires.txt +0 -0
  53. {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/top_level.txt +0 -0
  54. {codespine-0.9.5 → codespine-0.9.6}/gindex.py +0 -0
  55. {codespine-0.9.5 → codespine-0.9.6}/setup.cfg +0 -0
  56. {codespine-0.9.5 → codespine-0.9.6}/tests/test_branch_diff_normalize.py +0 -0
  57. {codespine-0.9.5 → codespine-0.9.6}/tests/test_call_resolver.py +0 -0
  58. {codespine-0.9.5 → codespine-0.9.6}/tests/test_community_detection.py +0 -0
  59. {codespine-0.9.5 → codespine-0.9.6}/tests/test_deadcode.py +0 -0
  60. {codespine-0.9.5 → codespine-0.9.6}/tests/test_index_and_hybrid.py +0 -0
  61. {codespine-0.9.5 → codespine-0.9.6}/tests/test_java_parser.py +0 -0
  62. {codespine-0.9.5 → codespine-0.9.6}/tests/test_multimodule_index.py +0 -0
  63. {codespine-0.9.5 → codespine-0.9.6}/tests/test_overlay.py +0 -0
  64. {codespine-0.9.5 → codespine-0.9.6}/tests/test_search_ranking.py +0 -0
  65. {codespine-0.9.5 → codespine-0.9.6}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.5
3
+ Version: 0.9.6
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.9.5"
4
+ __version__ = "0.9.6"
@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
20
20
  from codespine.analysis.impact import analyze_impact
21
21
  from codespine.config import SETTINGS
22
22
  from codespine.db.store import GraphStore
23
+ from codespine.sharding import ShardedGraphStore, ShardRouter
23
24
  from codespine.diff.branch_diff import compare_branches
24
25
  from codespine.indexer.engine import JavaIndexer
25
26
  from codespine.mcp.server import build_mcp_server
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
90
91
  return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
91
92
 
92
93
 
94
+ def _show_shard_topology(as_json: bool) -> None:
95
+ """Display the current shard routing topology and imbalance metrics."""
96
+ router = ShardRouter()
97
+ sg = ShardedGraphStore(read_only=True)
98
+ topology = sg.describe()
99
+
100
+ # Gather project → shard mapping from all shards.
101
+ shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
102
+ for p in sg.list_project_metadata():
103
+ pid = p.get("id", "")
104
+ idx = router.shard_for(pid)
105
+ shard_project_counts[idx].append(pid)
106
+
107
+ counts = [len(v) for v in shard_project_counts.values()]
108
+ total = sum(counts)
109
+ median = sorted(counts)[len(counts) // 2] if counts else 0
110
+ max_count = max(counts) if counts else 0
111
+ imbalance = (max_count / median) if median else 1.0
112
+
113
+ if as_json:
114
+ _echo_json({
115
+ "topology": topology,
116
+ "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
117
+ "imbalance_ratio": round(imbalance, 2),
118
+ }, as_json=True)
119
+ return
120
+
121
+ click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
122
+ click.echo(f" Directory : {router.shards_dir}")
123
+ click.echo(f" Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {150})")
124
+ click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
125
+ click.echo()
126
+ header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
127
+ click.secho(header, fg="cyan")
128
+ click.echo("-" * 60)
129
+ for i, info in enumerate(topology.get("shards", [])):
130
+ plist = shard_project_counts.get(i, [])
131
+ exists_str = "yes" if info.get("exists") else "no"
132
+ click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
133
+ for pid in plist:
134
+ click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
135
+ if imbalance > 2.0:
136
+ click.secho(
137
+ f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
138
+ fg="yellow",
139
+ )
140
+
141
+
93
142
  @click.group()
94
143
  def main() -> None:
95
144
  """CodeSpine CLI."""
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
130
179
  fg="yellow",
131
180
  )
132
181
 
133
- store = GraphStore(read_only=False)
134
- indexer = JavaIndexer(store)
182
+ # ShardedGraphStore routes each project to its dedicated DB shard.
183
+ # For single-project analysis this is transparent — shard() always
184
+ # returns a GraphStore pointing to the correct shard path.
185
+ sg = ShardedGraphStore(read_only=False)
186
+ # The indexer is initialised per-module below with the right shard store.
187
+ # We keep a single ShardedGraphStore to fan-out cross-module linking later.
135
188
 
136
189
  # --- Workspace → project → module detection ---
137
190
  # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
241
294
  last_result = None
242
295
  for idx, (module_path, project_id) in enumerate(modules_with_ids):
243
296
  if is_multi:
297
+ shard_idx = sg.router.shard_for(project_id)
244
298
  click.echo()
245
- click.secho(f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}", fg="cyan")
299
+ click.secho(
300
+ f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
301
+ fg="cyan",
302
+ )
246
303
  _reset_state()
304
+ # Use the shard store for this project so data lands in the right DB.
305
+ shard_store = sg.shard(project_id)
306
+ indexer = JavaIndexer(shard_store)
247
307
  last_result = indexer.index_project(
248
308
  module_path, full=full, progress=_progress, project_id=project_id, embed=embed
249
309
  )
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
264
324
  """Finalise an in-place phase line and move to the next line."""
265
325
  click.echo(f"\r✓ {label:<28} {result:<48}")
266
326
 
327
+ # For cross-module operations (cross-module linking, deep analysis, stats)
328
+ # we use the shard store for the root project (all modules share one shard).
329
+ root_project_id = last_result.project_id if last_result else root_basename
330
+ root_shard_store = sg.shard(root_project_id)
331
+
267
332
  # ── Cross-module call linking ──────────────────────────────────────
268
333
  if is_multi and len(modules_with_ids) > 1:
269
334
  xmod_label = "Cross-module linking..."
270
335
  _live_phase(xmod_label, "running")
271
336
  xmod_pids = [pid for _, pid in modules_with_ids]
272
337
  xmod_edges = link_cross_module_calls(
273
- store, project_ids=xmod_pids,
338
+ root_shard_store, project_ids=xmod_pids,
274
339
  progress=lambda s: _live_phase(xmod_label, s),
275
340
  )
276
341
  _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
287
352
  comm_label = "Detecting communities..."
288
353
  _live_phase(comm_label, "running")
289
354
  communities = detect_communities(
290
- store,
355
+ root_shard_store,
291
356
  progress=lambda s: _live_phase(comm_label, s),
292
357
  )
293
358
  _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
295
360
  flow_label = "Detecting execution flows..."
296
361
  _live_phase(flow_label, "running")
297
362
  flows = trace_execution_flows(
298
- store,
363
+ root_shard_store,
299
364
  progress=lambda s: _live_phase(flow_label, s),
300
365
  )
301
366
  _finish_phase(flow_label, f"{len(flows)} processes found")
302
367
 
303
368
  dead_label = "Finding dead code..."
304
369
  _live_phase(dead_label, "running")
305
- dead = detect_dead_code(store, limit=500)
370
+ dead = detect_dead_code(root_shard_store, limit=500)
306
371
  _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
307
372
 
308
373
  coup_label = "Analyzing git history..."
309
374
  _live_phase(coup_label, "running")
310
- store.clear_coupling()
375
+ root_shard_store.clear_coupling()
311
376
  coupling_root = abs_path
312
377
  coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
313
378
  coupling_pairs = compute_coupling(
314
- store,
379
+ root_shard_store,
315
380
  coupling_root,
316
381
  coupling_project,
317
382
  days=SETTINGS.default_coupling_days,
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
329
394
  flow_label = "Detecting execution flows..."
330
395
  _live_phase(flow_label, "running (lightweight)")
331
396
  try:
332
- flows = trace_execution_flows(store, max_depth=3)
397
+ flows = trace_execution_flows(root_shard_store, max_depth=3)
333
398
  except Exception:
334
399
  flows = []
335
400
  _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
337
402
  dead_label = "Finding dead code..."
338
403
  _live_phase(dead_label, "running (lightweight)")
339
404
  try:
340
- dead = detect_dead_code(store, limit=100)
405
+ dead = detect_dead_code(root_shard_store, limit=100)
341
406
  except Exception:
342
407
  dead = []
343
408
  _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
344
409
 
345
410
  _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
346
411
 
347
- vector_count = store.query_records(
412
+ vector_count = root_shard_store.query_records(
348
413
  """
349
414
  MATCH (s:Symbol)
350
415
  WHERE s.embedding IS NOT NULL
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
355
420
  vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
356
421
  _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
357
422
 
358
- symbol_count = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
359
- edge_count = store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
423
+ symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
424
+ edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
360
425
  symbols = int(symbol_count[0]["count"]) if symbol_count else 0
361
426
  edges = int(edge_count[0]["count"]) if edge_count else 0
362
427
  elapsed = time.perf_counter() - started
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
376
441
 
377
442
  # Detect unresolved imports → hint about unindexed sibling projects
378
443
  try:
379
- unresolved = JavaIndexer.detect_unresolved_imports(store)
444
+ unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
380
445
  if unresolved:
381
446
  click.echo()
382
447
  click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
387
452
 
388
453
  # Publish a read replica so MCP and read-only CLI commands (search, stats…)
389
454
  # run against an isolated snapshot rather than competing with the write
390
- # process's buffer pool. The MCP daemon detects the sentinel file and
391
- # hot-reloads without restarting.
455
+ # process's buffer pool. Snapshot all open shards concurrently.
392
456
  snap_label = "Publishing read replica..."
393
457
  _live_phase(snap_label, "copying")
394
- store._recycle_conn()
395
- snapped = GraphStore.snapshot_to_read_replica()
396
- _finish_phase(snap_label, "MCP will reload automatically" if snapped else "skipped (source DB not found)")
458
+ root_shard_store._recycle_conn()
459
+ sg.snapshot_all(background=False)
460
+ _finish_phase(snap_label, "MCP will reload automatically")
397
461
 
398
462
 
399
463
  @main.command()
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
523
587
 
524
588
  @main.command()
525
589
  @click.option("--json", "as_json", is_flag=True)
526
- def stats(as_json: bool) -> None:
590
+ @click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
591
+ def stats(as_json: bool, show_shards: bool) -> None:
527
592
  """Show per-project and aggregate graph statistics."""
528
- store = GraphStore(read_only=True)
529
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path ORDER BY p.id")
593
+ if show_shards:
594
+ _show_shard_topology(as_json)
595
+ return
596
+
597
+ # Fan-out across all shards so stats covers every project in the cluster.
598
+ sg = ShardedGraphStore(read_only=True)
599
+ all_projects_meta = sg.list_project_metadata()
600
+
601
+ # For detailed stats we need the per-project shard store.
602
+ def _project_store(pid: str):
603
+ return sg.shard(pid)
604
+
530
605
  if not projects:
531
606
  click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
532
607
  return
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
534
609
  rows = []
535
610
  for p in projects:
536
611
  pid = p["id"]
537
- files = store.query_records(
612
+ # Route each query to the project's owning shard.
613
+ ps = _project_store(pid)
614
+ files = ps.query_records(
538
615
  "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
539
616
  )
540
- classes = store.query_records(
617
+ classes = ps.query_records(
541
618
  """
542
619
  MATCH (f:File) WHERE f.project_id = $pid
543
620
  WITH f
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
546
623
  """,
547
624
  {"pid": pid},
548
625
  )
549
- methods = store.query_records(
626
+ methods = ps.query_records(
550
627
  """
551
628
  MATCH (f:File) WHERE f.project_id = $pid
552
629
  WITH f
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
557
634
  """,
558
635
  {"pid": pid},
559
636
  )
560
- calls = store.query_records(
637
+ calls = ps.query_records(
561
638
  """
562
639
  MATCH (f:File) WHERE f.project_id = $pid
563
640
  WITH f
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
568
645
  """,
569
646
  {"pid": pid},
570
647
  )
571
- emb = store.query_records(
648
+ emb = ps.query_records(
572
649
  """
573
650
  MATCH (f:File) WHERE f.project_id = $pid
574
651
  WITH f
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
580
657
  rows.append({
581
658
  "project": pid,
582
659
  "path": p["path"],
660
+ "shard": sg.router.shard_for(pid),
583
661
  "files": files[0]["n"] if files else 0,
584
662
  "classes": classes[0]["n"] if classes else 0,
585
663
  "methods": methods[0]["n"] if methods else 0,
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
592
670
  return
593
671
 
594
672
  col_w = max(len(r["project"]) for r in rows)
595
- header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
673
+ header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
596
674
  click.secho(header, fg="cyan")
597
675
  click.echo("-" * len(header))
598
676
  total_files = total_classes = total_methods = total_calls = total_emb = 0
599
677
  for r in rows:
600
678
  click.echo(
601
- f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
679
+ f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
602
680
  )
603
681
  total_files += r["files"]
604
682
  total_classes += r["classes"]
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
608
686
  if len(rows) > 1:
609
687
  click.echo("-" * len(header))
610
688
  click.secho(
611
- f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
689
+ f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
612
690
  fg="green",
613
691
  )
614
692
 
@@ -4,8 +4,17 @@ from dataclasses import dataclass
4
4
 
5
5
  @dataclass(frozen=True)
6
6
  class Settings:
7
+ # Legacy single-DB paths — kept for backward compat and as defaults when
8
+ # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
7
9
  db_path: str = os.path.expanduser("~/.codespine_db")
8
10
  db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
11
+
12
+ # Sharding — new layout stores each shard under shards_dir/{N}/db
13
+ # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
14
+ # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
15
+ num_shards: int = 4
16
+ shards_dir: str = os.path.expanduser("~/.codespine/shards")
17
+
9
18
  pid_file: str = os.path.expanduser("~/.codespine.pid")
10
19
  log_file: str = os.path.expanduser("~/.codespine.log")
11
20
  embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
@@ -8,7 +8,7 @@ import shutil
8
8
  import threading
9
9
  import time
10
10
  from contextlib import contextmanager
11
- from dataclasses import dataclass
11
+ from dataclasses import InitVar, dataclass
12
12
  from typing import Any
13
13
 
14
14
  import kuzu
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
39
39
  @dataclass
40
40
  class GraphStore:
41
41
  read_only: bool = False
42
+ # Optional path overrides — when provided, the store uses these paths
43
+ # instead of the global SETTINGS values. The ShardedGraphStore uses
44
+ # this to give each shard its own isolated KùzuDB directory.
45
+ db_path_override: InitVar[str | None] = None
46
+ snapshot_path_override: InitVar[str | None] = None
47
+
48
+ def __post_init__(
49
+ self,
50
+ db_path_override: str | None,
51
+ snapshot_path_override: str | None,
52
+ ) -> None:
53
+ # Resolve effective paths — per-shard overrides win over global SETTINGS.
54
+ self._db_path: str = db_path_override or SETTINGS.db_path
55
+ self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
56
+
57
+ # Per-instance snapshot synchronisation (not class-level) so that
58
+ # multiple shards can snapshot concurrently without a shared bottleneck.
59
+ self._inst_snapshot_lock: threading.Lock = threading.Lock()
60
+ self._inst_snapshot_pending: threading.Event = threading.Event()
42
61
 
43
- def __post_init__(self) -> None:
44
62
  self._tls: threading.local = threading.local()
45
63
  from codespine.overlay.store import OverlayStore
46
64
 
@@ -48,10 +66,10 @@ class GraphStore:
48
66
 
49
67
  # Read-only callers (MCP, CLI reads) use the read replica when available.
50
68
  # This isolates them from the write process's buffer pool and WAL churn.
51
- if self.read_only and os.path.exists(SETTINGS.db_snapshot_path):
52
- db_path = SETTINGS.db_snapshot_path
69
+ if self.read_only and os.path.exists(self._snapshot_path):
70
+ db_path = self._snapshot_path
53
71
  else:
54
- db_path = SETTINGS.db_path
72
+ db_path = self._db_path
55
73
 
56
74
  try:
57
75
  self.db = self._open_with_recovery(db_path)
@@ -97,7 +115,7 @@ class GraphStore:
97
115
  try:
98
116
  ensure_schema(self._conn())
99
117
  except Exception as exc:
100
- path = getattr(self.db, "database_path", SETTINGS.db_path)
118
+ path = getattr(self.db, "database_path", self._db_path)
101
119
  if not self._is_recoverable_db_error(exc):
102
120
  raise
103
121
  LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
@@ -768,8 +786,7 @@ class GraphStore:
768
786
  self.clear_flows()
769
787
  self.clear_coupling()
770
788
 
771
- @staticmethod
772
- def force_delete_all_data() -> list[str]:
789
+ def force_delete_all_data(self) -> list[str]:
773
790
  """Delete all CodeSpine data files without touching the Kuzu engine.
774
791
 
775
792
  This is the nuclear option for OOM recovery: when the buffer pool is
@@ -779,12 +796,14 @@ class GraphStore:
779
796
 
780
797
  Returns the list of paths that were removed.
781
798
  """
799
+ db_path = self._db_path
800
+ snapshot_path = self._snapshot_path
782
801
  removed: list[str] = []
783
802
  for path in [
784
- SETTINGS.db_path,
785
- SETTINGS.db_snapshot_path,
786
- SETTINGS.db_snapshot_path + ".updated",
787
- SETTINGS.db_snapshot_path + ".tmp",
803
+ db_path,
804
+ snapshot_path,
805
+ snapshot_path + ".updated",
806
+ snapshot_path + ".tmp",
788
807
  SETTINGS.embedding_cache_path,
789
808
  SETTINGS.overlay_dir,
790
809
  SETTINGS.index_meta_dir,
@@ -801,7 +820,7 @@ class GraphStore:
801
820
  pass
802
821
  # Also remove any stale WAL files next to the DB
803
822
  for suffix in (".wal", ".lock"):
804
- wal_path = SETTINGS.db_path + suffix
823
+ wal_path = db_path + suffix
805
824
  if os.path.exists(wal_path):
806
825
  try:
807
826
  os.remove(wal_path)
@@ -812,7 +831,7 @@ class GraphStore:
812
831
 
813
832
  def rebuild_empty_db(self) -> None:
814
833
  self._recycle_conn()
815
- path = SETTINGS.db_path
834
+ path = self._db_path
816
835
  # Remove the DB directory AND any stale WAL / lock files
817
836
  self._remove_db_path(path)
818
837
  for suffix in (".wal", ".lock"):
@@ -825,11 +844,8 @@ class GraphStore:
825
844
 
826
845
  # Also remove the read replica so that read-only callers (stats, MCP)
827
846
  # don't continue to see stale data from before the wipe.
828
- for stale in [
829
- SETTINGS.db_snapshot_path,
830
- SETTINGS.db_snapshot_path + ".tmp",
831
- SETTINGS.db_snapshot_path + ".updated",
832
- ]:
847
+ snap = self._snapshot_path
848
+ for stale in [snap, snap + ".tmp", snap + ".updated"]:
833
849
  self._remove_db_path(stale)
834
850
 
835
851
  # Kuzu may retain stale internal state from a previous failed open of
@@ -926,18 +942,15 @@ class GraphStore:
926
942
  },
927
943
  )
928
944
 
929
- # Lock and flag for background snapshot coalescing.
930
- # Only one snapshot runs at a time; a pending request supersedes queued ones.
931
- _snapshot_lock: threading.Lock = threading.Lock()
932
- _snapshot_pending: threading.Event = threading.Event()
933
-
934
- @staticmethod
935
- def snapshot_to_read_replica(background: bool = False) -> bool:
945
+ def snapshot_to_read_replica(self, background: bool = False) -> bool:
936
946
  """Atomically copy the write DB to the read-replica path.
937
947
 
938
948
  The read replica is used by the MCP daemon and all read-only CLI
939
949
  commands so they never contend with the write process's buffer pool.
940
950
 
951
+ Each GraphStore instance has its own snapshot lock so that multiple
952
+ shards can snapshot concurrently without serialising on a class lock.
953
+
941
954
  Parameters
942
955
  ----------
943
956
  background:
@@ -950,36 +963,38 @@ class GraphStore:
950
963
  Returns True on success (or when dispatched to background), False if
951
964
  the source DB does not exist.
952
965
  """
953
- src = SETTINGS.db_path
966
+ src = self._db_path
954
967
  if not os.path.exists(src):
955
968
  return False
956
969
 
957
970
  if background:
958
971
  # Signal that a snapshot is wanted, then ensure a worker is running.
959
- GraphStore._snapshot_pending.set()
972
+ self._inst_snapshot_pending.set()
973
+ inst = self # capture for closure
960
974
 
961
975
  def _worker() -> None:
962
- while GraphStore._snapshot_pending.is_set():
963
- GraphStore._snapshot_pending.clear()
964
- with GraphStore._snapshot_lock:
965
- GraphStore._do_snapshot()
976
+ while inst._inst_snapshot_pending.is_set():
977
+ inst._inst_snapshot_pending.clear()
978
+ with inst._inst_snapshot_lock:
979
+ inst._do_snapshot()
966
980
 
967
- if not GraphStore._snapshot_lock.locked():
981
+ if not self._inst_snapshot_lock.locked():
968
982
  t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
969
983
  t.start()
970
984
  return True
971
985
 
972
986
  # Foreground (blocking) path — used by CLI analyse and tests.
973
- with GraphStore._snapshot_lock:
974
- return GraphStore._do_snapshot()
987
+ with self._inst_snapshot_lock:
988
+ return self._do_snapshot()
975
989
 
976
- @staticmethod
977
- def _do_snapshot() -> bool:
978
- """Perform the actual copy. Must be called with _snapshot_lock held."""
979
- src = SETTINGS.db_path
980
- dst = SETTINGS.db_snapshot_path
990
+ def _do_snapshot(self) -> bool:
991
+ """Perform the actual copy. Must be called with the instance snapshot lock held."""
992
+ src = self._db_path
993
+ dst = self._snapshot_path
981
994
  if not os.path.exists(src):
982
995
  return False
996
+ # Ensure the parent directory for the replica exists (shards layout).
997
+ os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
983
998
  tmp = dst + ".tmp"
984
999
  try:
985
1000
  if os.path.exists(tmp):
@@ -987,7 +1002,6 @@ class GraphStore:
987
1002
  if os.path.isdir(src):
988
1003
  shutil.copytree(src, tmp)
989
1004
  else:
990
- os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
991
1005
  shutil.copy2(src, tmp)
992
1006
  if os.path.exists(dst):
993
1007
  shutil.rmtree(dst, ignore_errors=True)
@@ -0,0 +1,9 @@
1
+ """CodeSpine sharding package.
2
+
3
+ Exposes the consistent-hash router and the ShardedGraphStore facade.
4
+ """
5
+
6
+ from codespine.sharding.router import ShardRouter
7
+ from codespine.sharding.store import ShardedGraphStore
8
+
9
+ __all__ = ["ShardRouter", "ShardedGraphStore"]
@@ -0,0 +1,123 @@
1
+ """Consistent-hash shard router for CodeSpine.
2
+
3
+ Design
4
+ ------
5
+ * ``num_shards`` physical shards — each shard owns an independent KùzuDB at
6
+ ``~/.codespine/shards/{N}/db``.
7
+ * Shard key = *root project name* (the part before ``::`` in a multi-module
8
+ project ID). This guarantees that all modules of the same project are
9
+ co-located in the same shard so that cross-module call resolution still
10
+ works in one graph traversal.
11
+ * Virtual-node ring (``VIRTUAL_NODES_PER_SHARD = 150``) gives an even
12
+ distribution even for small shard counts.
13
+ * ``num_shards`` can be changed at any time; affected projects must be
14
+ re-indexed, but unaffected projects continue to work.
15
+
16
+ Env var override
17
+ ----------------
18
+ ``CODESPINE_SHARDS=N`` (integer, default 4) sets the number of shards at
19
+ process start. 0 or 1 disables sharding (all projects land in shard 0).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import bisect
25
+ import hashlib
26
+ import os
27
+
28
VIRTUAL_NODES_PER_SHARD = 150  # virtual ring entries per physical shard


class ShardRouter:
    """Maps project IDs to shard indices via a consistent-hash ring.

    Parameters
    ----------
    num_shards:
        Number of physical shards.  Defaults to the ``CODESPINE_SHARDS``
        environment variable, or ``4`` if unset or not a plain integer.
        Values below 1 are clamped to 1 (sharding disabled).
    shards_dir:
        Base directory that holds per-shard sub-directories.  Defaults to
        ``~/.codespine/shards``.
    """

    def __init__(
        self,
        num_shards: int | None = None,
        shards_dir: str | None = None,
    ) -> None:
        env_value = os.environ.get("CODESPINE_SHARDS", "").strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits that make int() raise.
        default_shards = max(1, int(env_value)) if env_value.isdecimal() else 4
        requested = default_shards if num_shards is None else num_shards
        self.num_shards: int = max(1, requested)
        self.shards_dir: str = shards_dir or os.path.expanduser("~/.codespine/shards")

        # Virtual-node ring: (ring_point, shard_index) pairs sorted by ring_point.
        self._ring: list[tuple[int, int]] = sorted(
            (self._hash_key(f"shard-{shard_idx}-vn-{vn}"), shard_idx)
            for shard_idx in range(self.num_shards)
            for vn in range(VIRTUAL_NODES_PER_SHARD)
        )
        # Parallel list of ring points so bisect can search without a key func.
        self._ring_points = [point for point, _ in self._ring]

    # ------------------------------------------------------------------
    # Routing
    # ------------------------------------------------------------------

    @staticmethod
    def _hash_key(key: str) -> int:
        """Return a deterministic 64-bit hash of *key*."""
        # MD5 is used purely for ring placement, not for security; flagging it
        # as such keeps it usable on FIPS-restricted Python builds.
        raw = hashlib.md5(key.encode("utf-8"), usedforsecurity=False).digest()
        # Use first 8 bytes as unsigned 64-bit integer for wide ring range.
        return int.from_bytes(raw[:8], "big")

    def _root_key(self, project_id: str) -> str:
        """Extract the root portion of a project_id for co-location.

        For multi-module projects (format ``root::module``), all modules of
        the same root must land on the same shard so that cross-module graph
        traversals work without federation.  IDs without ``::`` are returned
        unchanged.
        """
        root, _, _ = project_id.partition("::")
        return root

    def shard_for(self, project_id: str) -> int:
        """Return the shard index [0, num_shards) for the given project_id."""
        if self.num_shards == 1:
            return 0
        point = self._hash_key(self._root_key(project_id))
        pos = bisect.bisect_left(self._ring_points, point)
        # A point past the last ring entry wraps around to the first one.
        _, shard_idx = self._ring[pos % len(self._ring)]
        return shard_idx

    def all_shards(self) -> list[int]:
        """Return all shard indices."""
        return list(range(self.num_shards))

    # ------------------------------------------------------------------
    # Path helpers
    # ------------------------------------------------------------------

    def shard_home(self, shard_index: int) -> str:
        """Directory that holds all data for a shard."""
        return os.path.join(self.shards_dir, str(shard_index))

    def db_path(self, shard_index: int) -> str:
        """Write-DB path for a shard (``<shard_home>/db``)."""
        return os.path.join(self.shard_home(shard_index), "db")

    def snapshot_path(self, shard_index: int) -> str:
        """Read-replica path for a shard (``<shard_home>/db_read``)."""
        return os.path.join(self.shard_home(shard_index), "db_read")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def describe(self) -> dict:
        """Return a human-readable summary of the routing table."""
        return {
            "num_shards": self.num_shards,
            "shards_dir": self.shards_dir,
            "virtual_nodes_per_shard": VIRTUAL_NODES_PER_SHARD,
            "ring_size": len(self._ring),
        }
@@ -0,0 +1,312 @@
1
+ """ShardedGraphStore — in-process shard coordinator.
2
+
3
+ Each project (or multi-module root) is consistently hashed to a shard index.
4
+ All modules of the same project share one shard so that cross-module graph
5
+ traversals see the full call graph without federation.
6
+
7
+ Design
8
+ ------
9
+ * ``ShardedGraphStore`` maintains a pool of ``GraphStore`` instances, one per
10
+ shard opened so far. Shards are opened lazily on first access.
11
+ * Existing callers that receive a plain ``GraphStore`` continue to work
12
+ unchanged. The new entry point is ``ShardedGraphStore.shard(project_id)``
13
+ which returns the ``GraphStore`` responsible for that project.
14
+ * Fan-out reads (``list_project_metadata``, global search) call
15
+ ``all_shards()`` to iterate every open shard.
16
+ * ``snapshot_all()`` triggers per-shard snapshots concurrently.
17
+
18
+ Migration from v0.9.x
19
+ ---------------------
20
+ If ``~/.codespine_db`` exists and the new shards directory doesn't, the
21
+ store automatically migrates the legacy DB to shard 0's path on first access
22
+ so existing indexed data isn't lost.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import logging
28
+ import os
29
+ import shutil
30
+ import threading
31
+ from typing import Any
32
+
33
+ from codespine.config import SETTINGS
34
+ from codespine.db.store import GraphStore
35
+ from codespine.sharding.router import ShardRouter
36
+
37
+ LOGGER = logging.getLogger(__name__)
38
+
39
+
40
class ShardedGraphStore:
    """Coordinates multiple per-shard ``GraphStore`` instances.

    Shards are opened lazily and cached in an internal pool guarded by a
    lock.  Project-scoped calls are routed to a single shard via the
    ``ShardRouter``; global reads fan out over every shard.

    Parameters
    ----------
    read_only:
        Passed through to each ``GraphStore``.
    num_shards:
        Override for the shard count.  Defaults to ``SETTINGS.num_shards``.
    shards_dir:
        Override for the shards base directory.
    """

    def __init__(
        self,
        read_only: bool = False,
        num_shards: int | None = None,
        shards_dir: str | None = None,
    ) -> None:
        self.read_only = read_only
        self.router = ShardRouter(
            num_shards=num_shards or SETTINGS.num_shards,
            shards_dir=shards_dir or SETTINGS.shards_dir,
        )
        self._pool: dict[int, GraphStore] = {}
        self._lock = threading.Lock()
        self._migrated = False

    # ------------------------------------------------------------------
    # Core routing
    # ------------------------------------------------------------------

    def shard(self, project_id: str) -> GraphStore:
        """Return (or open) the GraphStore for this project's shard.

        In write mode this always returns a valid store (creating the DB if
        needed).  In read-only mode, if the shard DB has never been written to,
        this also creates it (returning an empty-but-valid store) so that
        callers can safely query it without crashing.
        """
        return self._shard_or_create(self.router.shard_for(project_id))

    def _shard_or_create(self, idx: int) -> GraphStore:
        """Return shard *idx*, creating an empty DB if it does not exist yet."""
        store = self._get_shard(idx)
        if store is not None:
            return store
        # Only reachable in read-only mode when the shard has no DB on disk.
        # The store is opened writable once so that an empty DB gets created
        # and callers receive empty results instead of a crash.
        with self._lock:
            store = self._pool.get(idx)
            if store is None:
                store = self._open_store(idx, read_only=False)
                self._pool[idx] = store
            return store

    def _open_store(self, idx: int, *, read_only: bool) -> GraphStore:
        """Construct the GraphStore for shard *idx*.  Caller holds ``_lock``."""
        db_path = self.router.db_path(idx)
        # Ensure parent directory exists before Kuzu opens it.
        os.makedirs(os.path.dirname(db_path), exist_ok=True)
        return GraphStore(
            read_only=read_only,
            db_path_override=db_path,
            snapshot_path_override=self.router.snapshot_path(idx),
        )

    def _get_shard(self, idx: int) -> GraphStore | None:
        """Return the GraphStore for shard *idx*, or None if it doesn't exist
        yet and we're in read-only mode (nothing to read there)."""
        with self._lock:
            if idx not in self._pool:
                self._maybe_migrate(idx)
                db_path = self.router.db_path(idx)
                snap_path = self.router.snapshot_path(idx)

                # In read-only mode, skip shards whose DB hasn't been created
                # yet — Kuzu refuses to create an empty DB under read_only=True.
                if self.read_only and not os.path.exists(db_path) and not os.path.exists(snap_path):
                    return None

                self._pool[idx] = self._open_store(idx, read_only=self.read_only)
            return self._pool[idx]

    @staticmethod
    def _relocate(src: str, dst: str) -> None:
        """Copy *src* (file or directory) to *dst*, then delete *src*.

        The copy goes to a staging path first and is renamed into place, so a
        failed copy never leaves a partial *dst* behind.  Raises ``OSError``
        if the copy or rename fails; *src* is untouched in that case.
        """
        staging = dst + ".migrating"
        # Clear leftovers from an earlier interrupted attempt.
        if os.path.isdir(staging):
            shutil.rmtree(staging)
        elif os.path.exists(staging):
            os.remove(staging)

        src_is_dir = os.path.isdir(src)
        if src_is_dir:
            shutil.copytree(src, staging)
        else:
            shutil.copy2(src, staging)
        os.replace(staging, dst)

        # Delete the original only after the copy is safely in place.
        if src_is_dir:
            shutil.rmtree(src, ignore_errors=True)
        else:
            # rmtree cannot remove a plain file, so single-file DBs need remove().
            try:
                os.remove(src)
            except OSError as exc:
                LOGGER.warning("Could not remove legacy path %s: %s", src, exc)

    def _maybe_migrate(self, idx: int) -> None:
        """One-time migration: copy legacy ~/.codespine_db → shard 0 DB path.

        Guard: only triggers when the shards_dir matches the compiled-in
        default (SETTINGS.shards_dir).  Custom / test shards_dir values are
        never eligible for migration so that test isolation is preserved.
        Must be called with ``_lock`` held.
        """
        if self._migrated or idx != 0:
            return
        self._migrated = True

        # Safety guard: never auto-migrate when using a non-default shards dir.
        # This prevents test code that passes a temp dir from accidentally
        # touching production data.
        if os.path.realpath(self.router.shards_dir) != os.path.realpath(SETTINGS.shards_dir):
            return

        legacy = SETTINGS.db_path  # ~/.codespine_db
        target = self.router.db_path(0)
        if not os.path.exists(legacy):
            return
        if os.path.exists(target):
            # Sharded layout already initialised — don't overwrite.
            return
        LOGGER.info(
            "Migrating legacy DB %s → shard 0 path %s", legacy, target
        )
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            self._relocate(legacy, target)
            # Also migrate read replica if present.
            legacy_snap = SETTINGS.db_snapshot_path
            target_snap = self.router.snapshot_path(0)
            if os.path.exists(legacy_snap) and not os.path.exists(target_snap):
                self._relocate(legacy_snap, target_snap)
        except OSError as exc:
            LOGGER.warning("Migration from legacy path failed: %s — starting fresh", exc)

    def all_shards(self) -> list[GraphStore]:
        """Open and return all physical shards that exist (for fan-out reads).

        Non-existent shards are skipped in read-only mode to avoid Kuzu
        errors when opening empty paths under read_only=True.
        """
        candidates = (self._get_shard(i) for i in self.router.all_shards())
        return [store for store in candidates if store is not None]

    def open_shards(self) -> list[GraphStore]:
        """Return only shards that have already been opened (no I/O)."""
        with self._lock:
            return [s for s in self._pool.values() if s is not None]

    # ------------------------------------------------------------------
    # Delegated project-scoped operations
    # ------------------------------------------------------------------

    def upsert_project(self, project_id: str, path: str) -> None:
        self.shard(project_id).upsert_project(project_id, path)

    def clear_project(self, project_id: str) -> None:
        self.shard(project_id).clear_project(project_id)

    def get_project_metadata(self, project_id: str) -> dict[str, Any] | None:
        return self.shard(project_id).get_project_metadata(project_id)

    def set_project_overlay_dirty(self, project_id: str, dirty: bool) -> None:
        self.shard(project_id).set_project_overlay_dirty(project_id, dirty)

    def set_project_indexed_commit(self, project_id: str, commit: str) -> None:
        self.shard(project_id).set_project_indexed_commit(project_id, commit)

    def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
        return self.shard(project_id).project_file_hashes(project_id)

    def project_has_embeddings(self, project_id: str) -> bool:
        return self.shard(project_id).project_has_embeddings(project_id)

    def upsert_file_from_entry(self, entry: dict, project_path: str) -> None:
        # The entry carries its own project_id; a missing one routes as "".
        project_id = entry.get("project_id", "")
        self.shard(project_id).upsert_file_from_entry(entry, project_path)

    def clear_file_by_path(self, project_id: str, project_path: str, file_path: str) -> None:
        self.shard(project_id).clear_file_by_path(project_id, project_path, file_path)

    # ------------------------------------------------------------------
    # Fan-out global reads
    # ------------------------------------------------------------------

    def list_project_metadata(self) -> list[dict[str, Any]]:
        """Aggregate project list across all shards.

        Records without an ``id`` are dropped, duplicate IDs keep the first
        record seen, and the result is sorted by ``id``.
        """
        results: list[dict[str, Any]] = []
        seen: set[str] = set()
        for store in self.all_shards():
            for rec in store.list_project_metadata():
                pid = rec.get("id", "")
                if pid and pid not in seen:
                    seen.add(pid)
                    results.append(rec)
        results.sort(key=lambda r: r.get("id", ""))
        return results

    def query_records(
        self,
        query: str,
        params: dict[str, Any] | None = None,
        *,
        project_id: str | None = None,
    ) -> list[dict[str, Any]]:
        """Execute a Cypher query.

        If ``project_id`` is given, the query runs only on that project's
        shard (fast path).  Otherwise the query fans out to all shards and
        results are concatenated.

        Note: fan-out only makes sense for queries whose results are
        independent per shard (e.g. listing nodes).  Queries that aggregate
        across shards (e.g. global COUNT) will return per-shard subtotals.
        """
        if project_id:
            return self.shard(project_id).query_records(query, params)
        merged: list[dict[str, Any]] = []
        for store in self.all_shards():
            merged.extend(store.query_records(query, params))
        return merged

    # ------------------------------------------------------------------
    # Snapshot all shards
    # ------------------------------------------------------------------

    def snapshot_all(self, background: bool = False) -> None:
        """Snapshot every open shard.

        ``background`` is forwarded unchanged to each shard's
        ``snapshot_to_read_replica``.
        """
        for store in self.open_shards():
            store.snapshot_to_read_replica(background=background)

    # ------------------------------------------------------------------
    # Global reset / status
    # ------------------------------------------------------------------

    def force_delete_all_data(self) -> list[str]:
        """Delete all shards' data files.  Equivalent to force_delete per shard."""
        removed: list[str] = []
        for store in self.all_shards():
            removed.extend(store.force_delete_all_data())
        return removed

    def describe(self) -> dict:
        """Return a human-readable description of the shard topology."""
        # Snapshot the pool keys under the lock so a concurrent open cannot
        # mutate the dict while it is being read.
        with self._lock:
            opened = set(self._pool)
        shard_info = []
        for idx in self.router.all_shards():
            db_path = self.router.db_path(idx)
            shard_info.append({
                "index": idx,
                "db_path": db_path,
                "exists": os.path.exists(db_path),
                "open": idx in opened,
            })
        return {
            **self.router.describe(),
            "shards": shard_info,
        }

    # ------------------------------------------------------------------
    # Backward-compat helpers — delegate to shard 0 for single-project use
    # ------------------------------------------------------------------

    @property
    def overlay_store(self):
        """Expose the overlay store from shard 0 for backward compat."""
        # _get_shard(0) returns None in read-only mode when shard 0 has no DB
        # yet; go through the creating path so this never dereferences None.
        return self._shard_or_create(0).overlay_store

    @staticmethod
    def stable_id(*parts: str) -> str:
        """Stable SHA1-based identifier (shard-independent)."""
        return GraphStore.stable_id(*parts)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.5
3
+ Version: 0.9.6
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -45,6 +45,9 @@ codespine/search/fuzzy.py
45
45
  codespine/search/hybrid.py
46
46
  codespine/search/rrf.py
47
47
  codespine/search/vector.py
48
+ codespine/sharding/__init__.py
49
+ codespine/sharding/router.py
50
+ codespine/sharding/store.py
48
51
  codespine/watch/__init__.py
49
52
  codespine/watch/git_hook.py
50
53
  codespine/watch/watcher.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codespine"
7
- version = "0.9.5"
7
+ version = "0.9.6"
8
8
  description = "Local Java code intelligence indexer backed by a graph database"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
File without changes
File without changes
File without changes
File without changes
File without changes