codespine 0.9.5__tar.gz → 0.9.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {codespine-0.9.5 → codespine-0.9.7}/PKG-INFO +1 -1
  2. {codespine-0.9.5 → codespine-0.9.7}/codespine/__init__.py +1 -1
  3. {codespine-0.9.5 → codespine-0.9.7}/codespine/cli.py +306 -116
  4. {codespine-0.9.5 → codespine-0.9.7}/codespine/config.py +9 -0
  5. {codespine-0.9.5 → codespine-0.9.7}/codespine/db/store.py +126 -70
  6. codespine-0.9.7/codespine/sharding/__init__.py +9 -0
  7. codespine-0.9.7/codespine/sharding/router.py +123 -0
  8. codespine-0.9.7/codespine/sharding/store.py +312 -0
  9. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/PKG-INFO +1 -1
  10. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/SOURCES.txt +4 -0
  11. {codespine-0.9.5 → codespine-0.9.7}/pyproject.toml +1 -1
  12. codespine-0.9.7/tests/test_sharding.py +200 -0
  13. {codespine-0.9.5 → codespine-0.9.7}/LICENSE +0 -0
  14. {codespine-0.9.5 → codespine-0.9.7}/README.md +0 -0
  15. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/__init__.py +0 -0
  16. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/community.py +0 -0
  17. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/context.py +0 -0
  18. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/coupling.py +0 -0
  19. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/crossmodule.py +0 -0
  20. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/deadcode.py +0 -0
  21. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/flow.py +0 -0
  22. {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/impact.py +0 -0
  23. {codespine-0.9.5 → codespine-0.9.7}/codespine/db/__init__.py +0 -0
  24. {codespine-0.9.5 → codespine-0.9.7}/codespine/db/schema.py +0 -0
  25. {codespine-0.9.5 → codespine-0.9.7}/codespine/diff/__init__.py +0 -0
  26. {codespine-0.9.5 → codespine-0.9.7}/codespine/diff/branch_diff.py +0 -0
  27. {codespine-0.9.5 → codespine-0.9.7}/codespine/guide.py +0 -0
  28. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/__init__.py +0 -0
  29. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/call_resolver.py +0 -0
  30. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/di_resolver.py +0 -0
  31. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/engine.py +0 -0
  32. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/java_parser.py +0 -0
  33. {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/symbol_builder.py +0 -0
  34. {codespine-0.9.5 → codespine-0.9.7}/codespine/mcp/__init__.py +0 -0
  35. {codespine-0.9.5 → codespine-0.9.7}/codespine/mcp/server.py +0 -0
  36. {codespine-0.9.5 → codespine-0.9.7}/codespine/noise/__init__.py +0 -0
  37. {codespine-0.9.5 → codespine-0.9.7}/codespine/noise/blocklist.py +0 -0
  38. {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/__init__.py +0 -0
  39. {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/git_state.py +0 -0
  40. {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/merge.py +0 -0
  41. {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/store.py +0 -0
  42. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/__init__.py +0 -0
  43. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/bm25.py +0 -0
  44. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/fuzzy.py +0 -0
  45. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/hybrid.py +0 -0
  46. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/rrf.py +0 -0
  47. {codespine-0.9.5 → codespine-0.9.7}/codespine/search/vector.py +0 -0
  48. {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/__init__.py +0 -0
  49. {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/git_hook.py +0 -0
  50. {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/watcher.py +0 -0
  51. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/dependency_links.txt +0 -0
  52. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/entry_points.txt +0 -0
  53. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/requires.txt +0 -0
  54. {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/top_level.txt +0 -0
  55. {codespine-0.9.5 → codespine-0.9.7}/gindex.py +0 -0
  56. {codespine-0.9.5 → codespine-0.9.7}/setup.cfg +0 -0
  57. {codespine-0.9.5 → codespine-0.9.7}/tests/test_branch_diff_normalize.py +0 -0
  58. {codespine-0.9.5 → codespine-0.9.7}/tests/test_call_resolver.py +0 -0
  59. {codespine-0.9.5 → codespine-0.9.7}/tests/test_community_detection.py +0 -0
  60. {codespine-0.9.5 → codespine-0.9.7}/tests/test_deadcode.py +0 -0
  61. {codespine-0.9.5 → codespine-0.9.7}/tests/test_index_and_hybrid.py +0 -0
  62. {codespine-0.9.5 → codespine-0.9.7}/tests/test_java_parser.py +0 -0
  63. {codespine-0.9.5 → codespine-0.9.7}/tests/test_multimodule_index.py +0 -0
  64. {codespine-0.9.5 → codespine-0.9.7}/tests/test_overlay.py +0 -0
  65. {codespine-0.9.5 → codespine-0.9.7}/tests/test_search_ranking.py +0 -0
  66. {codespine-0.9.5 → codespine-0.9.7}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.5
3
+ Version: 0.9.7
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.9.5"
4
+ __version__ = "0.9.7"
@@ -6,7 +6,10 @@ import os
6
6
  import signal
7
7
  import subprocess
8
8
  import sys
9
+ import threading
9
10
  import time
11
+ from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
13
 
11
14
  import click
12
15
  import psutil
@@ -20,6 +23,7 @@ from codespine.analysis.flow import trace_execution_flows
20
23
  from codespine.analysis.impact import analyze_impact
21
24
  from codespine.config import SETTINGS
22
25
  from codespine.db.store import GraphStore
26
+ from codespine.sharding import ShardedGraphStore, ShardRouter
23
27
  from codespine.diff.branch_diff import compare_branches
24
28
  from codespine.indexer.engine import JavaIndexer
25
29
  from codespine.mcp.server import build_mcp_server
@@ -90,6 +94,197 @@ def _spinner_char() -> str:
90
94
  return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
91
95
 
92
96
 
97
+ def _index_shard_group(
98
+ shard_idx: int,
99
+ modules: list[tuple[str, str]],
100
+ sg,
101
+ full: bool,
102
+ embed: bool,
103
+ output_lock: threading.Lock,
104
+ parallel: bool,
105
+ ) -> tuple[int, list, int]:
106
+ """Index one group of modules that share a shard.
107
+
108
+ Modules within the group are always indexed sequentially (same KùzuDB).
109
+ Multiple groups can run concurrently in different threads when they own
110
+ different shards.
111
+
112
+ Returns (shard_idx, all_results, total_files_found).
113
+ """
114
+ results = []
115
+ total_files = 0
116
+
117
+ def _locked_echo(*args, **kwargs) -> None:
118
+ """Thread-safe click.echo."""
119
+ with output_lock:
120
+ click.echo(*args, **kwargs)
121
+
122
+ def _locked_secho(*args, **kwargs) -> None:
123
+ with output_lock:
124
+ click.secho(*args, **kwargs)
125
+
126
+ prefix = f"[S{shard_idx}] " if parallel else ""
127
+
128
+ for mod_path, project_id in modules:
129
+ # Per-module progress state (local — no shared mutation).
130
+ parse_state: dict = {"shown": False, "indexed": 0, "total": 0,
131
+ "last_ts": 0.0, "printed_zero": False}
132
+ call_state: dict = {"shown": False, "count": 0, "last_ts": 0.0,
133
+ "started_at": 0.0}
134
+
135
+ def _progress(event: str, payload: dict) -> None:
136
+ now = time.perf_counter()
137
+ if event == "scan_done":
138
+ with output_lock:
139
+ _phase(f"{prefix}Walking files...", f"{int(payload.get('files_found', 0))} files found")
140
+ return
141
+ if event == "plan_done":
142
+ to_index = int(payload.get("files_to_index", 0))
143
+ deleted = int(payload.get("deleted_files", 0))
144
+ mode = str(payload.get("mode", "incremental"))
145
+ parse_state["total"] = to_index
146
+ with output_lock:
147
+ _phase(f"{prefix}Index mode...", f"{mode} ({to_index} files, {deleted} deleted)")
148
+ if to_index == 0:
149
+ with output_lock:
150
+ _phase(f"{prefix}Parsing code...", "0/0")
151
+ parse_state["printed_zero"] = True
152
+ return
153
+ if event == "parse_progress":
154
+ indexed = int(payload.get("indexed", 0))
155
+ total = int(payload.get("total", 0))
156
+ parse_state["indexed"] = indexed
157
+ parse_state["total"] = total
158
+ if total == 0:
159
+ return
160
+ if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
161
+ if not parallel:
162
+ # In-place progress bar only makes sense in serial mode.
163
+ click.echo(
164
+ f"\r{prefix}Parsing code... {_bar(indexed, total)} {indexed}/{total} ",
165
+ nl=False,
166
+ )
167
+ else:
168
+ with output_lock:
169
+ click.echo(
170
+ f"\r{prefix}Parsing {indexed}/{total} ",
171
+ nl=False,
172
+ )
173
+ parse_state["shown"] = True
174
+ parse_state["last_ts"] = now
175
+ return
176
+ if event in ("resolve_calls_start",):
177
+ if parse_state["shown"]:
178
+ with output_lock:
179
+ click.echo()
180
+ parse_state["shown"] = False
181
+ call_state["started_at"] = now
182
+ with output_lock:
183
+ _phase(f"{prefix}Tracing calls...", "starting...")
184
+ return
185
+ if event == "resolve_calls_progress":
186
+ call_state["count"] = int(payload.get("calls_resolved", 0))
187
+ if (now - call_state["last_ts"]) >= 0.25:
188
+ elapsed_s = now - call_state["started_at"]
189
+ if not parallel:
190
+ click.echo(
191
+ f"\r{_spinner_char()} {prefix}Tracing calls... "
192
+ f"{call_state['count']:>6} resolved {elapsed_s:.1f}s ",
193
+ nl=False,
194
+ )
195
+ else:
196
+ with output_lock:
197
+ click.echo(
198
+ f"\r{prefix}Calls: {call_state['count']} ({elapsed_s:.0f}s) ",
199
+ nl=False,
200
+ )
201
+ call_state["shown"] = True
202
+ call_state["last_ts"] = now
203
+ return
204
+ if event == "resolve_calls_done":
205
+ if call_state["shown"]:
206
+ with output_lock:
207
+ click.echo()
208
+ call_state["shown"] = False
209
+ elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
210
+ n = int(payload.get("calls_resolved", 0))
211
+ with output_lock:
212
+ _phase(f"{prefix}Tracing calls...", f"{n} calls resolved ({elapsed_s:.1f}s)")
213
+ return
214
+ if event == "resolve_types_start":
215
+ with output_lock:
216
+ _phase(f"{prefix}Analyzing types...", "running")
217
+ return
218
+ if event == "resolve_types_done":
219
+ n = int(payload.get("type_relationships", 0))
220
+ with output_lock:
221
+ _phase(f"{prefix}Analyzing types...", f"{n} type relationships")
222
+ return
223
+
224
+ shard_store = sg.shard(project_id)
225
+ indexer = JavaIndexer(shard_store)
226
+ result = indexer.index_project(
227
+ mod_path, full=full, progress=_progress, project_id=project_id, embed=embed
228
+ )
229
+ results.append(result)
230
+ total_files += result.files_found
231
+
232
+ # Flush any dangling progress line.
233
+ if parse_state["shown"]:
234
+ with output_lock:
235
+ click.echo()
236
+
237
+ return shard_idx, results, total_files
238
+
239
+
240
+ def _show_shard_topology(as_json: bool) -> None:
241
+ """Display the current shard routing topology and imbalance metrics."""
242
+ router = ShardRouter()
243
+ sg = ShardedGraphStore(read_only=True)
244
+ topology = sg.describe()
245
+
246
+ # Gather project → shard mapping from all shards.
247
+ shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
248
+ for p in sg.list_project_metadata():
249
+ pid = p.get("id", "")
250
+ idx = router.shard_for(pid)
251
+ shard_project_counts[idx].append(pid)
252
+
253
+ counts = [len(v) for v in shard_project_counts.values()]
254
+ total = sum(counts)
255
+ median = sorted(counts)[len(counts) // 2] if counts else 0
256
+ max_count = max(counts) if counts else 0
257
+ imbalance = (max_count / median) if median else 1.0
258
+
259
+ if as_json:
260
+ _echo_json({
261
+ "topology": topology,
262
+ "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
263
+ "imbalance_ratio": round(imbalance, 2),
264
+ }, as_json=True)
265
+ return
266
+
267
+ click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
268
+ click.echo(f" Directory : {router.shards_dir}")
269
+ click.echo(f" Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {150})")
270
+ click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
271
+ click.echo()
272
+ header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
273
+ click.secho(header, fg="cyan")
274
+ click.echo("-" * 60)
275
+ for i, info in enumerate(topology.get("shards", [])):
276
+ plist = shard_project_counts.get(i, [])
277
+ exists_str = "yes" if info.get("exists") else "no"
278
+ click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
279
+ for pid in plist:
280
+ click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
281
+ if imbalance > 2.0:
282
+ click.secho(
283
+ f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
284
+ fg="yellow",
285
+ )
286
+
287
+
93
288
  @click.group()
94
289
  def main() -> None:
95
290
  """CodeSpine CLI."""
@@ -130,8 +325,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
130
325
  fg="yellow",
131
326
  )
132
327
 
133
- store = GraphStore(read_only=False)
134
- indexer = JavaIndexer(store)
328
+ # ShardedGraphStore routes each project to its dedicated DB shard.
329
+ # For single-project analysis this is transparent — shard() always
330
+ # returns a GraphStore pointing to the correct shard path.
331
+ sg = ShardedGraphStore(read_only=False)
332
+ # The indexer is initialised per-module below with the right shard store.
333
+ # We keep a single ShardedGraphStore to fan-out cross-module linking later.
135
334
 
136
335
  # --- Workspace → project → module detection ---
137
336
  # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -164,96 +363,69 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
164
363
 
165
364
  root_basename = os.path.basename(abs_path)
166
365
 
167
- # Shared progress state (reset per module)
168
- parse_state = {"shown": False, "indexed": 0, "total": 0, "last_ts": 0.0, "printed_zero": False}
169
- call_state = {"shown": False, "count": 0, "last_ts": 0.0, "started_at": 0.0}
170
-
171
- def _reset_state() -> None:
172
- for k in list(parse_state):
173
- parse_state[k] = False if isinstance(parse_state[k], bool) else (0.0 if isinstance(parse_state[k], float) else 0)
174
- parse_state["last_ts"] = 0.0
175
-
176
- def _progress(event: str, payload: dict) -> None:
177
- now = time.perf_counter()
178
- if event == "scan_done":
179
- _phase("Walking files...", f"{int(payload.get('files_found', 0))} files found")
180
- return
181
- if event == "plan_done":
182
- to_index = int(payload.get("files_to_index", 0))
183
- deleted = int(payload.get("deleted_files", 0))
184
- mode = str(payload.get("mode", "incremental"))
185
- parse_state["total"] = to_index
186
- _phase("Index mode...", f"{mode} ({to_index} files to index, {deleted} deleted)")
187
- if to_index == 0:
188
- _phase("Parsing code...", "0/0")
189
- parse_state["printed_zero"] = True
190
- return
191
- if event == "parse_progress":
192
- indexed = int(payload.get("indexed", 0))
193
- total = int(payload.get("total", 0))
194
- parse_state["indexed"] = indexed
195
- parse_state["total"] = total
196
- if total == 0:
197
- return
198
- if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
199
- click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
200
- parse_state["shown"] = True
201
- parse_state["last_ts"] = now
202
- return
203
- if event == "resolve_calls_start" and parse_state["shown"]:
204
- click.echo()
205
- parse_state["shown"] = False
206
- call_state["started_at"] = now
207
- _phase("Tracing calls...", "starting...")
208
- return
209
- if event == "resolve_calls_start":
210
- call_state["started_at"] = now
211
- _phase("Tracing calls...", "starting...")
212
- return
213
- if event == "resolve_calls_progress":
214
- call_state["count"] = int(payload.get("calls_resolved", 0))
215
- if (now - call_state["last_ts"]) >= 0.25:
216
- elapsed_s = now - call_state["started_at"]
217
- click.echo(
218
- f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
219
- nl=False,
220
- )
221
- call_state["shown"] = True
222
- call_state["last_ts"] = now
223
- return
224
- if event == "resolve_calls_done":
225
- if call_state["shown"]:
226
- click.echo()
227
- call_state["shown"] = False
228
- elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
229
- _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
230
- return
231
- if event == "resolve_types_start":
232
- _phase("Analyzing types...", "running")
233
- return
234
- if event == "resolve_types_done":
235
- _phase("Analyzing types...", f"{int(payload.get('type_relationships', 0))} type relationships")
236
- return
237
-
238
- # --- Index each module ---
366
+ # ── Group modules by target shard ─────────────────────────────────
367
+ # Modules that hash to different shards own separate KùzuDBs and can
368
+ # be indexed in parallel. Modules in the same shard (same project
369
+ # root for multi-module projects) are always indexed sequentially.
370
+ shard_groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
371
+ for mod_path, pid in modules_with_ids:
372
+ shard_groups[sg.router.shard_for(pid)].append((mod_path, pid))
373
+
239
374
  is_multi = len(modules_with_ids) > 1
375
+ parallel_mode = len(shard_groups) > 1 # ≥2 shards → true parallelism
376
+ output_lock = threading.Lock()
377
+
378
+ if parallel_mode:
379
+ click.secho(
380
+ f"Parallel mode: {len(shard_groups)} shards will be indexed concurrently.",
381
+ fg="cyan",
382
+ )
383
+
384
+ # Print which shard each module lands on (multi-module only).
385
+ if is_multi:
386
+ for s_idx, group in sorted(shard_groups.items()):
387
+ for _, pid in group:
388
+ click.secho(f" {pid:<40} → shard {s_idx}", fg="cyan")
389
+
390
+ # ── Dispatch to shards ────────────────────────────────────────────
240
391
  total_files_found = 0
392
+ all_results: list = []
241
393
  last_result = None
242
- for idx, (module_path, project_id) in enumerate(modules_with_ids):
243
- if is_multi:
244
- click.echo()
245
- click.secho(f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}", fg="cyan")
246
- _reset_state()
247
- last_result = indexer.index_project(
248
- module_path, full=full, progress=_progress, project_id=project_id, embed=embed
394
+
395
+ if parallel_mode:
396
+ max_workers = min(len(shard_groups), 4)
397
+ click.echo()
398
+ futures_map = {}
399
+ with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="codespine-shard") as ex:
400
+ for s_idx, group in shard_groups.items():
401
+ f = ex.submit(
402
+ _index_shard_group,
403
+ s_idx, group, sg, full, embed, output_lock, True,
404
+ )
405
+ futures_map[f] = s_idx
406
+
407
+ for future in as_completed(futures_map):
408
+ s_idx = futures_map[future]
409
+ try:
410
+ ret_idx, results, n_files = future.result()
411
+ all_results.extend(results)
412
+ total_files_found += n_files
413
+ if results:
414
+ last_result = results[-1]
415
+ with output_lock:
416
+ click.secho(f" Shard {ret_idx} done ({n_files} files)", fg="green")
417
+ except Exception as exc: # noqa: BLE001
418
+ with output_lock:
419
+ click.secho(f" Shard {s_idx} FAILED: {exc}", fg="red")
420
+ else:
421
+ # Serial path — single shard (or single module). Full progress UX.
422
+ only_shard_idx = next(iter(shard_groups))
423
+ only_group = shard_groups[only_shard_idx]
424
+ _, all_results, total_files_found = _index_shard_group(
425
+ only_shard_idx, only_group, sg, full, embed, output_lock, False,
249
426
  )
250
- total_files_found += last_result.files_found
251
- if parse_state["shown"]:
252
- click.echo()
253
- if parse_state["total"] == 0 and not parse_state["printed_zero"]:
254
- _phase("Parsing code...", "0/0")
255
- elif parse_state["indexed"] < parse_state["total"]:
256
- _phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
427
+ if all_results:
428
+ last_result = all_results[-1]
257
429
 
258
430
  # ── Helper for in-place progress updates ────────────────────────────
259
431
  def _live_phase(label: str, status: str) -> None:
@@ -264,13 +436,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
264
436
  """Finalise an in-place phase line and move to the next line."""
265
437
  click.echo(f"\r✓ {label:<28} {result:<48}")
266
438
 
439
+ # For cross-module operations (cross-module linking, deep analysis, stats)
440
+ # we use the shard store of the last-indexed module's project (falling back to the root basename); NOTE: this assumes all modules share one shard.
441
+ root_project_id = last_result.project_id if last_result else root_basename
442
+ root_shard_store = sg.shard(root_project_id)
443
+
267
444
  # ── Cross-module call linking ──────────────────────────────────────
268
445
  if is_multi and len(modules_with_ids) > 1:
269
446
  xmod_label = "Cross-module linking..."
270
447
  _live_phase(xmod_label, "running")
271
448
  xmod_pids = [pid for _, pid in modules_with_ids]
272
449
  xmod_edges = link_cross_module_calls(
273
- store, project_ids=xmod_pids,
450
+ root_shard_store, project_ids=xmod_pids,
274
451
  progress=lambda s: _live_phase(xmod_label, s),
275
452
  )
276
453
  _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
@@ -287,7 +464,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
287
464
  comm_label = "Detecting communities..."
288
465
  _live_phase(comm_label, "running")
289
466
  communities = detect_communities(
290
- store,
467
+ root_shard_store,
291
468
  progress=lambda s: _live_phase(comm_label, s),
292
469
  )
293
470
  _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +472,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
295
472
  flow_label = "Detecting execution flows..."
296
473
  _live_phase(flow_label, "running")
297
474
  flows = trace_execution_flows(
298
- store,
475
+ root_shard_store,
299
476
  progress=lambda s: _live_phase(flow_label, s),
300
477
  )
301
478
  _finish_phase(flow_label, f"{len(flows)} processes found")
302
479
 
303
480
  dead_label = "Finding dead code..."
304
481
  _live_phase(dead_label, "running")
305
- dead = detect_dead_code(store, limit=500)
482
+ dead = detect_dead_code(root_shard_store, limit=500)
306
483
  _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
307
484
 
308
485
  coup_label = "Analyzing git history..."
309
486
  _live_phase(coup_label, "running")
310
- store.clear_coupling()
487
+ root_shard_store.clear_coupling()
311
488
  coupling_root = abs_path
312
489
  coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
313
490
  coupling_pairs = compute_coupling(
314
- store,
491
+ root_shard_store,
315
492
  coupling_root,
316
493
  coupling_project,
317
494
  days=SETTINGS.default_coupling_days,
@@ -329,7 +506,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
329
506
  flow_label = "Detecting execution flows..."
330
507
  _live_phase(flow_label, "running (lightweight)")
331
508
  try:
332
- flows = trace_execution_flows(store, max_depth=3)
509
+ flows = trace_execution_flows(root_shard_store, max_depth=3)
333
510
  except Exception:
334
511
  flows = []
335
512
  _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
@@ -337,14 +514,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
337
514
  dead_label = "Finding dead code..."
338
515
  _live_phase(dead_label, "running (lightweight)")
339
516
  try:
340
- dead = detect_dead_code(store, limit=100)
517
+ dead = detect_dead_code(root_shard_store, limit=100)
341
518
  except Exception:
342
519
  dead = []
343
520
  _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
344
521
 
345
522
  _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
346
523
 
347
- vector_count = store.query_records(
524
+ vector_count = root_shard_store.query_records(
348
525
  """
349
526
  MATCH (s:Symbol)
350
527
  WHERE s.embedding IS NOT NULL
@@ -355,8 +532,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
355
532
  vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
356
533
  _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
357
534
 
358
- symbol_count = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
359
- edge_count = store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
535
+ symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
536
+ edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
360
537
  symbols = int(symbol_count[0]["count"]) if symbol_count else 0
361
538
  edges = int(edge_count[0]["count"]) if edge_count else 0
362
539
  elapsed = time.perf_counter() - started
@@ -376,7 +553,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
376
553
 
377
554
  # Detect unresolved imports → hint about unindexed sibling projects
378
555
  try:
379
- unresolved = JavaIndexer.detect_unresolved_imports(store)
556
+ unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
380
557
  if unresolved:
381
558
  click.echo()
382
559
  click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +564,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
387
564
 
388
565
  # Publish a read replica so MCP and read-only CLI commands (search, stats…)
389
566
  # run against an isolated snapshot rather than competing with the write
390
- # process's buffer pool. The MCP daemon detects the sentinel file and
391
- # hot-reloads without restarting.
567
+ # process's buffer pool. Snapshot all open shards concurrently.
392
568
  snap_label = "Publishing read replica..."
393
569
  _live_phase(snap_label, "copying")
394
- store._recycle_conn()
395
- snapped = GraphStore.snapshot_to_read_replica()
396
- _finish_phase(snap_label, "MCP will reload automatically" if snapped else "skipped (source DB not found)")
570
+ root_shard_store._recycle_conn()
571
+ sg.snapshot_all(background=False)
572
+ _finish_phase(snap_label, "MCP will reload automatically")
397
573
 
398
574
 
399
575
  @main.command()
@@ -523,10 +699,21 @@ def diff(range_spec: str, as_json: bool) -> None:
523
699
 
524
700
  @main.command()
525
701
  @click.option("--json", "as_json", is_flag=True)
526
- def stats(as_json: bool) -> None:
702
+ @click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
703
+ def stats(as_json: bool, show_shards: bool) -> None:
527
704
  """Show per-project and aggregate graph statistics."""
528
- store = GraphStore(read_only=True)
529
- projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path ORDER BY p.id")
705
+ if show_shards:
706
+ _show_shard_topology(as_json)
707
+ return
708
+
709
+ # Fan-out across all shards so stats covers every project in the cluster.
710
+ sg = ShardedGraphStore(read_only=True)
711
+ all_projects_meta = sg.list_project_metadata()
712
+
713
+ # For detailed stats we need the per-project shard store.
714
+ def _project_store(pid: str):
715
+ return sg.shard(pid)
716
+
530
717
  if not projects:
531
718
  click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
532
719
  return
@@ -534,10 +721,12 @@ def stats(as_json: bool) -> None:
534
721
  rows = []
535
722
  for p in projects:
536
723
  pid = p["id"]
537
- files = store.query_records(
724
+ # Route each query to the project's owning shard.
725
+ ps = _project_store(pid)
726
+ files = ps.query_records(
538
727
  "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
539
728
  )
540
- classes = store.query_records(
729
+ classes = ps.query_records(
541
730
  """
542
731
  MATCH (f:File) WHERE f.project_id = $pid
543
732
  WITH f
@@ -546,7 +735,7 @@ def stats(as_json: bool) -> None:
546
735
  """,
547
736
  {"pid": pid},
548
737
  )
549
- methods = store.query_records(
738
+ methods = ps.query_records(
550
739
  """
551
740
  MATCH (f:File) WHERE f.project_id = $pid
552
741
  WITH f
@@ -557,7 +746,7 @@ def stats(as_json: bool) -> None:
557
746
  """,
558
747
  {"pid": pid},
559
748
  )
560
- calls = store.query_records(
749
+ calls = ps.query_records(
561
750
  """
562
751
  MATCH (f:File) WHERE f.project_id = $pid
563
752
  WITH f
@@ -568,7 +757,7 @@ def stats(as_json: bool) -> None:
568
757
  """,
569
758
  {"pid": pid},
570
759
  )
571
- emb = store.query_records(
760
+ emb = ps.query_records(
572
761
  """
573
762
  MATCH (f:File) WHERE f.project_id = $pid
574
763
  WITH f
@@ -580,6 +769,7 @@ def stats(as_json: bool) -> None:
580
769
  rows.append({
581
770
  "project": pid,
582
771
  "path": p["path"],
772
+ "shard": sg.router.shard_for(pid),
583
773
  "files": files[0]["n"] if files else 0,
584
774
  "classes": classes[0]["n"] if classes else 0,
585
775
  "methods": methods[0]["n"] if methods else 0,
@@ -592,13 +782,13 @@ def stats(as_json: bool) -> None:
592
782
  return
593
783
 
594
784
  col_w = max(len(r["project"]) for r in rows)
595
- header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
785
+ header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
596
786
  click.secho(header, fg="cyan")
597
787
  click.echo("-" * len(header))
598
788
  total_files = total_classes = total_methods = total_calls = total_emb = 0
599
789
  for r in rows:
600
790
  click.echo(
601
- f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
791
+ f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
602
792
  )
603
793
  total_files += r["files"]
604
794
  total_classes += r["classes"]
@@ -608,7 +798,7 @@ def stats(as_json: bool) -> None:
608
798
  if len(rows) > 1:
609
799
  click.echo("-" * len(header))
610
800
  click.secho(
611
- f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
801
+ f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
612
802
  fg="green",
613
803
  )
614
804
 
@@ -4,8 +4,17 @@ from dataclasses import dataclass
4
4
 
5
5
  @dataclass(frozen=True)
6
6
  class Settings:
7
+ # Legacy single-DB paths — kept for backward compat and as defaults when
8
+ # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
7
9
  db_path: str = os.path.expanduser("~/.codespine_db")
8
10
  db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
11
+
12
+ # Sharding — new layout stores each shard under shards_dir/{N}/db
13
+ # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
14
+ # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
15
+ num_shards: int = 4
16
+ shards_dir: str = os.path.expanduser("~/.codespine/shards")
17
+
9
18
  pid_file: str = os.path.expanduser("~/.codespine.pid")
10
19
  log_file: str = os.path.expanduser("~/.codespine.log")
11
20
  embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")