codespine 0.9.5__tar.gz → 0.9.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.9.5 → codespine-0.9.6}/PKG-INFO +1 -1
- {codespine-0.9.5 → codespine-0.9.6}/codespine/__init__.py +1 -1
- {codespine-0.9.5 → codespine-0.9.6}/codespine/cli.py +109 -31
- {codespine-0.9.5 → codespine-0.9.6}/codespine/config.py +9 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/db/store.py +55 -41
- codespine-0.9.6/codespine/sharding/__init__.py +9 -0
- codespine-0.9.6/codespine/sharding/router.py +123 -0
- codespine-0.9.6/codespine/sharding/store.py +312 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/SOURCES.txt +3 -0
- {codespine-0.9.5 → codespine-0.9.6}/pyproject.toml +1 -1
- {codespine-0.9.5 → codespine-0.9.6}/LICENSE +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/README.md +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/community.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/context.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/coupling.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/flow.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/impact.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/db/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/db/schema.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/diff/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/guide.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/di_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/engine.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/server.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/noise/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/noise/blocklist.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/git_state.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/merge.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/store.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/bm25.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/fuzzy.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/hybrid.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/rrf.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/search/vector.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/git_hook.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine/watch/watcher.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/gindex.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/setup.cfg +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_call_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_community_detection.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_deadcode.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_java_parser.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_multimodule_index.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_overlay.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_search_ranking.py +0 -0
- {codespine-0.9.5 → codespine-0.9.6}/tests/test_store_recovery.py +0 -0
|
@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
|
|
|
20
20
|
from codespine.analysis.impact import analyze_impact
|
|
21
21
|
from codespine.config import SETTINGS
|
|
22
22
|
from codespine.db.store import GraphStore
|
|
23
|
+
from codespine.sharding import ShardedGraphStore, ShardRouter
|
|
23
24
|
from codespine.diff.branch_diff import compare_branches
|
|
24
25
|
from codespine.indexer.engine import JavaIndexer
|
|
25
26
|
from codespine.mcp.server import build_mcp_server
|
|
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
|
|
|
90
91
|
return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
|
|
91
92
|
|
|
92
93
|
|
|
94
|
+
def _show_shard_topology(as_json: bool) -> None:
    """Display the current shard routing topology and imbalance metrics.

    Parameters
    ----------
    as_json:
        When true, emit the topology, the per-shard project ID lists and the
        rounded imbalance ratio as a single JSON document and return without
        printing the text table.
    """
    sg = ShardedGraphStore(read_only=True)
    # Use the store's own router (the same object the analyse/stats commands
    # consult) so the reported placement cannot drift from the routing the
    # store actually performs.
    router = sg.router
    topology = sg.describe()

    # Gather project → shard mapping from all shards.
    projects_by_shard: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
    for meta in sg.list_project_metadata():
        pid = meta.get("id", "")
        projects_by_shard[router.shard_for(pid)].append(pid)

    counts = sorted(len(pids) for pids in projects_by_shard.values())
    total = sum(counts)
    # Upper median: for an even number of shards the higher middle value is used.
    median = counts[len(counts) // 2] if counts else 0
    max_count = counts[-1] if counts else 0
    # A zero median (at least half of the shards empty) is reported as 1.0
    # rather than dividing by zero.
    imbalance = (max_count / median) if median else 1.0

    if as_json:
        _echo_json({
            "topology": topology,
            "project_distribution": {str(k): v for k, v in projects_by_shard.items()},
            "imbalance_ratio": round(imbalance, 2),
        }, as_json=True)
        return

    # Derive the per-shard virtual-node count from the ring itself instead of
    # repeating the router's constant as a literal here.
    ring_size = len(router._ring)
    vnodes_per_shard = ring_size // max(1, router.num_shards)

    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
    click.echo(f" Directory : {router.shards_dir}")
    click.echo(f" Ring size : {ring_size} virtual nodes ({router.num_shards} × {vnodes_per_shard})")
    click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
    click.echo()
    header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
    click.secho(header, fg="cyan")
    click.echo("-" * 60)
    # The position of each entry in topology["shards"] is taken as its shard index.
    for i, info in enumerate(topology.get("shards", [])):
        plist = projects_by_shard.get(i, [])
        exists_str = "yes" if info.get("exists") else "no"
        click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
        for pid in plist:
            click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
    if imbalance > 2.0:
        click.secho(
            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
            fg="yellow",
        )
|
|
140
|
+
|
|
141
|
+
|
|
93
142
|
@click.group()
|
|
94
143
|
def main() -> None:
|
|
95
144
|
"""CodeSpine CLI."""
|
|
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
130
179
|
fg="yellow",
|
|
131
180
|
)
|
|
132
181
|
|
|
133
|
-
|
|
134
|
-
|
|
182
|
+
# ShardedGraphStore routes each project to its dedicated DB shard.
|
|
183
|
+
# For single-project analysis this is transparent — shard() always
|
|
184
|
+
# returns a GraphStore pointing to the correct shard path.
|
|
185
|
+
sg = ShardedGraphStore(read_only=False)
|
|
186
|
+
# The indexer is initialised per-module below with the right shard store.
|
|
187
|
+
# We keep a single ShardedGraphStore to fan-out cross-module linking later.
|
|
135
188
|
|
|
136
189
|
# --- Workspace → project → module detection ---
|
|
137
190
|
# Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
|
|
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
241
294
|
last_result = None
|
|
242
295
|
for idx, (module_path, project_id) in enumerate(modules_with_ids):
|
|
243
296
|
if is_multi:
|
|
297
|
+
shard_idx = sg.router.shard_for(project_id)
|
|
244
298
|
click.echo()
|
|
245
|
-
click.secho(
|
|
299
|
+
click.secho(
|
|
300
|
+
f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
|
|
301
|
+
fg="cyan",
|
|
302
|
+
)
|
|
246
303
|
_reset_state()
|
|
304
|
+
# Use the shard store for this project so data lands in the right DB.
|
|
305
|
+
shard_store = sg.shard(project_id)
|
|
306
|
+
indexer = JavaIndexer(shard_store)
|
|
247
307
|
last_result = indexer.index_project(
|
|
248
308
|
module_path, full=full, progress=_progress, project_id=project_id, embed=embed
|
|
249
309
|
)
|
|
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
264
324
|
"""Finalise an in-place phase line and move to the next line."""
|
|
265
325
|
click.echo(f"\r✓ {label:<28} {result:<48}")
|
|
266
326
|
|
|
327
|
+
# For cross-module operations (cross-module linking, deep analysis, stats)
|
|
328
|
+
# we use the shard store for the root project (all modules share one shard).
|
|
329
|
+
root_project_id = last_result.project_id if last_result else root_basename
|
|
330
|
+
root_shard_store = sg.shard(root_project_id)
|
|
331
|
+
|
|
267
332
|
# ── Cross-module call linking ──────────────────────────────────────
|
|
268
333
|
if is_multi and len(modules_with_ids) > 1:
|
|
269
334
|
xmod_label = "Cross-module linking..."
|
|
270
335
|
_live_phase(xmod_label, "running")
|
|
271
336
|
xmod_pids = [pid for _, pid in modules_with_ids]
|
|
272
337
|
xmod_edges = link_cross_module_calls(
|
|
273
|
-
|
|
338
|
+
root_shard_store, project_ids=xmod_pids,
|
|
274
339
|
progress=lambda s: _live_phase(xmod_label, s),
|
|
275
340
|
)
|
|
276
341
|
_finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
|
|
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
287
352
|
comm_label = "Detecting communities..."
|
|
288
353
|
_live_phase(comm_label, "running")
|
|
289
354
|
communities = detect_communities(
|
|
290
|
-
|
|
355
|
+
root_shard_store,
|
|
291
356
|
progress=lambda s: _live_phase(comm_label, s),
|
|
292
357
|
)
|
|
293
358
|
_finish_phase(comm_label, f"{len(communities)} clusters found")
|
|
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
295
360
|
flow_label = "Detecting execution flows..."
|
|
296
361
|
_live_phase(flow_label, "running")
|
|
297
362
|
flows = trace_execution_flows(
|
|
298
|
-
|
|
363
|
+
root_shard_store,
|
|
299
364
|
progress=lambda s: _live_phase(flow_label, s),
|
|
300
365
|
)
|
|
301
366
|
_finish_phase(flow_label, f"{len(flows)} processes found")
|
|
302
367
|
|
|
303
368
|
dead_label = "Finding dead code..."
|
|
304
369
|
_live_phase(dead_label, "running")
|
|
305
|
-
dead = detect_dead_code(
|
|
370
|
+
dead = detect_dead_code(root_shard_store, limit=500)
|
|
306
371
|
_finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
|
|
307
372
|
|
|
308
373
|
coup_label = "Analyzing git history..."
|
|
309
374
|
_live_phase(coup_label, "running")
|
|
310
|
-
|
|
375
|
+
root_shard_store.clear_coupling()
|
|
311
376
|
coupling_root = abs_path
|
|
312
377
|
coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
|
|
313
378
|
coupling_pairs = compute_coupling(
|
|
314
|
-
|
|
379
|
+
root_shard_store,
|
|
315
380
|
coupling_root,
|
|
316
381
|
coupling_project,
|
|
317
382
|
days=SETTINGS.default_coupling_days,
|
|
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
329
394
|
flow_label = "Detecting execution flows..."
|
|
330
395
|
_live_phase(flow_label, "running (lightweight)")
|
|
331
396
|
try:
|
|
332
|
-
flows = trace_execution_flows(
|
|
397
|
+
flows = trace_execution_flows(root_shard_store, max_depth=3)
|
|
333
398
|
except Exception:
|
|
334
399
|
flows = []
|
|
335
400
|
_finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
|
|
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
337
402
|
dead_label = "Finding dead code..."
|
|
338
403
|
_live_phase(dead_label, "running (lightweight)")
|
|
339
404
|
try:
|
|
340
|
-
dead = detect_dead_code(
|
|
405
|
+
dead = detect_dead_code(root_shard_store, limit=100)
|
|
341
406
|
except Exception:
|
|
342
407
|
dead = []
|
|
343
408
|
_finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
|
|
344
409
|
|
|
345
410
|
_phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
|
|
346
411
|
|
|
347
|
-
vector_count =
|
|
412
|
+
vector_count = root_shard_store.query_records(
|
|
348
413
|
"""
|
|
349
414
|
MATCH (s:Symbol)
|
|
350
415
|
WHERE s.embedding IS NOT NULL
|
|
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
355
420
|
vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
|
|
356
421
|
_phase("Generating embeddings...", f"{vectors_stored} vectors stored")
|
|
357
422
|
|
|
358
|
-
symbol_count =
|
|
359
|
-
edge_count =
|
|
423
|
+
symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
|
|
424
|
+
edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
|
|
360
425
|
symbols = int(symbol_count[0]["count"]) if symbol_count else 0
|
|
361
426
|
edges = int(edge_count[0]["count"]) if edge_count else 0
|
|
362
427
|
elapsed = time.perf_counter() - started
|
|
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
376
441
|
|
|
377
442
|
# Detect unresolved imports → hint about unindexed sibling projects
|
|
378
443
|
try:
|
|
379
|
-
unresolved = JavaIndexer.detect_unresolved_imports(
|
|
444
|
+
unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
|
|
380
445
|
if unresolved:
|
|
381
446
|
click.echo()
|
|
382
447
|
click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
|
|
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
387
452
|
|
|
388
453
|
# Publish a read replica so MCP and read-only CLI commands (search, stats…)
|
|
389
454
|
# run against an isolated snapshot rather than competing with the write
|
|
390
|
-
# process's buffer pool.
|
|
391
|
-
# hot-reloads without restarting.
|
|
455
|
+
# process's buffer pool. Snapshot all open shards concurrently.
|
|
392
456
|
snap_label = "Publishing read replica..."
|
|
393
457
|
_live_phase(snap_label, "copying")
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
_finish_phase(snap_label, "MCP will reload automatically"
|
|
458
|
+
root_shard_store._recycle_conn()
|
|
459
|
+
sg.snapshot_all(background=False)
|
|
460
|
+
_finish_phase(snap_label, "MCP will reload automatically")
|
|
397
461
|
|
|
398
462
|
|
|
399
463
|
@main.command()
|
|
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
|
|
|
523
587
|
|
|
524
588
|
@main.command()
|
|
525
589
|
@click.option("--json", "as_json", is_flag=True)
|
|
526
|
-
|
|
590
|
+
@click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
|
|
591
|
+
def stats(as_json: bool, show_shards: bool) -> None:
|
|
527
592
|
"""Show per-project and aggregate graph statistics."""
|
|
528
|
-
|
|
529
|
-
|
|
593
|
+
if show_shards:
|
|
594
|
+
_show_shard_topology(as_json)
|
|
595
|
+
return
|
|
596
|
+
|
|
597
|
+
# Fan-out across all shards so stats covers every project in the cluster.
|
|
598
|
+
sg = ShardedGraphStore(read_only=True)
|
|
599
|
+
all_projects_meta = sg.list_project_metadata()
|
|
600
|
+
|
|
601
|
+
# For detailed stats we need the per-project shard store.
|
|
602
|
+
def _project_store(pid: str):
|
|
603
|
+
return sg.shard(pid)
|
|
604
|
+
|
|
530
605
|
if not projects:
|
|
531
606
|
click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
|
|
532
607
|
return
|
|
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
|
|
|
534
609
|
rows = []
|
|
535
610
|
for p in projects:
|
|
536
611
|
pid = p["id"]
|
|
537
|
-
|
|
612
|
+
# Route each query to the project's owning shard.
|
|
613
|
+
ps = _project_store(pid)
|
|
614
|
+
files = ps.query_records(
|
|
538
615
|
"MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
|
|
539
616
|
)
|
|
540
|
-
classes =
|
|
617
|
+
classes = ps.query_records(
|
|
541
618
|
"""
|
|
542
619
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
543
620
|
WITH f
|
|
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
|
|
|
546
623
|
""",
|
|
547
624
|
{"pid": pid},
|
|
548
625
|
)
|
|
549
|
-
methods =
|
|
626
|
+
methods = ps.query_records(
|
|
550
627
|
"""
|
|
551
628
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
552
629
|
WITH f
|
|
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
|
|
|
557
634
|
""",
|
|
558
635
|
{"pid": pid},
|
|
559
636
|
)
|
|
560
|
-
calls =
|
|
637
|
+
calls = ps.query_records(
|
|
561
638
|
"""
|
|
562
639
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
563
640
|
WITH f
|
|
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
|
|
|
568
645
|
""",
|
|
569
646
|
{"pid": pid},
|
|
570
647
|
)
|
|
571
|
-
emb =
|
|
648
|
+
emb = ps.query_records(
|
|
572
649
|
"""
|
|
573
650
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
574
651
|
WITH f
|
|
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
|
|
|
580
657
|
rows.append({
|
|
581
658
|
"project": pid,
|
|
582
659
|
"path": p["path"],
|
|
660
|
+
"shard": sg.router.shard_for(pid),
|
|
583
661
|
"files": files[0]["n"] if files else 0,
|
|
584
662
|
"classes": classes[0]["n"] if classes else 0,
|
|
585
663
|
"methods": methods[0]["n"] if methods else 0,
|
|
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
|
|
|
592
670
|
return
|
|
593
671
|
|
|
594
672
|
col_w = max(len(r["project"]) for r in rows)
|
|
595
|
-
header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
|
|
673
|
+
header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
|
|
596
674
|
click.secho(header, fg="cyan")
|
|
597
675
|
click.echo("-" * len(header))
|
|
598
676
|
total_files = total_classes = total_methods = total_calls = total_emb = 0
|
|
599
677
|
for r in rows:
|
|
600
678
|
click.echo(
|
|
601
|
-
f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
|
|
679
|
+
f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
|
|
602
680
|
)
|
|
603
681
|
total_files += r["files"]
|
|
604
682
|
total_classes += r["classes"]
|
|
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
|
|
|
608
686
|
if len(rows) > 1:
|
|
609
687
|
click.echo("-" * len(header))
|
|
610
688
|
click.secho(
|
|
611
|
-
f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
|
|
689
|
+
f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
|
|
612
690
|
fg="green",
|
|
613
691
|
)
|
|
614
692
|
|
|
@@ -4,8 +4,17 @@ from dataclasses import dataclass
|
|
|
4
4
|
|
|
5
5
|
@dataclass(frozen=True)
|
|
6
6
|
class Settings:
|
|
7
|
+
# Legacy single-DB paths — kept for backward compat and as defaults when
|
|
8
|
+
# GraphStore is constructed without per-shard path overrides.
|
|
7
9
|
db_path: str = os.path.expanduser("~/.codespine_db")
|
|
8
10
|
db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
|
|
11
|
+
|
|
12
|
+
# Sharding — new layout stores each shard under shards_dir/{N}/db
|
|
13
|
+
# num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
|
|
14
|
+
# ShardRouter reads CODESPINE_SHARDS directly and falls back to its own literal default of 4, so keep this value in sync with it.
|
|
15
|
+
num_shards: int = 4
|
|
16
|
+
shards_dir: str = os.path.expanduser("~/.codespine/shards")
|
|
17
|
+
|
|
9
18
|
pid_file: str = os.path.expanduser("~/.codespine.pid")
|
|
10
19
|
log_file: str = os.path.expanduser("~/.codespine.log")
|
|
11
20
|
embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
|
|
@@ -8,7 +8,7 @@ import shutil
|
|
|
8
8
|
import threading
|
|
9
9
|
import time
|
|
10
10
|
from contextlib import contextmanager
|
|
11
|
-
from dataclasses import dataclass
|
|
11
|
+
from dataclasses import InitVar, dataclass
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
14
|
import kuzu
|
|
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
|
|
|
39
39
|
@dataclass
|
|
40
40
|
class GraphStore:
|
|
41
41
|
read_only: bool = False
|
|
42
|
+
# Optional path overrides — when provided, the store uses these paths
|
|
43
|
+
# instead of the global SETTINGS values. The ShardedGraphStore uses
|
|
44
|
+
# this to give each shard its own isolated KùzuDB directory.
|
|
45
|
+
db_path_override: InitVar[str | None] = None
|
|
46
|
+
snapshot_path_override: InitVar[str | None] = None
|
|
47
|
+
|
|
48
|
+
def __post_init__(
|
|
49
|
+
self,
|
|
50
|
+
db_path_override: str | None,
|
|
51
|
+
snapshot_path_override: str | None,
|
|
52
|
+
) -> None:
|
|
53
|
+
# Resolve effective paths — per-shard overrides win over global SETTINGS.
|
|
54
|
+
self._db_path: str = db_path_override or SETTINGS.db_path
|
|
55
|
+
self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
|
|
56
|
+
|
|
57
|
+
# Per-instance snapshot synchronisation (not class-level) so that
|
|
58
|
+
# multiple shards can snapshot concurrently without a shared bottleneck.
|
|
59
|
+
self._inst_snapshot_lock: threading.Lock = threading.Lock()
|
|
60
|
+
self._inst_snapshot_pending: threading.Event = threading.Event()
|
|
42
61
|
|
|
43
|
-
def __post_init__(self) -> None:
|
|
44
62
|
self._tls: threading.local = threading.local()
|
|
45
63
|
from codespine.overlay.store import OverlayStore
|
|
46
64
|
|
|
@@ -48,10 +66,10 @@ class GraphStore:
|
|
|
48
66
|
|
|
49
67
|
# Read-only callers (MCP, CLI reads) use the read replica when available.
|
|
50
68
|
# This isolates them from the write process's buffer pool and WAL churn.
|
|
51
|
-
if self.read_only and os.path.exists(
|
|
52
|
-
db_path =
|
|
69
|
+
if self.read_only and os.path.exists(self._snapshot_path):
|
|
70
|
+
db_path = self._snapshot_path
|
|
53
71
|
else:
|
|
54
|
-
db_path =
|
|
72
|
+
db_path = self._db_path
|
|
55
73
|
|
|
56
74
|
try:
|
|
57
75
|
self.db = self._open_with_recovery(db_path)
|
|
@@ -97,7 +115,7 @@ class GraphStore:
|
|
|
97
115
|
try:
|
|
98
116
|
ensure_schema(self._conn())
|
|
99
117
|
except Exception as exc:
|
|
100
|
-
path = getattr(self.db, "database_path",
|
|
118
|
+
path = getattr(self.db, "database_path", self._db_path)
|
|
101
119
|
if not self._is_recoverable_db_error(exc):
|
|
102
120
|
raise
|
|
103
121
|
LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
|
|
@@ -768,8 +786,7 @@ class GraphStore:
|
|
|
768
786
|
self.clear_flows()
|
|
769
787
|
self.clear_coupling()
|
|
770
788
|
|
|
771
|
-
|
|
772
|
-
def force_delete_all_data() -> list[str]:
|
|
789
|
+
def force_delete_all_data(self) -> list[str]:
|
|
773
790
|
"""Delete all CodeSpine data files without touching the Kuzu engine.
|
|
774
791
|
|
|
775
792
|
This is the nuclear option for OOM recovery: when the buffer pool is
|
|
@@ -779,12 +796,14 @@ class GraphStore:
|
|
|
779
796
|
|
|
780
797
|
Returns the list of paths that were removed.
|
|
781
798
|
"""
|
|
799
|
+
db_path = self._db_path
|
|
800
|
+
snapshot_path = self._snapshot_path
|
|
782
801
|
removed: list[str] = []
|
|
783
802
|
for path in [
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
803
|
+
db_path,
|
|
804
|
+
snapshot_path,
|
|
805
|
+
snapshot_path + ".updated",
|
|
806
|
+
snapshot_path + ".tmp",
|
|
788
807
|
SETTINGS.embedding_cache_path,
|
|
789
808
|
SETTINGS.overlay_dir,
|
|
790
809
|
SETTINGS.index_meta_dir,
|
|
@@ -801,7 +820,7 @@ class GraphStore:
|
|
|
801
820
|
pass
|
|
802
821
|
# Also remove any stale WAL files next to the DB
|
|
803
822
|
for suffix in (".wal", ".lock"):
|
|
804
|
-
wal_path =
|
|
823
|
+
wal_path = db_path + suffix
|
|
805
824
|
if os.path.exists(wal_path):
|
|
806
825
|
try:
|
|
807
826
|
os.remove(wal_path)
|
|
@@ -812,7 +831,7 @@ class GraphStore:
|
|
|
812
831
|
|
|
813
832
|
def rebuild_empty_db(self) -> None:
|
|
814
833
|
self._recycle_conn()
|
|
815
|
-
path =
|
|
834
|
+
path = self._db_path
|
|
816
835
|
# Remove the DB directory AND any stale WAL / lock files
|
|
817
836
|
self._remove_db_path(path)
|
|
818
837
|
for suffix in (".wal", ".lock"):
|
|
@@ -825,11 +844,8 @@ class GraphStore:
|
|
|
825
844
|
|
|
826
845
|
# Also remove the read replica so that read-only callers (stats, MCP)
|
|
827
846
|
# don't continue to see stale data from before the wipe.
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
SETTINGS.db_snapshot_path + ".tmp",
|
|
831
|
-
SETTINGS.db_snapshot_path + ".updated",
|
|
832
|
-
]:
|
|
847
|
+
snap = self._snapshot_path
|
|
848
|
+
for stale in [snap, snap + ".tmp", snap + ".updated"]:
|
|
833
849
|
self._remove_db_path(stale)
|
|
834
850
|
|
|
835
851
|
# Kuzu may retain stale internal state from a previous failed open of
|
|
@@ -926,18 +942,15 @@ class GraphStore:
|
|
|
926
942
|
},
|
|
927
943
|
)
|
|
928
944
|
|
|
929
|
-
|
|
930
|
-
# Only one snapshot runs at a time; a pending request supersedes queued ones.
|
|
931
|
-
_snapshot_lock: threading.Lock = threading.Lock()
|
|
932
|
-
_snapshot_pending: threading.Event = threading.Event()
|
|
933
|
-
|
|
934
|
-
@staticmethod
|
|
935
|
-
def snapshot_to_read_replica(background: bool = False) -> bool:
|
|
945
|
+
def snapshot_to_read_replica(self, background: bool = False) -> bool:
|
|
936
946
|
"""Atomically copy the write DB to the read-replica path.
|
|
937
947
|
|
|
938
948
|
The read replica is used by the MCP daemon and all read-only CLI
|
|
939
949
|
commands so they never contend with the write process's buffer pool.
|
|
940
950
|
|
|
951
|
+
Each GraphStore instance has its own snapshot lock so that multiple
|
|
952
|
+
shards can snapshot concurrently without serialising on a class lock.
|
|
953
|
+
|
|
941
954
|
Parameters
|
|
942
955
|
----------
|
|
943
956
|
background:
|
|
@@ -950,36 +963,38 @@ class GraphStore:
|
|
|
950
963
|
Returns True on success (or when dispatched to background), False if
|
|
951
964
|
the source DB does not exist.
|
|
952
965
|
"""
|
|
953
|
-
src =
|
|
966
|
+
src = self._db_path
|
|
954
967
|
if not os.path.exists(src):
|
|
955
968
|
return False
|
|
956
969
|
|
|
957
970
|
if background:
|
|
958
971
|
# Signal that a snapshot is wanted, then ensure a worker is running.
|
|
959
|
-
|
|
972
|
+
self._inst_snapshot_pending.set()
|
|
973
|
+
inst = self # capture for closure
|
|
960
974
|
|
|
961
975
|
def _worker() -> None:
|
|
962
|
-
while
|
|
963
|
-
|
|
964
|
-
with
|
|
965
|
-
|
|
976
|
+
while inst._inst_snapshot_pending.is_set():
|
|
977
|
+
inst._inst_snapshot_pending.clear()
|
|
978
|
+
with inst._inst_snapshot_lock:
|
|
979
|
+
inst._do_snapshot()
|
|
966
980
|
|
|
967
|
-
if not
|
|
981
|
+
if not self._inst_snapshot_lock.locked():
|
|
968
982
|
t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
|
|
969
983
|
t.start()
|
|
970
984
|
return True
|
|
971
985
|
|
|
972
986
|
# Foreground (blocking) path — used by CLI analyse and tests.
|
|
973
|
-
with
|
|
974
|
-
return
|
|
987
|
+
with self._inst_snapshot_lock:
|
|
988
|
+
return self._do_snapshot()
|
|
975
989
|
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
dst = SETTINGS.db_snapshot_path
|
|
990
|
+
def _do_snapshot(self) -> bool:
|
|
991
|
+
"""Perform the actual copy. Must be called with the instance snapshot lock held."""
|
|
992
|
+
src = self._db_path
|
|
993
|
+
dst = self._snapshot_path
|
|
981
994
|
if not os.path.exists(src):
|
|
982
995
|
return False
|
|
996
|
+
# Ensure the parent directory for the replica exists (shards layout).
|
|
997
|
+
os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
|
|
983
998
|
tmp = dst + ".tmp"
|
|
984
999
|
try:
|
|
985
1000
|
if os.path.exists(tmp):
|
|
@@ -987,7 +1002,6 @@ class GraphStore:
|
|
|
987
1002
|
if os.path.isdir(src):
|
|
988
1003
|
shutil.copytree(src, tmp)
|
|
989
1004
|
else:
|
|
990
|
-
os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
|
|
991
1005
|
shutil.copy2(src, tmp)
|
|
992
1006
|
if os.path.exists(dst):
|
|
993
1007
|
shutil.rmtree(dst, ignore_errors=True)
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""CodeSpine sharding package.
|
|
2
|
+
|
|
3
|
+
Exposes the consistent-hash router and the ShardedGraphStore facade.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from codespine.sharding.router import ShardRouter
|
|
7
|
+
from codespine.sharding.store import ShardedGraphStore
|
|
8
|
+
|
|
9
|
+
__all__ = ["ShardRouter", "ShardedGraphStore"]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Consistent-hash shard router for CodeSpine.
|
|
2
|
+
|
|
3
|
+
Design
|
|
4
|
+
------
|
|
5
|
+
* ``num_shards`` physical shards — each shard owns an independent KùzuDB at
|
|
6
|
+
``~/.codespine/shards/{N}/db``.
|
|
7
|
+
* Shard key = *root project name* (the part before ``::`` in a multi-module
|
|
8
|
+
project ID). This guarantees that all modules of the same project are
|
|
9
|
+
co-located in the same shard so that cross-module call resolution still
|
|
10
|
+
works in one graph traversal.
|
|
11
|
+
* Virtual-node ring (``VIRTUAL_NODES_PER_SHARD = 150``) gives an even
|
|
12
|
+
distribution even for small shard counts.
|
|
13
|
+
* ``num_shards`` can be changed at any time; affected projects must be
|
|
14
|
+
re-indexed, but unaffected projects continue to work.
|
|
15
|
+
|
|
16
|
+
Env var override
|
|
17
|
+
----------------
|
|
18
|
+
``CODESPINE_SHARDS=N`` (integer, default 4) sets the number of shards at
|
|
19
|
+
process start. 0 or 1 disables sharding (all projects land in shard 0).
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import bisect
|
|
25
|
+
import hashlib
|
|
26
|
+
import os
|
|
27
|
+
|
|
28
|
+
VIRTUAL_NODES_PER_SHARD = 150  # virtual ring entries per physical shard


class ShardRouter:
    """Maps project IDs to shard indices via a consistent-hash ring.

    Parameters
    ----------
    num_shards:
        Number of physical shards.  Defaults to the ``CODESPINE_SHARDS``
        environment variable, or ``4`` if that is unset or not a plain
        non-negative integer.  Values below 1 are clamped to 1.
    shards_dir:
        Base directory that holds per-shard sub-directories.  Defaults to
        ``~/.codespine/shards`` (user-expanded).
    """

    def __init__(
        self,
        num_shards: int | None = None,
        shards_dir: str | None = None,
    ) -> None:
        env_value = os.environ.get("CODESPINE_SHARDS", "").strip()
        default_shards = 4
        if env_value.isdigit():
            # ``str.isdigit`` also accepts characters such as "²" that
            # ``int`` rejects, so a malformed variable must fall back to the
            # default instead of raising at construction time.
            try:
                default_shards = max(1, int(env_value))
            except ValueError:
                default_shards = 4

        requested = num_shards if num_shards is not None else default_shards
        self.num_shards: int = max(1, requested)
        self.shards_dir: str = shards_dir or os.path.expanduser("~/.codespine/shards")

        # Virtual-node ring: (ring_point, shard_index) pairs sorted by
        # ring_point.  The key format below determines where every project
        # lands, so it must stay stable across releases.
        self._ring: list[tuple[int, int]] = sorted(
            (self._hash_key(f"shard-{shard_idx}-vn-{vn}"), shard_idx)
            for shard_idx in range(self.num_shards)
            for vn in range(VIRTUAL_NODES_PER_SHARD)
        )
        # Parallel list of ring points only, for bisect lookups.
        self._ring_points = [point for point, _ in self._ring]

    # ------------------------------------------------------------------
    # Routing
    # ------------------------------------------------------------------

    @staticmethod
    def _hash_key(key: str) -> int:
        """Deterministic 64-bit hash of a string."""
        raw = hashlib.md5(key.encode("utf-8")).digest()
        # Use first 8 bytes as unsigned 64-bit integer for wide ring range.
        return int.from_bytes(raw[:8], "big")

    def _root_key(self, project_id: str) -> str:
        """Extract the root portion of a project_id for co-location.

        For multi-module projects (format ``root::module``), all modules of
        the same root must land on the same shard so that cross-module graph
        traversals work without federation.  An ID without ``::`` is
        returned unchanged.
        """
        return project_id.split("::", 1)[0]

    def shard_for(self, project_id: str) -> int:
        """Return the shard index [0, num_shards) for the given project_id."""
        if self.num_shards == 1:
            return 0
        point = self._hash_key(self._root_key(project_id))
        pos = bisect.bisect_left(self._ring_points, point)
        # A point past the last ring entry wraps around to the first one.
        _, shard_idx = self._ring[pos % len(self._ring)]
        return shard_idx

    def all_shards(self) -> list[int]:
        """Return all shard indices."""
        return list(range(self.num_shards))

    # ------------------------------------------------------------------
    # Path helpers
    # ------------------------------------------------------------------

    def db_path(self, shard_index: int) -> str:
        """Write-DB path for a shard: ``<shards_dir>/<index>/db``."""
        return os.path.join(self.shard_home(shard_index), "db")

    def snapshot_path(self, shard_index: int) -> str:
        """Read-replica path for a shard: ``<shards_dir>/<index>/db_read``."""
        return os.path.join(self.shard_home(shard_index), "db_read")

    def shard_home(self, shard_index: int) -> str:
        """Directory that holds all data for a shard."""
        return os.path.join(self.shards_dir, str(shard_index))

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def describe(self) -> dict:
        """Return a human-readable summary of the routing table."""
        return {
            "num_shards": self.num_shards,
            "shards_dir": self.shards_dir,
            "virtual_nodes_per_shard": VIRTUAL_NODES_PER_SHARD,
            "ring_size": len(self._ring),
        }
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""ShardedGraphStore — in-process shard coordinator.
|
|
2
|
+
|
|
3
|
+
Each project (or multi-module root) is consistently hashed to a shard index.
|
|
4
|
+
All modules of the same project share one shard so that cross-module graph
|
|
5
|
+
traversals see the full call graph without federation.
|
|
6
|
+
|
|
7
|
+
Design
|
|
8
|
+
------
|
|
9
|
+
* ``ShardedGraphStore`` maintains a pool of ``GraphStore`` instances, one per
|
|
10
|
+
shard opened so far. Shards are opened lazily on first access.
|
|
11
|
+
* Existing callers that receive a plain ``GraphStore`` continue to work
|
|
12
|
+
unchanged. The new entry point is ``ShardedGraphStore.shard(project_id)``
|
|
13
|
+
which returns the ``GraphStore`` responsible for that project.
|
|
14
|
+
* Fan-out reads (``list_project_metadata``, global search) call
|
|
15
|
+
``all_shards()`` to iterate every shard, opening each on demand (in read-only mode, shards with no DB on disk are skipped).
|
|
16
|
+
* ``snapshot_all()`` snapshots every already-open shard (concurrently when ``background=True``).
|
|
17
|
+
|
|
18
|
+
Migration from v0.9.x
|
|
19
|
+
---------------------
|
|
20
|
+
If ``~/.codespine_db`` exists and shard 0's DB path doesn't exist yet, the
|
|
21
|
+
store automatically migrates the legacy DB to shard 0's path on first access
|
|
22
|
+
so existing indexed data isn't lost.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
import shutil
|
|
30
|
+
import threading
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
from codespine.config import SETTINGS
|
|
34
|
+
from codespine.db.store import GraphStore
|
|
35
|
+
from codespine.sharding.router import ShardRouter
|
|
36
|
+
|
|
37
|
+
LOGGER = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ShardedGraphStore:
    """Coordinates multiple per-shard ``GraphStore`` instances.

    Shards are opened lazily and cached in an internal pool guarded by a
    lock.  Project-scoped calls are delegated to the shard that the router
    assigns to the project; global reads fan out over every shard.

    Parameters
    ----------
    read_only:
        Passed through to each ``GraphStore``.
    num_shards:
        Override for the shard count.  Defaults to ``SETTINGS.num_shards``.
    shards_dir:
        Override for the shards base directory.  Defaults to
        ``SETTINGS.shards_dir``.
    """

    def __init__(
        self,
        read_only: bool = False,
        num_shards: int | None = None,
        shards_dir: str | None = None,
    ) -> None:
        self.read_only = read_only
        self.router = ShardRouter(
            num_shards=num_shards or SETTINGS.num_shards,
            shards_dir=shards_dir or SETTINGS.shards_dir,
        )
        # shard index -> opened GraphStore; only ever touched under _lock.
        self._pool: dict[int, GraphStore] = {}
        self._lock = threading.Lock()
        # Set once the legacy-DB migration has been attempted for shard 0.
        self._migrated = False

    # ------------------------------------------------------------------
    # Core routing
    # ------------------------------------------------------------------

    def shard(self, project_id: str) -> GraphStore:
        """Return (or open) the GraphStore for this project's shard.

        In write mode this always returns a valid store (creating the DB if
        needed).  In read-only mode, if the shard DB has never been written
        to, this also creates it (returning an empty-but-valid store) so that
        callers can safely query it without crashing.
        """
        return self._get_or_create_shard(self.router.shard_for(project_id))

    def _get_or_create_shard(self, idx: int) -> GraphStore:
        """Return the store for shard *idx*, creating an empty DB if needed.

        Unlike ``_get_shard`` this never returns ``None``: when the shard is
        missing in read-only mode it is opened with ``read_only=False`` so
        that an empty database gets created and callers see empty results.
        """
        store = self._get_shard(idx)
        if store is not None:
            return store
        with self._lock:
            # Re-check under the lock: another thread may have opened the
            # shard between the _get_shard call above and this point.
            if idx not in self._pool:
                db_path = self.router.db_path(idx)
                snap_path = self.router.snapshot_path(idx)
                os.makedirs(os.path.dirname(db_path), exist_ok=True)
                self._pool[idx] = GraphStore(
                    read_only=False,  # create empty DB
                    db_path_override=db_path,
                    snapshot_path_override=snap_path,
                )
            return self._pool[idx]

    def _get_shard(self, idx: int) -> GraphStore | None:
        """Return the GraphStore for shard *idx*, or None if it doesn't exist
        yet and we're in read-only mode (nothing to read there)."""
        with self._lock:
            if idx not in self._pool:
                self._maybe_migrate(idx)
                db_path = self.router.db_path(idx)
                snap_path = self.router.snapshot_path(idx)

                # In read-only mode, skip shards whose DB hasn't been created
                # yet — Kuzu refuses to create an empty DB under read_only=True.
                if self.read_only and not os.path.exists(db_path) and not os.path.exists(snap_path):
                    return None

                # Ensure parent directory exists before Kuzu opens it.
                os.makedirs(os.path.dirname(db_path), exist_ok=True)
                self._pool[idx] = GraphStore(
                    read_only=self.read_only,
                    db_path_override=db_path,
                    snapshot_path_override=snap_path,
                )
            return self._pool[idx]

    @staticmethod
    def _remove_path(path: str) -> None:
        """Best-effort removal of a file or directory tree; never raises."""
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
            return
        try:
            os.remove(path)
        except OSError:
            # Missing or undeletable file: removal is best-effort only.
            pass

    @classmethod
    def _relocate(cls, src: str, dst: str) -> None:
        """Copy *src* (file or directory) to *dst*, then delete *src*.

        The copy goes to a staging sibling of *dst* and is renamed into place
        only once complete, so a failed copy never leaves a partial *dst*
        behind.  Raises ``OSError`` if the copy or rename fails; *src* is
        left untouched in that case.
        """
        staging = dst + ".migrating"
        cls._remove_path(staging)  # leftover from an earlier interrupted run
        try:
            if os.path.isdir(src):
                shutil.copytree(src, staging)
            else:
                shutil.copy2(src, staging)
            os.replace(staging, dst)
        except OSError:
            cls._remove_path(staging)
            raise
        # Copy first, delete after — if the copy fails the original is safe.
        cls._remove_path(src)

    def _maybe_migrate(self, idx: int) -> None:
        """One-time migration: move legacy ~/.codespine_db → shard 0 DB path.

        Guard: only triggers when the shards_dir matches the compiled-in
        default (SETTINGS.shards_dir).  Custom / test shards_dir values are
        never eligible for migration so that test isolation is preserved.

        Must be called with ``self._lock`` held.  Failures are logged and
        swallowed so that the caller can continue with a fresh shard.
        """
        if self._migrated or idx != 0:
            return
        self._migrated = True

        # Safety guard: never auto-migrate when using a non-default shards dir.
        # This prevents test code that passes a temp dir from accidentally
        # touching production data.
        if os.path.realpath(self.router.shards_dir) != os.path.realpath(SETTINGS.shards_dir):
            return

        legacy = SETTINGS.db_path  # ~/.codespine_db
        target = self.router.db_path(0)
        if not os.path.exists(legacy):
            return
        if os.path.exists(target):
            # Sharded layout already initialised — don't overwrite.
            return
        LOGGER.info(
            "Migrating legacy DB %s → shard 0 path %s", legacy, target
        )
        try:
            os.makedirs(os.path.dirname(target), exist_ok=True)
            self._relocate(legacy, target)
            # Also migrate read replica if present.
            legacy_snap = SETTINGS.db_snapshot_path
            target_snap = self.router.snapshot_path(0)
            if os.path.exists(legacy_snap) and not os.path.exists(target_snap):
                self._relocate(legacy_snap, target_snap)
        except OSError as exc:
            LOGGER.warning("Migration from legacy path failed: %s — starting fresh", exc)

    def all_shards(self) -> list[GraphStore]:
        """Open and return all physical shards that exist (for fan-out reads).

        Non-existent shards are skipped in read-only mode to avoid Kuzu
        errors when opening empty paths under read_only=True.
        """
        candidates = (self._get_shard(i) for i in self.router.all_shards())
        return [store for store in candidates if store is not None]

    def open_shards(self) -> list[GraphStore]:
        """Return only shards that have already been opened (no I/O)."""
        with self._lock:
            return list(self._pool.values())

    # ------------------------------------------------------------------
    # Delegated project-scoped operations
    # ------------------------------------------------------------------

    def upsert_project(self, project_id: str, path: str) -> None:
        """Delegate to the shard that owns *project_id*."""
        self.shard(project_id).upsert_project(project_id, path)

    def clear_project(self, project_id: str) -> None:
        """Delegate to the shard that owns *project_id*."""
        self.shard(project_id).clear_project(project_id)

    def get_project_metadata(self, project_id: str) -> dict[str, Any] | None:
        """Delegate to the shard that owns *project_id*."""
        return self.shard(project_id).get_project_metadata(project_id)

    def set_project_overlay_dirty(self, project_id: str, dirty: bool) -> None:
        """Delegate to the shard that owns *project_id*."""
        self.shard(project_id).set_project_overlay_dirty(project_id, dirty)

    def set_project_indexed_commit(self, project_id: str, commit: str) -> None:
        """Delegate to the shard that owns *project_id*."""
        self.shard(project_id).set_project_indexed_commit(project_id, commit)

    def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
        """Delegate to the shard that owns *project_id*."""
        return self.shard(project_id).project_file_hashes(project_id)

    def project_has_embeddings(self, project_id: str) -> bool:
        """Delegate to the shard that owns *project_id*."""
        return self.shard(project_id).project_has_embeddings(project_id)

    def upsert_file_from_entry(self, entry: dict, project_path: str) -> None:
        """Route *entry* by its ``"project_id"`` key and delegate the upsert.

        An entry without that key is routed as project ID ``""``.
        """
        project_id = entry.get("project_id", "")
        self.shard(project_id).upsert_file_from_entry(entry, project_path)

    def clear_file_by_path(self, project_id: str, project_path: str, file_path: str) -> None:
        """Delegate to the shard that owns *project_id*."""
        self.shard(project_id).clear_file_by_path(project_id, project_path, file_path)

    # ------------------------------------------------------------------
    # Fan-out global reads
    # ------------------------------------------------------------------

    def list_project_metadata(self) -> list[dict[str, Any]]:
        """Aggregate project list across all shards.

        Records with an empty or missing ``"id"`` are dropped, the first
        record seen for each id wins, and the result is sorted by id.
        """
        results: list[dict[str, Any]] = []
        seen: set[str] = set()
        for store in self.all_shards():
            for rec in store.list_project_metadata():
                pid = rec.get("id", "")
                if pid and pid not in seen:
                    seen.add(pid)
                    results.append(rec)
        results.sort(key=lambda r: r.get("id", ""))
        return results

    def query_records(
        self,
        query: str,
        params: dict[str, Any] | None = None,
        *,
        project_id: str | None = None,
    ) -> list[dict[str, Any]]:
        """Execute a Cypher query.

        If ``project_id`` is given, the query runs only on that project's
        shard (fast path).  Otherwise the query fans out to all shards and
        results are concatenated.

        Note: fan-out only makes sense for queries whose results are
        independent per shard (e.g. listing nodes).  Queries that aggregate
        across shards (e.g. global COUNT) will return per-shard subtotals.
        """
        if project_id:
            return self.shard(project_id).query_records(query, params)
        merged: list[dict[str, Any]] = []
        for store in self.all_shards():
            merged.extend(store.query_records(query, params))
        return merged

    # ------------------------------------------------------------------
    # Snapshot all shards
    # ------------------------------------------------------------------

    def snapshot_all(self, background: bool = False) -> None:
        """Snapshot every open shard.

        ``background`` is forwarded unchanged to each store's
        ``snapshot_to_read_replica``.  Shards that have not been opened in
        this process are not touched.
        """
        for store in self.open_shards():
            store.snapshot_to_read_replica(background=background)

    # ------------------------------------------------------------------
    # Global reset / status
    # ------------------------------------------------------------------

    def force_delete_all_data(self) -> list[str]:
        """Delete all shards' data files.  Equivalent to force_delete per shard.

        Returns the concatenation of each shard's ``force_delete_all_data``
        result.
        """
        # NOTE(review): the pooled GraphStore objects are kept after deletion;
        # confirm they remain usable once their data files are gone.
        removed: list[str] = []
        for store in self.all_shards():
            removed.extend(store.force_delete_all_data())
        return removed

    def describe(self) -> dict:
        """Return a human-readable description of the shard topology."""
        # Snapshot the opened indices under the lock so a concurrent open
        # cannot mutate the pool while it is being read.
        with self._lock:
            opened = set(self._pool)
        shard_info = []
        for idx in self.router.all_shards():
            db_path = self.router.db_path(idx)
            shard_info.append({
                "index": idx,
                "db_path": db_path,
                "exists": os.path.exists(db_path),
                "open": idx in opened,
            })
        return {
            **self.router.describe(),
            "shards": shard_info,
        }

    # ------------------------------------------------------------------
    # Backward-compat helpers — delegate to shard 0 for single-project use
    # ------------------------------------------------------------------

    @property
    def overlay_store(self):
        """Expose the overlay store from shard 0 for backward compat.

        Uses the same create-if-missing fallback as ``shard()``, so this
        also works in read-only mode before shard 0 has been written.
        """
        return self._get_or_create_shard(0).overlay_store

    @staticmethod
    def stable_id(*parts: str) -> str:
        """Stable SHA1-based identifier (shard-independent)."""
        return GraphStore.stable_id(*parts)
|
|
@@ -45,6 +45,9 @@ codespine/search/fuzzy.py
|
|
|
45
45
|
codespine/search/hybrid.py
|
|
46
46
|
codespine/search/rrf.py
|
|
47
47
|
codespine/search/vector.py
|
|
48
|
+
codespine/sharding/__init__.py
|
|
49
|
+
codespine/sharding/router.py
|
|
50
|
+
codespine/sharding/store.py
|
|
48
51
|
codespine/watch/__init__.py
|
|
49
52
|
codespine/watch/git_hook.py
|
|
50
53
|
codespine/watch/watcher.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|