codespine 0.9.4__tar.gz → 0.9.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.9.4 → codespine-0.9.6}/PKG-INFO +1 -1
- {codespine-0.9.4 → codespine-0.9.6}/codespine/__init__.py +1 -1
- {codespine-0.9.4 → codespine-0.9.6}/codespine/cli.py +109 -31
- {codespine-0.9.4 → codespine-0.9.6}/codespine/config.py +9 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/db/store.py +76 -50
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/call_resolver.py +1 -1
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/engine.py +28 -14
- codespine-0.9.6/codespine/sharding/__init__.py +9 -0
- codespine-0.9.6/codespine/sharding/router.py +123 -0
- codespine-0.9.6/codespine/sharding/store.py +312 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/SOURCES.txt +3 -0
- {codespine-0.9.4 → codespine-0.9.6}/pyproject.toml +1 -1
- {codespine-0.9.4 → codespine-0.9.6}/LICENSE +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/README.md +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/community.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/context.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/coupling.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/flow.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/analysis/impact.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/db/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/db/schema.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/diff/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/guide.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/di_resolver.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/mcp/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/mcp/server.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/noise/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/noise/blocklist.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/git_state.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/merge.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/overlay/store.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/bm25.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/fuzzy.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/hybrid.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/rrf.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/search/vector.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/__init__.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/git_hook.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine/watch/watcher.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/gindex.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/setup.cfg +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_call_resolver.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_community_detection.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_deadcode.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_java_parser.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_multimodule_index.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_overlay.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_search_ranking.py +0 -0
- {codespine-0.9.4 → codespine-0.9.6}/tests/test_store_recovery.py +0 -0
|
@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
|
|
|
20
20
|
from codespine.analysis.impact import analyze_impact
|
|
21
21
|
from codespine.config import SETTINGS
|
|
22
22
|
from codespine.db.store import GraphStore
|
|
23
|
+
from codespine.sharding import ShardedGraphStore, ShardRouter
|
|
23
24
|
from codespine.diff.branch_diff import compare_branches
|
|
24
25
|
from codespine.indexer.engine import JavaIndexer
|
|
25
26
|
from codespine.mcp.server import build_mcp_server
|
|
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
|
|
|
90
91
|
return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
|
|
91
92
|
|
|
92
93
|
|
|
94
|
+
def _show_shard_topology(as_json: bool) -> None:
    """Display the current shard routing topology and imbalance metrics.

    Opens the sharded store read-only, groups every project returned by
    ``list_project_metadata()`` under the shard index the router assigns to
    its id, and reports the result.

    The imbalance ratio is ``max projects on one shard / baseline``. The
    baseline is the (upper) median of the per-shard project counts; when that
    median is 0 -- i.e. at least half of the shards hold no project -- the
    ideal per-shard load (ceiling of the mean) is used instead, so a layout
    where everything sits on one shard is not reported as balanced.

    Args:
        as_json: When true, emit one JSON document (topology, per-shard
            project ids, imbalance ratio) instead of the text table.
    """
    router = ShardRouter()
    sg = ShardedGraphStore(read_only=True)
    topology = sg.describe()

    # Gather project -> shard mapping from all shards. Every shard index is
    # pre-seeded so that empty shards still count towards the statistics.
    shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
    for p in sg.list_project_metadata():
        pid = p.get("id", "")
        # setdefault: tolerate an index outside the pre-seeded range instead
        # of failing the whole report with a KeyError.
        shard_project_counts.setdefault(router.shard_for(pid), []).append(pid)

    counts = [len(pids) for pids in shard_project_counts.values()]
    total = sum(counts)
    max_count = max(counts, default=0)
    median = sorted(counts)[len(counts) // 2] if counts else 0
    # A median of 0 used to force the ratio to 1.0 even when every project
    # lived on a single shard. Fall back to the smallest maximum load an even
    # placement could reach: ceil(total / number of shards).
    baseline = median or (-(-total // len(counts)) if counts else 0)
    imbalance = (max_count / baseline) if baseline else 1.0

    if as_json:
        _echo_json({
            "topology": topology,
            "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
            "imbalance_ratio": round(imbalance, 2),
        }, as_json=True)
        return

    # NOTE(review): presumably mirrors the router's virtual-node count per
    # shard -- confirm against codespine.sharding.router.
    vnodes_per_shard = 150

    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
    click.echo(f" Directory : {router.shards_dir}")
    click.echo(f" Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {vnodes_per_shard})")
    click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
    click.echo()
    header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
    click.secho(header, fg="cyan")
    click.echo("-" * 60)
    # The position of an entry in topology["shards"] is used as its shard index.
    for i, info in enumerate(topology.get("shards", [])):
        plist = shard_project_counts.get(i, [])
        exists_str = "yes" if info.get("exists") else "no"
        click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
        # One continuation row per project, aligned under the Path column.
        for pid in plist:
            click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
    if imbalance > 2.0:
        click.secho(
            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
            fg="yellow",
        )
|
|
140
|
+
|
|
141
|
+
|
|
93
142
|
@click.group()
|
|
94
143
|
def main() -> None:
|
|
95
144
|
"""CodeSpine CLI."""
|
|
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
130
179
|
fg="yellow",
|
|
131
180
|
)
|
|
132
181
|
|
|
133
|
-
|
|
134
|
-
|
|
182
|
+
# ShardedGraphStore routes each project to its dedicated DB shard.
|
|
183
|
+
# For single-project analysis this is transparent — shard() always
|
|
184
|
+
# returns a GraphStore pointing to the correct shard path.
|
|
185
|
+
sg = ShardedGraphStore(read_only=False)
|
|
186
|
+
# The indexer is initialised per-module below with the right shard store.
|
|
187
|
+
# We keep a single ShardedGraphStore to fan-out cross-module linking later.
|
|
135
188
|
|
|
136
189
|
# --- Workspace → project → module detection ---
|
|
137
190
|
# Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
|
|
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
241
294
|
last_result = None
|
|
242
295
|
for idx, (module_path, project_id) in enumerate(modules_with_ids):
|
|
243
296
|
if is_multi:
|
|
297
|
+
shard_idx = sg.router.shard_for(project_id)
|
|
244
298
|
click.echo()
|
|
245
|
-
click.secho(
|
|
299
|
+
click.secho(
|
|
300
|
+
f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
|
|
301
|
+
fg="cyan",
|
|
302
|
+
)
|
|
246
303
|
_reset_state()
|
|
304
|
+
# Use the shard store for this project so data lands in the right DB.
|
|
305
|
+
shard_store = sg.shard(project_id)
|
|
306
|
+
indexer = JavaIndexer(shard_store)
|
|
247
307
|
last_result = indexer.index_project(
|
|
248
308
|
module_path, full=full, progress=_progress, project_id=project_id, embed=embed
|
|
249
309
|
)
|
|
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
264
324
|
"""Finalise an in-place phase line and move to the next line."""
|
|
265
325
|
click.echo(f"\r✓ {label:<28} {result:<48}")
|
|
266
326
|
|
|
327
|
+
# For cross-module operations (cross-module linking, deep analysis, stats)
|
|
328
|
+
# we use the shard store for the root project (all modules share one shard).
|
|
329
|
+
root_project_id = last_result.project_id if last_result else root_basename
|
|
330
|
+
root_shard_store = sg.shard(root_project_id)
|
|
331
|
+
|
|
267
332
|
# ── Cross-module call linking ──────────────────────────────────────
|
|
268
333
|
if is_multi and len(modules_with_ids) > 1:
|
|
269
334
|
xmod_label = "Cross-module linking..."
|
|
270
335
|
_live_phase(xmod_label, "running")
|
|
271
336
|
xmod_pids = [pid for _, pid in modules_with_ids]
|
|
272
337
|
xmod_edges = link_cross_module_calls(
|
|
273
|
-
|
|
338
|
+
root_shard_store, project_ids=xmod_pids,
|
|
274
339
|
progress=lambda s: _live_phase(xmod_label, s),
|
|
275
340
|
)
|
|
276
341
|
_finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
|
|
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
287
352
|
comm_label = "Detecting communities..."
|
|
288
353
|
_live_phase(comm_label, "running")
|
|
289
354
|
communities = detect_communities(
|
|
290
|
-
|
|
355
|
+
root_shard_store,
|
|
291
356
|
progress=lambda s: _live_phase(comm_label, s),
|
|
292
357
|
)
|
|
293
358
|
_finish_phase(comm_label, f"{len(communities)} clusters found")
|
|
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
295
360
|
flow_label = "Detecting execution flows..."
|
|
296
361
|
_live_phase(flow_label, "running")
|
|
297
362
|
flows = trace_execution_flows(
|
|
298
|
-
|
|
363
|
+
root_shard_store,
|
|
299
364
|
progress=lambda s: _live_phase(flow_label, s),
|
|
300
365
|
)
|
|
301
366
|
_finish_phase(flow_label, f"{len(flows)} processes found")
|
|
302
367
|
|
|
303
368
|
dead_label = "Finding dead code..."
|
|
304
369
|
_live_phase(dead_label, "running")
|
|
305
|
-
dead = detect_dead_code(
|
|
370
|
+
dead = detect_dead_code(root_shard_store, limit=500)
|
|
306
371
|
_finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
|
|
307
372
|
|
|
308
373
|
coup_label = "Analyzing git history..."
|
|
309
374
|
_live_phase(coup_label, "running")
|
|
310
|
-
|
|
375
|
+
root_shard_store.clear_coupling()
|
|
311
376
|
coupling_root = abs_path
|
|
312
377
|
coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
|
|
313
378
|
coupling_pairs = compute_coupling(
|
|
314
|
-
|
|
379
|
+
root_shard_store,
|
|
315
380
|
coupling_root,
|
|
316
381
|
coupling_project,
|
|
317
382
|
days=SETTINGS.default_coupling_days,
|
|
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
329
394
|
flow_label = "Detecting execution flows..."
|
|
330
395
|
_live_phase(flow_label, "running (lightweight)")
|
|
331
396
|
try:
|
|
332
|
-
flows = trace_execution_flows(
|
|
397
|
+
flows = trace_execution_flows(root_shard_store, max_depth=3)
|
|
333
398
|
except Exception:
|
|
334
399
|
flows = []
|
|
335
400
|
_finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
|
|
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
337
402
|
dead_label = "Finding dead code..."
|
|
338
403
|
_live_phase(dead_label, "running (lightweight)")
|
|
339
404
|
try:
|
|
340
|
-
dead = detect_dead_code(
|
|
405
|
+
dead = detect_dead_code(root_shard_store, limit=100)
|
|
341
406
|
except Exception:
|
|
342
407
|
dead = []
|
|
343
408
|
_finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
|
|
344
409
|
|
|
345
410
|
_phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
|
|
346
411
|
|
|
347
|
-
vector_count =
|
|
412
|
+
vector_count = root_shard_store.query_records(
|
|
348
413
|
"""
|
|
349
414
|
MATCH (s:Symbol)
|
|
350
415
|
WHERE s.embedding IS NOT NULL
|
|
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
355
420
|
vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
|
|
356
421
|
_phase("Generating embeddings...", f"{vectors_stored} vectors stored")
|
|
357
422
|
|
|
358
|
-
symbol_count =
|
|
359
|
-
edge_count =
|
|
423
|
+
symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
|
|
424
|
+
edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
|
|
360
425
|
symbols = int(symbol_count[0]["count"]) if symbol_count else 0
|
|
361
426
|
edges = int(edge_count[0]["count"]) if edge_count else 0
|
|
362
427
|
elapsed = time.perf_counter() - started
|
|
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
376
441
|
|
|
377
442
|
# Detect unresolved imports → hint about unindexed sibling projects
|
|
378
443
|
try:
|
|
379
|
-
unresolved = JavaIndexer.detect_unresolved_imports(
|
|
444
|
+
unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
|
|
380
445
|
if unresolved:
|
|
381
446
|
click.echo()
|
|
382
447
|
click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
|
|
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
387
452
|
|
|
388
453
|
# Publish a read replica so MCP and read-only CLI commands (search, stats…)
|
|
389
454
|
# run against an isolated snapshot rather than competing with the write
|
|
390
|
-
# process's buffer pool.
|
|
391
|
-
# hot-reloads without restarting.
|
|
455
|
+
# process's buffer pool. Snapshot all open shards concurrently.
|
|
392
456
|
snap_label = "Publishing read replica..."
|
|
393
457
|
_live_phase(snap_label, "copying")
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
_finish_phase(snap_label, "MCP will reload automatically"
|
|
458
|
+
root_shard_store._recycle_conn()
|
|
459
|
+
sg.snapshot_all(background=False)
|
|
460
|
+
_finish_phase(snap_label, "MCP will reload automatically")
|
|
397
461
|
|
|
398
462
|
|
|
399
463
|
@main.command()
|
|
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
|
|
|
523
587
|
|
|
524
588
|
@main.command()
|
|
525
589
|
@click.option("--json", "as_json", is_flag=True)
|
|
526
|
-
|
|
590
|
+
@click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
|
|
591
|
+
def stats(as_json: bool, show_shards: bool) -> None:
|
|
527
592
|
"""Show per-project and aggregate graph statistics."""
|
|
528
|
-
|
|
529
|
-
|
|
593
|
+
if show_shards:
|
|
594
|
+
_show_shard_topology(as_json)
|
|
595
|
+
return
|
|
596
|
+
|
|
597
|
+
# Fan-out across all shards so stats covers every project in the cluster.
|
|
598
|
+
sg = ShardedGraphStore(read_only=True)
|
|
599
|
+
all_projects_meta = sg.list_project_metadata()
|
|
600
|
+
|
|
601
|
+
# For detailed stats we need the per-project shard store.
|
|
602
|
+
def _project_store(pid: str):
|
|
603
|
+
return sg.shard(pid)
|
|
604
|
+
|
|
530
605
|
if not projects:
|
|
531
606
|
click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
|
|
532
607
|
return
|
|
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
|
|
|
534
609
|
rows = []
|
|
535
610
|
for p in projects:
|
|
536
611
|
pid = p["id"]
|
|
537
|
-
|
|
612
|
+
# Route each query to the project's owning shard.
|
|
613
|
+
ps = _project_store(pid)
|
|
614
|
+
files = ps.query_records(
|
|
538
615
|
"MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
|
|
539
616
|
)
|
|
540
|
-
classes =
|
|
617
|
+
classes = ps.query_records(
|
|
541
618
|
"""
|
|
542
619
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
543
620
|
WITH f
|
|
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
|
|
|
546
623
|
""",
|
|
547
624
|
{"pid": pid},
|
|
548
625
|
)
|
|
549
|
-
methods =
|
|
626
|
+
methods = ps.query_records(
|
|
550
627
|
"""
|
|
551
628
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
552
629
|
WITH f
|
|
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
|
|
|
557
634
|
""",
|
|
558
635
|
{"pid": pid},
|
|
559
636
|
)
|
|
560
|
-
calls =
|
|
637
|
+
calls = ps.query_records(
|
|
561
638
|
"""
|
|
562
639
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
563
640
|
WITH f
|
|
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
|
|
|
568
645
|
""",
|
|
569
646
|
{"pid": pid},
|
|
570
647
|
)
|
|
571
|
-
emb =
|
|
648
|
+
emb = ps.query_records(
|
|
572
649
|
"""
|
|
573
650
|
MATCH (f:File) WHERE f.project_id = $pid
|
|
574
651
|
WITH f
|
|
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
|
|
|
580
657
|
rows.append({
|
|
581
658
|
"project": pid,
|
|
582
659
|
"path": p["path"],
|
|
660
|
+
"shard": sg.router.shard_for(pid),
|
|
583
661
|
"files": files[0]["n"] if files else 0,
|
|
584
662
|
"classes": classes[0]["n"] if classes else 0,
|
|
585
663
|
"methods": methods[0]["n"] if methods else 0,
|
|
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
|
|
|
592
670
|
return
|
|
593
671
|
|
|
594
672
|
col_w = max(len(r["project"]) for r in rows)
|
|
595
|
-
header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
|
|
673
|
+
header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
|
|
596
674
|
click.secho(header, fg="cyan")
|
|
597
675
|
click.echo("-" * len(header))
|
|
598
676
|
total_files = total_classes = total_methods = total_calls = total_emb = 0
|
|
599
677
|
for r in rows:
|
|
600
678
|
click.echo(
|
|
601
|
-
f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
|
|
679
|
+
f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
|
|
602
680
|
)
|
|
603
681
|
total_files += r["files"]
|
|
604
682
|
total_classes += r["classes"]
|
|
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
|
|
|
608
686
|
if len(rows) > 1:
|
|
609
687
|
click.echo("-" * len(header))
|
|
610
688
|
click.secho(
|
|
611
|
-
f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
|
|
689
|
+
f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
|
|
612
690
|
fg="green",
|
|
613
691
|
)
|
|
614
692
|
|
|
@@ -4,8 +4,17 @@ from dataclasses import dataclass
|
|
|
4
4
|
|
|
5
5
|
@dataclass(frozen=True)
|
|
6
6
|
class Settings:
|
|
7
|
+
# Legacy single-DB paths — kept for backward compat and as defaults when
|
|
8
|
+
# sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
|
|
7
9
|
db_path: str = os.path.expanduser("~/.codespine_db")
|
|
8
10
|
db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
|
|
11
|
+
|
|
12
|
+
# Sharding — new layout stores each shard under shards_dir/{N}/db
|
|
13
|
+
# num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
|
|
14
|
+
# ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
|
|
15
|
+
num_shards: int = 4
|
|
16
|
+
shards_dir: str = os.path.expanduser("~/.codespine/shards")
|
|
17
|
+
|
|
9
18
|
pid_file: str = os.path.expanduser("~/.codespine.pid")
|
|
10
19
|
log_file: str = os.path.expanduser("~/.codespine.log")
|
|
11
20
|
embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")
|
|
@@ -8,7 +8,7 @@ import shutil
|
|
|
8
8
|
import threading
|
|
9
9
|
import time
|
|
10
10
|
from contextlib import contextmanager
|
|
11
|
-
from dataclasses import dataclass
|
|
11
|
+
from dataclasses import InitVar, dataclass
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
14
|
import kuzu
|
|
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
|
|
|
39
39
|
@dataclass
|
|
40
40
|
class GraphStore:
|
|
41
41
|
read_only: bool = False
|
|
42
|
+
# Optional path overrides — when provided, the store uses these paths
|
|
43
|
+
# instead of the global SETTINGS values. The ShardedGraphStore uses
|
|
44
|
+
# this to give each shard its own isolated KùzuDB directory.
|
|
45
|
+
db_path_override: InitVar[str | None] = None
|
|
46
|
+
snapshot_path_override: InitVar[str | None] = None
|
|
47
|
+
|
|
48
|
+
def __post_init__(
|
|
49
|
+
self,
|
|
50
|
+
db_path_override: str | None,
|
|
51
|
+
snapshot_path_override: str | None,
|
|
52
|
+
) -> None:
|
|
53
|
+
# Resolve effective paths — per-shard overrides win over global SETTINGS.
|
|
54
|
+
self._db_path: str = db_path_override or SETTINGS.db_path
|
|
55
|
+
self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
|
|
56
|
+
|
|
57
|
+
# Per-instance snapshot synchronisation (not class-level) so that
|
|
58
|
+
# multiple shards can snapshot concurrently without a shared bottleneck.
|
|
59
|
+
self._inst_snapshot_lock: threading.Lock = threading.Lock()
|
|
60
|
+
self._inst_snapshot_pending: threading.Event = threading.Event()
|
|
42
61
|
|
|
43
|
-
def __post_init__(self) -> None:
|
|
44
62
|
self._tls: threading.local = threading.local()
|
|
45
63
|
from codespine.overlay.store import OverlayStore
|
|
46
64
|
|
|
@@ -48,10 +66,10 @@ class GraphStore:
|
|
|
48
66
|
|
|
49
67
|
# Read-only callers (MCP, CLI reads) use the read replica when available.
|
|
50
68
|
# This isolates them from the write process's buffer pool and WAL churn.
|
|
51
|
-
if self.read_only and os.path.exists(
|
|
52
|
-
db_path =
|
|
69
|
+
if self.read_only and os.path.exists(self._snapshot_path):
|
|
70
|
+
db_path = self._snapshot_path
|
|
53
71
|
else:
|
|
54
|
-
db_path =
|
|
72
|
+
db_path = self._db_path
|
|
55
73
|
|
|
56
74
|
try:
|
|
57
75
|
self.db = self._open_with_recovery(db_path)
|
|
@@ -97,7 +115,7 @@ class GraphStore:
|
|
|
97
115
|
try:
|
|
98
116
|
ensure_schema(self._conn())
|
|
99
117
|
except Exception as exc:
|
|
100
|
-
path = getattr(self.db, "database_path",
|
|
118
|
+
path = getattr(self.db, "database_path", self._db_path)
|
|
101
119
|
if not self._is_recoverable_db_error(exc):
|
|
102
120
|
raise
|
|
103
121
|
LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
|
|
@@ -527,15 +545,27 @@ class GraphStore:
|
|
|
527
545
|
rows = [{"source_id": r["source_id"], "target_id": r["target_id"],
|
|
528
546
|
"confidence": float(r["confidence"]), "reason": r["reason"]}
|
|
529
547
|
for r in records]
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
548
|
+
if create_mode:
|
|
549
|
+
self.execute(
|
|
550
|
+
"""
|
|
551
|
+
UNWIND $rows AS row
|
|
552
|
+
MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
|
|
553
|
+
CREATE (src)-[:CALLS {confidence: row.confidence, reason: row.reason}]->(dst)
|
|
554
|
+
""",
|
|
555
|
+
{"rows": rows},
|
|
556
|
+
)
|
|
557
|
+
else:
|
|
558
|
+
# Properties are SET, not part of the MERGE pattern — ensures at most
|
|
559
|
+
# one CALLS edge per (src, dst) pair regardless of confidence value.
|
|
560
|
+
self.execute(
|
|
561
|
+
"""
|
|
562
|
+
UNWIND $rows AS row
|
|
563
|
+
MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
|
|
564
|
+
MERGE (src)-[r:CALLS]->(dst)
|
|
565
|
+
SET r.confidence = row.confidence, r.reason = row.reason
|
|
566
|
+
""",
|
|
567
|
+
{"rows": rows},
|
|
568
|
+
)
|
|
539
569
|
|
|
540
570
|
def add_reference(self, rel: str, src_label: str, src_id: str, dst_label: str, dst_id: str, confidence: float) -> None:
|
|
541
571
|
if rel not in {"REFERENCES_TYPE", "IMPLEMENTS", "OVERRIDES"}:
|
|
@@ -756,8 +786,7 @@ class GraphStore:
|
|
|
756
786
|
self.clear_flows()
|
|
757
787
|
self.clear_coupling()
|
|
758
788
|
|
|
759
|
-
|
|
760
|
-
def force_delete_all_data() -> list[str]:
|
|
789
|
+
def force_delete_all_data(self) -> list[str]:
|
|
761
790
|
"""Delete all CodeSpine data files without touching the Kuzu engine.
|
|
762
791
|
|
|
763
792
|
This is the nuclear option for OOM recovery: when the buffer pool is
|
|
@@ -767,12 +796,14 @@ class GraphStore:
|
|
|
767
796
|
|
|
768
797
|
Returns the list of paths that were removed.
|
|
769
798
|
"""
|
|
799
|
+
db_path = self._db_path
|
|
800
|
+
snapshot_path = self._snapshot_path
|
|
770
801
|
removed: list[str] = []
|
|
771
802
|
for path in [
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
803
|
+
db_path,
|
|
804
|
+
snapshot_path,
|
|
805
|
+
snapshot_path + ".updated",
|
|
806
|
+
snapshot_path + ".tmp",
|
|
776
807
|
SETTINGS.embedding_cache_path,
|
|
777
808
|
SETTINGS.overlay_dir,
|
|
778
809
|
SETTINGS.index_meta_dir,
|
|
@@ -789,7 +820,7 @@ class GraphStore:
|
|
|
789
820
|
pass
|
|
790
821
|
# Also remove any stale WAL files next to the DB
|
|
791
822
|
for suffix in (".wal", ".lock"):
|
|
792
|
-
wal_path =
|
|
823
|
+
wal_path = db_path + suffix
|
|
793
824
|
if os.path.exists(wal_path):
|
|
794
825
|
try:
|
|
795
826
|
os.remove(wal_path)
|
|
@@ -800,7 +831,7 @@ class GraphStore:
|
|
|
800
831
|
|
|
801
832
|
def rebuild_empty_db(self) -> None:
|
|
802
833
|
self._recycle_conn()
|
|
803
|
-
path =
|
|
834
|
+
path = self._db_path
|
|
804
835
|
# Remove the DB directory AND any stale WAL / lock files
|
|
805
836
|
self._remove_db_path(path)
|
|
806
837
|
for suffix in (".wal", ".lock"):
|
|
@@ -813,11 +844,8 @@ class GraphStore:
|
|
|
813
844
|
|
|
814
845
|
# Also remove the read replica so that read-only callers (stats, MCP)
|
|
815
846
|
# don't continue to see stale data from before the wipe.
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
SETTINGS.db_snapshot_path + ".tmp",
|
|
819
|
-
SETTINGS.db_snapshot_path + ".updated",
|
|
820
|
-
]:
|
|
847
|
+
snap = self._snapshot_path
|
|
848
|
+
for stale in [snap, snap + ".tmp", snap + ".updated"]:
|
|
821
849
|
self._remove_db_path(stale)
|
|
822
850
|
|
|
823
851
|
# Kuzu may retain stale internal state from a previous failed open of
|
|
@@ -914,18 +942,15 @@ class GraphStore:
|
|
|
914
942
|
},
|
|
915
943
|
)
|
|
916
944
|
|
|
917
|
-
|
|
918
|
-
# Only one snapshot runs at a time; a pending request supersedes queued ones.
|
|
919
|
-
_snapshot_lock: threading.Lock = threading.Lock()
|
|
920
|
-
_snapshot_pending: threading.Event = threading.Event()
|
|
921
|
-
|
|
922
|
-
@staticmethod
|
|
923
|
-
def snapshot_to_read_replica(background: bool = False) -> bool:
|
|
945
|
+
def snapshot_to_read_replica(self, background: bool = False) -> bool:
|
|
924
946
|
"""Atomically copy the write DB to the read-replica path.
|
|
925
947
|
|
|
926
948
|
The read replica is used by the MCP daemon and all read-only CLI
|
|
927
949
|
commands so they never contend with the write process's buffer pool.
|
|
928
950
|
|
|
951
|
+
Each GraphStore instance has its own snapshot lock so that multiple
|
|
952
|
+
shards can snapshot concurrently without serialising on a class lock.
|
|
953
|
+
|
|
929
954
|
Parameters
|
|
930
955
|
----------
|
|
931
956
|
background:
|
|
@@ -938,36 +963,38 @@ class GraphStore:
|
|
|
938
963
|
Returns True on success (or when dispatched to background), False if
|
|
939
964
|
the source DB does not exist.
|
|
940
965
|
"""
|
|
941
|
-
src =
|
|
966
|
+
src = self._db_path
|
|
942
967
|
if not os.path.exists(src):
|
|
943
968
|
return False
|
|
944
969
|
|
|
945
970
|
if background:
|
|
946
971
|
# Signal that a snapshot is wanted, then ensure a worker is running.
|
|
947
|
-
|
|
972
|
+
self._inst_snapshot_pending.set()
|
|
973
|
+
inst = self # capture for closure
|
|
948
974
|
|
|
949
975
|
def _worker() -> None:
|
|
950
|
-
while
|
|
951
|
-
|
|
952
|
-
with
|
|
953
|
-
|
|
976
|
+
while inst._inst_snapshot_pending.is_set():
|
|
977
|
+
inst._inst_snapshot_pending.clear()
|
|
978
|
+
with inst._inst_snapshot_lock:
|
|
979
|
+
inst._do_snapshot()
|
|
954
980
|
|
|
955
|
-
if not
|
|
981
|
+
if not self._inst_snapshot_lock.locked():
|
|
956
982
|
t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
|
|
957
983
|
t.start()
|
|
958
984
|
return True
|
|
959
985
|
|
|
960
986
|
# Foreground (blocking) path — used by CLI analyse and tests.
|
|
961
|
-
with
|
|
962
|
-
return
|
|
987
|
+
with self._inst_snapshot_lock:
|
|
988
|
+
return self._do_snapshot()
|
|
963
989
|
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
dst = SETTINGS.db_snapshot_path
|
|
990
|
+
def _do_snapshot(self) -> bool:
|
|
991
|
+
"""Perform the actual copy. Must be called with the instance snapshot lock held."""
|
|
992
|
+
src = self._db_path
|
|
993
|
+
dst = self._snapshot_path
|
|
969
994
|
if not os.path.exists(src):
|
|
970
995
|
return False
|
|
996
|
+
# Ensure the parent directory for the replica exists (shards layout).
|
|
997
|
+
os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
|
|
971
998
|
tmp = dst + ".tmp"
|
|
972
999
|
try:
|
|
973
1000
|
if os.path.exists(tmp):
|
|
@@ -975,7 +1002,6 @@ class GraphStore:
|
|
|
975
1002
|
if os.path.isdir(src):
|
|
976
1003
|
shutil.copytree(src, tmp)
|
|
977
1004
|
else:
|
|
978
|
-
os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
|
|
979
1005
|
shutil.copy2(src, tmp)
|
|
980
1006
|
if os.path.exists(dst):
|
|
981
1007
|
shutil.rmtree(dst, ignore_errors=True)
|
|
@@ -5,7 +5,7 @@ from typing import Iterator
|
|
|
5
5
|
|
|
6
6
|
from codespine.noise.blocklist import MIN_FUZZY_NAME_LEN, NOISE_METHOD_NAMES
|
|
7
7
|
|
|
8
|
-
MAX_FUZZY_TARGETS = 12
|
|
8
|
+
MAX_FUZZY_TARGETS = 6 # reduced from 12 — keeps precision, halves low-confidence edge fan-out
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
def _simple_type_name(type_name: str | None) -> str:
|
|
@@ -221,7 +221,7 @@ class JavaIndexer:
|
|
|
221
221
|
calls_resolved = 0
|
|
222
222
|
type_relationships = 0
|
|
223
223
|
file_batch_size = max(1, int(getattr(SETTINGS, "index_file_batch_size", 64)))
|
|
224
|
-
edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size",
|
|
224
|
+
edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size", 5000)))
|
|
225
225
|
|
|
226
226
|
if not full:
|
|
227
227
|
method_catalog, class_catalog, fqcn_to_class_ids, class_methods = (
|
|
@@ -480,22 +480,36 @@ class JavaIndexer:
|
|
|
480
480
|
self.store._recycle_conn()
|
|
481
481
|
|
|
482
482
|
self._emit(progress, "resolve_calls_start")
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
483
|
+
# Deduplicate (src, dst) pairs — the same pair can appear many times when
|
|
484
|
+
# a method calls another method multiple times at different call sites.
|
|
485
|
+
# Keep the highest-confidence resolution to avoid N writes per pair.
|
|
486
|
+
best_calls: dict[tuple[str, str], tuple[float, str]] = {}
|
|
487
|
+
for src, dst, confidence, reason in resolve_calls(
|
|
488
|
+
method_catalog, method_calls, method_context, class_catalog
|
|
489
|
+
):
|
|
490
|
+
key = (src, dst)
|
|
491
|
+
if key not in best_calls or confidence > best_calls[key][0]:
|
|
492
|
+
best_calls[key] = (confidence, reason)
|
|
493
|
+
|
|
494
|
+
# Stream writes in batches — never hold the full set in RAM.
|
|
495
|
+
call_buf: list[dict] = []
|
|
496
|
+
for (src, dst), (confidence, reason) in best_calls.items():
|
|
497
|
+
call_buf.append(
|
|
498
|
+
{"source_id": src, "target_id": dst,
|
|
499
|
+
"confidence": confidence, "reason": reason}
|
|
492
500
|
)
|
|
493
|
-
|
|
501
|
+
if len(call_buf) >= edge_batch_size:
|
|
502
|
+
with self.store.transaction():
|
|
503
|
+
self.store.add_calls_batch(call_buf)
|
|
504
|
+
calls_resolved += len(call_buf)
|
|
505
|
+
self.store._recycle_conn()
|
|
506
|
+
self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
|
|
507
|
+
call_buf = []
|
|
508
|
+
if call_buf:
|
|
494
509
|
with self.store.transaction():
|
|
495
|
-
self.store.add_calls_batch(
|
|
496
|
-
calls_resolved += len(
|
|
510
|
+
self.store.add_calls_batch(call_buf)
|
|
511
|
+
calls_resolved += len(call_buf)
|
|
497
512
|
self.store._recycle_conn()
|
|
498
|
-
self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
|
|
499
513
|
self._emit(progress, "resolve_calls_done", calls_resolved=calls_resolved)
|
|
500
514
|
|
|
501
515
|
self._emit(progress, "resolve_types_start")
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""CodeSpine sharding package.
|
|
2
|
+
|
|
3
|
+
Exposes the consistent-hash router and the ShardedGraphStore facade.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from codespine.sharding.router import ShardRouter
|
|
7
|
+
from codespine.sharding.store import ShardedGraphStore
|
|
8
|
+
|
|
9
|
+
__all__ = ["ShardRouter", "ShardedGraphStore"]
|