PyPI - codespine - Versions diffs - 0.9.5__tar.gz → 0.9.6__tar.gz - Mend

codespine 0.9.5__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{codespine-0.9.5 → codespine-0.9.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codespine
-Version: 0.9.5
+Version: 0.9.6
 Summary: Local Java code intelligence indexer backed by a graph database
 Author: CodeSpine contributors
 License: MIT License

{codespine-0.9.5 → codespine-0.9.6}/codespine/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 """CodeSpine package."""
 __all__ = ["__version__"]
-__version__ = "0.9.5"
+__version__ = "0.9.6"

{codespine-0.9.5 → codespine-0.9.6}/codespine/cli.py RENAMED Viewed

@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
 from codespine.analysis.impact import analyze_impact
 from codespine.config import SETTINGS
 from codespine.db.store import GraphStore
+from codespine.sharding import ShardedGraphStore, ShardRouter
 from codespine.diff.branch_diff import compare_branches
 from codespine.indexer.engine import JavaIndexer
 from codespine.mcp.server import build_mcp_server
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
     return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
+def _show_shard_topology(as_json: bool) -> None:
+    """Display the current shard routing topology and imbalance metrics."""
+    router = ShardRouter()
+    sg = ShardedGraphStore(read_only=True)
+    topology = sg.describe()
+    # Gather project → shard mapping from all shards.
+    shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
+    for p in sg.list_project_metadata():
+        pid = p.get("id", "")
+        idx = router.shard_for(pid)
+        shard_project_counts[idx].append(pid)
+    counts = [len(v) for v in shard_project_counts.values()]
+    total = sum(counts)
+    median = sorted(counts)[len(counts) // 2] if counts else 0
+    max_count = max(counts) if counts else 0
+    imbalance = (max_count / median) if median else 1.0
+    if as_json:
+        _echo_json({
+            "topology": topology,
+            "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
+            "imbalance_ratio": round(imbalance, 2),
+        }, as_json=True)
+        return
+    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
+    click.echo(f"  Directory : {router.shards_dir}")
+    click.echo(f"  Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {150})")
+    click.echo(f"  Projects  : {total} total, imbalance ratio {imbalance:.2f}x")
+    click.echo()
+    header = f"{'Shard':>6}  {'Projects':>9}  {'DB exists':>10}  Path"
+    click.secho(header, fg="cyan")
+    click.echo("-" * 60)
+    for i, info in enumerate(topology.get("shards", [])):
+        plist = shard_project_counts.get(i, [])
+        exists_str = "yes" if info.get("exists") else "no"
+        click.echo(f"{i:>6}  {len(plist):>9}  {exists_str:>10}  {info.get('db_path', '')}")
+        for pid in plist:
+            click.echo(f"{'':>6}  {'':>9}  {'':>10}    {pid}")
+    if imbalance > 2.0:
+        click.secho(
+            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
+            fg="yellow",
+        )
 @click.group()
 def main() -> None:
     """CodeSpine CLI."""
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
                 fg="yellow",
             )
-    store = GraphStore(read_only=False)
-    indexer = JavaIndexer(store)
+    # ShardedGraphStore routes each project to its dedicated DB shard.
+    # For single-project analysis this is transparent — shard() always
+    # returns a GraphStore pointing to the correct shard path.
+    sg = ShardedGraphStore(read_only=False)
+    # The indexer is initialised per-module below with the right shard store.
+    # We keep a single ShardedGraphStore to fan-out cross-module linking later.
     # --- Workspace → project → module detection ---
     # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     last_result = None
     for idx, (module_path, project_id) in enumerate(modules_with_ids):
         if is_multi:
+            shard_idx = sg.router.shard_for(project_id)
             click.echo()
-            click.secho(f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}", fg="cyan")
+            click.secho(
+                f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}  (shard {shard_idx})",
+                fg="cyan",
+            )
         _reset_state()
+        # Use the shard store for this project so data lands in the right DB.
+        shard_store = sg.shard(project_id)
+        indexer = JavaIndexer(shard_store)
         last_result = indexer.index_project(
             module_path, full=full, progress=_progress, project_id=project_id, embed=embed
         )
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         """Finalise an in-place phase line and move to the next line."""
         click.echo(f"\r✓ {label:<28} {result:<48}")
+    # For cross-module operations (cross-module linking, deep analysis, stats)
+    # we use the shard store for the root project (all modules share one shard).
+    root_project_id = last_result.project_id if last_result else root_basename
+    root_shard_store = sg.shard(root_project_id)
     # ── Cross-module call linking ──────────────────────────────────────
     if is_multi and len(modules_with_ids) > 1:
         xmod_label = "Cross-module linking..."
         _live_phase(xmod_label, "running")
         xmod_pids = [pid for _, pid in modules_with_ids]
         xmod_edges = link_cross_module_calls(
-            store, project_ids=xmod_pids,
+            root_shard_store, project_ids=xmod_pids,
             progress=lambda s: _live_phase(xmod_label, s),
         )
         _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         comm_label = "Detecting communities..."
         _live_phase(comm_label, "running")
         communities = detect_communities(
-            store,
+            root_shard_store,
             progress=lambda s: _live_phase(comm_label, s),
         )
         _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         flow_label = "Detecting execution flows..."
         _live_phase(flow_label, "running")
         flows = trace_execution_flows(
-            store,
+            root_shard_store,
             progress=lambda s: _live_phase(flow_label, s),
         )
         _finish_phase(flow_label, f"{len(flows)} processes found")
         dead_label = "Finding dead code..."
         _live_phase(dead_label, "running")
-        dead = detect_dead_code(store, limit=500)
+        dead = detect_dead_code(root_shard_store, limit=500)
         _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
         coup_label = "Analyzing git history..."
         _live_phase(coup_label, "running")
-        store.clear_coupling()
+        root_shard_store.clear_coupling()
         coupling_root = abs_path
         coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
         coupling_pairs = compute_coupling(
-            store,
+            root_shard_store,
             coupling_root,
             coupling_project,
             days=SETTINGS.default_coupling_days,
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         flow_label = "Detecting execution flows..."
         _live_phase(flow_label, "running (lightweight)")
         try:
-            flows = trace_execution_flows(store, max_depth=3)
+            flows = trace_execution_flows(root_shard_store, max_depth=3)
         except Exception:
             flows = []
         _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         dead_label = "Finding dead code..."
         _live_phase(dead_label, "running (lightweight)")
         try:
-            dead = detect_dead_code(store, limit=100)
+            dead = detect_dead_code(root_shard_store, limit=100)
         except Exception:
             dead = []
         _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
         _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
-    vector_count = store.query_records(
+    vector_count = root_shard_store.query_records(
         """
         MATCH (s:Symbol)
         WHERE s.embedding IS NOT NULL
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
     _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
-    symbol_count = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
-    edge_count = store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
+    symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
+    edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
     symbols = int(symbol_count[0]["count"]) if symbol_count else 0
     edges = int(edge_count[0]["count"]) if edge_count else 0
     elapsed = time.perf_counter() - started
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     # Detect unresolved imports → hint about unindexed sibling projects
     try:
-        unresolved = JavaIndexer.detect_unresolved_imports(store)
+        unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
         if unresolved:
             click.echo()
             click.secho("⚠  Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     # Publish a read replica so MCP and read-only CLI commands (search, stats…)
     # run against an isolated snapshot rather than competing with the write
-    # process's buffer pool.  The MCP daemon detects the sentinel file and
-    # hot-reloads without restarting.
+    # process's buffer pool.  Snapshot all open shards concurrently.
     snap_label = "Publishing read replica..."
     _live_phase(snap_label, "copying")
-    store._recycle_conn()
-    snapped = GraphStore.snapshot_to_read_replica()
-    _finish_phase(snap_label, "MCP will reload automatically" if snapped else "skipped (source DB not found)")
+    root_shard_store._recycle_conn()
+    sg.snapshot_all(background=False)
+    _finish_phase(snap_label, "MCP will reload automatically")
 @main.command()
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
 @main.command()
 @click.option("--json", "as_json", is_flag=True)
-def stats(as_json: bool) -> None:
+@click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
+def stats(as_json: bool, show_shards: bool) -> None:
     """Show per-project and aggregate graph statistics."""
-    store = GraphStore(read_only=True)
-    projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path ORDER BY p.id")
+    if show_shards:
+        _show_shard_topology(as_json)
+        return
+    # Fan-out across all shards so stats covers every project in the cluster.
+    sg = ShardedGraphStore(read_only=True)
+    all_projects_meta = sg.list_project_metadata()
+    # For detailed stats we need the per-project shard store.
+    def _project_store(pid: str):
+        return sg.shard(pid)
     if not projects:
         click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
         return
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
     rows = []
     for p in projects:
         pid = p["id"]
-        files = store.query_records(
+        # Route each query to the project's owning shard.
+        ps = _project_store(pid)
+        files = ps.query_records(
             "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
         )
-        classes = store.query_records(
+        classes = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        methods = store.query_records(
+        methods = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        calls = store.query_records(
+        calls = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        emb = store.query_records(
+        emb = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
         rows.append({
             "project": pid,
             "path": p["path"],
+            "shard": sg.router.shard_for(pid),
             "files": files[0]["n"] if files else 0,
             "classes": classes[0]["n"] if classes else 0,
             "methods": methods[0]["n"] if methods else 0,
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
         return
     col_w = max(len(r["project"]) for r in rows)
-    header = f"{'Project':<{col_w}}  {'Files':>6}  {'Classes':>8}  {'Methods':>8}  {'Calls':>7}  {'Emb':>6}  Path"
+    header = f"{'Project':<{col_w}}  {'Shard':>5}  {'Files':>6}  {'Classes':>8}  {'Methods':>8}  {'Calls':>7}  {'Emb':>6}  Path"
     click.secho(header, fg="cyan")
     click.echo("-" * len(header))
     total_files = total_classes = total_methods = total_calls = total_emb = 0
     for r in rows:
         click.echo(
-            f"{r['project']:<{col_w}}  {r['files']:>6}  {r['classes']:>8}  {r['methods']:>8}  {r['calls_out']:>7}  {r['embeddings']:>6}  {r['path']}"
+            f"{r['project']:<{col_w}}  {r.get('shard', 0):>5}  {r['files']:>6}  {r['classes']:>8}  {r['methods']:>8}  {r['calls_out']:>7}  {r['embeddings']:>6}  {r['path']}"
         )
         total_files += r["files"]
         total_classes += r["classes"]
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
     if len(rows) > 1:
         click.echo("-" * len(header))
         click.secho(
-            f"{'TOTAL':<{col_w}}  {total_files:>6}  {total_classes:>8}  {total_methods:>8}  {total_calls:>7}  {total_emb:>6}",
+            f"{'TOTAL':<{col_w}}  {'':>5}  {total_files:>6}  {total_classes:>8}  {total_methods:>8}  {total_calls:>7}  {total_emb:>6}",
             fg="green",
         )

{codespine-0.9.5 → codespine-0.9.6}/codespine/config.py RENAMED Viewed

@@ -4,8 +4,17 @@ from dataclasses import dataclass
 @dataclass(frozen=True)
 class Settings:
+    # Legacy single-DB paths — kept for backward compat and as defaults when
+    # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
     db_path: str = os.path.expanduser("~/.codespine_db")
     db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
+    # Sharding — new layout stores each shard under shards_dir/{N}/db
+    # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
+    # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
+    num_shards: int = 4
+    shards_dir: str = os.path.expanduser("~/.codespine/shards")
     pid_file: str = os.path.expanduser("~/.codespine.pid")
     log_file: str = os.path.expanduser("~/.codespine.log")
     embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")

{codespine-0.9.5 → codespine-0.9.6}/codespine/db/store.py RENAMED Viewed

@@ -8,7 +8,7 @@ import shutil
 import threading
 import time
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import InitVar, dataclass
 from typing import Any
 import kuzu
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
 @dataclass
 class GraphStore:
     read_only: bool = False
+    # Optional path overrides — when provided, the store uses these paths
+    # instead of the global SETTINGS values.  The ShardedGraphStore uses
+    # this to give each shard its own isolated KùzuDB directory.
+    db_path_override: InitVar[str | None] = None
+    snapshot_path_override: InitVar[str | None] = None
+    def __post_init__(
+        self,
+        db_path_override: str | None,
+        snapshot_path_override: str | None,
+    ) -> None:
+        # Resolve effective paths — per-shard overrides win over global SETTINGS.
+        self._db_path: str = db_path_override or SETTINGS.db_path
+        self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
+        # Per-instance snapshot synchronisation (not class-level) so that
+        # multiple shards can snapshot concurrently without a shared bottleneck.
+        self._inst_snapshot_lock: threading.Lock = threading.Lock()
+        self._inst_snapshot_pending: threading.Event = threading.Event()
-    def __post_init__(self) -> None:
         self._tls: threading.local = threading.local()
         from codespine.overlay.store import OverlayStore
@@ -48,10 +66,10 @@ class GraphStore:
         # Read-only callers (MCP, CLI reads) use the read replica when available.
         # This isolates them from the write process's buffer pool and WAL churn.
-        if self.read_only and os.path.exists(SETTINGS.db_snapshot_path):
-            db_path = SETTINGS.db_snapshot_path
+        if self.read_only and os.path.exists(self._snapshot_path):
+            db_path = self._snapshot_path
         else:
-            db_path = SETTINGS.db_path
+            db_path = self._db_path
         try:
             self.db = self._open_with_recovery(db_path)
@@ -97,7 +115,7 @@ class GraphStore:
         try:
             ensure_schema(self._conn())
         except Exception as exc:
-            path = getattr(self.db, "database_path", SETTINGS.db_path)
+            path = getattr(self.db, "database_path", self._db_path)
             if not self._is_recoverable_db_error(exc):
                 raise
             LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
@@ -768,8 +786,7 @@ class GraphStore:
         self.clear_flows()
         self.clear_coupling()
-    @staticmethod
-    def force_delete_all_data() -> list[str]:
+    def force_delete_all_data(self) -> list[str]:
         """Delete all CodeSpine data files without touching the Kuzu engine.
         This is the nuclear option for OOM recovery: when the buffer pool is
@@ -779,12 +796,14 @@ class GraphStore:
         Returns the list of paths that were removed.
         """
+        db_path = self._db_path
+        snapshot_path = self._snapshot_path
         removed: list[str] = []
         for path in [
-            SETTINGS.db_path,
-            SETTINGS.db_snapshot_path,
-            SETTINGS.db_snapshot_path + ".updated",
-            SETTINGS.db_snapshot_path + ".tmp",
+            db_path,
+            snapshot_path,
+            snapshot_path + ".updated",
+            snapshot_path + ".tmp",
             SETTINGS.embedding_cache_path,
             SETTINGS.overlay_dir,
             SETTINGS.index_meta_dir,
@@ -801,7 +820,7 @@ class GraphStore:
                 pass
         # Also remove any stale WAL files next to the DB
         for suffix in (".wal", ".lock"):
-            wal_path = SETTINGS.db_path + suffix
+            wal_path = db_path + suffix
             if os.path.exists(wal_path):
                 try:
                     os.remove(wal_path)
@@ -812,7 +831,7 @@ class GraphStore:
     def rebuild_empty_db(self) -> None:
         self._recycle_conn()
-        path = SETTINGS.db_path
+        path = self._db_path
         # Remove the DB directory AND any stale WAL / lock files
         self._remove_db_path(path)
         for suffix in (".wal", ".lock"):
@@ -825,11 +844,8 @@ class GraphStore:
         # Also remove the read replica so that read-only callers (stats, MCP)
         # don't continue to see stale data from before the wipe.
-        for stale in [
-            SETTINGS.db_snapshot_path,
-            SETTINGS.db_snapshot_path + ".tmp",
-            SETTINGS.db_snapshot_path + ".updated",
-        ]:
+        snap = self._snapshot_path
+        for stale in [snap, snap + ".tmp", snap + ".updated"]:
             self._remove_db_path(stale)
         # Kuzu may retain stale internal state from a previous failed open of
@@ -926,18 +942,15 @@ class GraphStore:
             },
         )
-    # Lock and flag for background snapshot coalescing.
-    # Only one snapshot runs at a time; a pending request supersedes queued ones.
-    _snapshot_lock: threading.Lock = threading.Lock()
-    _snapshot_pending: threading.Event = threading.Event()
-    @staticmethod
-    def snapshot_to_read_replica(background: bool = False) -> bool:
+    def snapshot_to_read_replica(self, background: bool = False) -> bool:
         """Atomically copy the write DB to the read-replica path.
         The read replica is used by the MCP daemon and all read-only CLI
         commands so they never contend with the write process's buffer pool.
+        Each GraphStore instance has its own snapshot lock so that multiple
+        shards can snapshot concurrently without serialising on a class lock.
         Parameters
         ----------
         background:
@@ -950,36 +963,38 @@ class GraphStore:
         Returns True on success (or when dispatched to background), False if
         the source DB does not exist.
         """
-        src = SETTINGS.db_path
+        src = self._db_path
         if not os.path.exists(src):
             return False
         if background:
             # Signal that a snapshot is wanted, then ensure a worker is running.
-            GraphStore._snapshot_pending.set()
+            self._inst_snapshot_pending.set()
+            inst = self  # capture for closure
             def _worker() -> None:
-                while GraphStore._snapshot_pending.is_set():
-                    GraphStore._snapshot_pending.clear()
-                    with GraphStore._snapshot_lock:
-                        GraphStore._do_snapshot()
+                while inst._inst_snapshot_pending.is_set():
+                    inst._inst_snapshot_pending.clear()
+                    with inst._inst_snapshot_lock:
+                        inst._do_snapshot()
-            if not GraphStore._snapshot_lock.locked():
+            if not self._inst_snapshot_lock.locked():
                 t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
                 t.start()
             return True
         # Foreground (blocking) path — used by CLI analyse and tests.
-        with GraphStore._snapshot_lock:
-            return GraphStore._do_snapshot()
+        with self._inst_snapshot_lock:
+            return self._do_snapshot()
-    @staticmethod
-    def _do_snapshot() -> bool:
-        """Perform the actual copy.  Must be called with _snapshot_lock held."""
-        src = SETTINGS.db_path
-        dst = SETTINGS.db_snapshot_path
+    def _do_snapshot(self) -> bool:
+        """Perform the actual copy.  Must be called with the instance snapshot lock held."""
+        src = self._db_path
+        dst = self._snapshot_path
         if not os.path.exists(src):
             return False
+        # Ensure the parent directory for the replica exists (shards layout).
+        os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
         tmp = dst + ".tmp"
         try:
             if os.path.exists(tmp):
@@ -987,7 +1002,6 @@ class GraphStore:
             if os.path.isdir(src):
                 shutil.copytree(src, tmp)
             else:
-                os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
                 shutil.copy2(src, tmp)
             if os.path.exists(dst):
                 shutil.rmtree(dst, ignore_errors=True)

codespine-0.9.6/codespine/sharding/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""CodeSpine sharding package.
+Exposes the consistent-hash router and the ShardedGraphStore facade.
+"""
+from codespine.sharding.router import ShardRouter
+from codespine.sharding.store import ShardedGraphStore
+__all__ = ["ShardRouter", "ShardedGraphStore"]

codespine-0.9.6/codespine/sharding/router.py ADDED Viewed

@@ -0,0 +1,123 @@
+"""Consistent-hash shard router for CodeSpine.
+Design
+------
+* ``num_shards`` physical shards — each shard owns an independent KùzuDB at
+  ``~/.codespine/shards/{N}/db``.
+* Shard key = *root project name* (the part before ``::`` in a multi-module
+  project ID).  This guarantees that all modules of the same project are
+  co-located in the same shard so that cross-module call resolution still
+  works in one graph traversal.
+* Virtual-node ring (``VIRTUAL_NODES_PER_SHARD = 150``) gives an even
+  distribution even for small shard counts.
+* ``num_shards`` can be changed at any time; affected projects must be
+  re-indexed, but unaffected projects continue to work.
+Env var override
+----------------
+``CODESPINE_SHARDS=N`` (integer, default 4) sets the number of shards at
+process start.  0 or 1 disables sharding (all projects land in shard 0).
+"""
+from __future__ import annotations
+import bisect
+import hashlib
+import os
+VIRTUAL_NODES_PER_SHARD = 150  # virtual ring entries per physical shard
+class ShardRouter:
+    """Maps project IDs to shard indices via a consistent-hash ring.
+    Parameters
+    ----------
+    num_shards:
+        Number of physical shards.  Defaults to the ``CODESPINE_SHARDS``
+        environment variable, or ``4`` if unset.
+    shards_dir:
+        Base directory that holds per-shard sub-directories.
+    """
+    def __init__(
+        self,
+        num_shards: int | None = None,
+        shards_dir: str | None = None,
+    ) -> None:
+        _env = os.environ.get("CODESPINE_SHARDS", "").strip()
+        _default = max(1, int(_env)) if _env.isdigit() else 4
+        self.num_shards: int = max(1, num_shards if num_shards is not None else _default)
+        self.shards_dir: str = shards_dir or os.path.expanduser("~/.codespine/shards")
+        # Build virtual-node ring: list of (ring_point, shard_index) sorted by ring_point
+        self._ring: list[tuple[int, int]] = []
+        for shard_idx in range(self.num_shards):
+            for vn in range(VIRTUAL_NODES_PER_SHARD):
+                point = self._hash_key(f"shard-{shard_idx}-vn-{vn}")
+                self._ring.append((point, shard_idx))
+        self._ring.sort()
+        self._ring_points = [p for p, _ in self._ring]
+    # ------------------------------------------------------------------
+    # Routing
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _hash_key(key: str) -> int:
+        """Deterministic 64-bit hash of a string."""
+        raw = hashlib.md5(key.encode("utf-8")).digest()
+        # Use first 8 bytes as unsigned 64-bit integer for wide ring range.
+        return int.from_bytes(raw[:8], "big")
+    def _root_key(self, project_id: str) -> str:
+        """Extract the root portion of a project_id for co-location.
+        For multi-module projects (format ``root::module``), all modules of
+        the same root must land on the same shard so that cross-module graph
+        traversals work without federation.
+        """
+        return project_id.split("::")[0] if "::" in project_id else project_id
+    def shard_for(self, project_id: str) -> int:
+        """Return the shard index [0, num_shards) for the given project_id."""
+        if self.num_shards == 1:
+            return 0
+        point = self._hash_key(self._root_key(project_id))
+        pos = bisect.bisect_left(self._ring_points, point)
+        # Wrap around the ring
+        _, shard_idx = self._ring[pos % len(self._ring)]
+        return shard_idx
+    def all_shards(self) -> list[int]:
+        """Return all shard indices."""
+        return list(range(self.num_shards))
+    # ------------------------------------------------------------------
+    # Path helpers
+    # ------------------------------------------------------------------
+    def db_path(self, shard_index: int) -> str:
+        """Absolute write-DB path for a shard."""
+        return os.path.join(self.shards_dir, str(shard_index), "db")
+    def snapshot_path(self, shard_index: int) -> str:
+        """Absolute read-replica path for a shard."""
+        return os.path.join(self.shards_dir, str(shard_index), "db_read")
+    def shard_home(self, shard_index: int) -> str:
+        """Directory that holds all data for a shard."""
+        return os.path.join(self.shards_dir, str(shard_index))
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+    def describe(self) -> dict:
+        """Return a human-readable summary of the routing table."""
+        return {
+            "num_shards": self.num_shards,
+            "shards_dir": self.shards_dir,
+            "virtual_nodes_per_shard": VIRTUAL_NODES_PER_SHARD,
+            "ring_size": len(self._ring),
+        }

codespine-0.9.6/codespine/sharding/store.py ADDED Viewed

@@ -0,0 +1,312 @@
+"""ShardedGraphStore — in-process shard coordinator.
+Each project (or multi-module root) is consistently hashed to a shard index.
+All modules of the same project share one shard so that cross-module graph
+traversals see the full call graph without federation.
+Design
+------
+* ``ShardedGraphStore`` maintains a pool of ``GraphStore`` instances, one per
+  shard opened so far.  Shards are opened lazily on first access.
+* Existing callers that receive a plain ``GraphStore`` continue to work
+  unchanged.  The new entry point is ``ShardedGraphStore.shard(project_id)``
+  which returns the ``GraphStore`` responsible for that project.
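+* Fan-out reads (``list_project_metadata``, global search) call
+  ``all_shards()``, which opens (if necessary) and iterates every
+  available shard.
+* ``snapshot_all()`` snapshots every shard opened so far; with
+  ``background=True`` the per-shard snapshots run concurrently.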
+Migration from v0.9.x
+---------------------
+If ``~/.codespine_db`` exists and shard 0's DB path does not, the store
+automatically migrates the legacy DB to shard 0's path the first time
+shard 0 is opened, so existing indexed data isn't lost.  Migration only
+applies when the default shards directory is in use.
+"""
+from __future__ import annotations
+import logging
+import os
+import shutil
+import threading
+from typing import Any
+from codespine.config import SETTINGS
+from codespine.db.store import GraphStore
+from codespine.sharding.router import ShardRouter
+LOGGER = logging.getLogger(__name__)
+class ShardedGraphStore:
+    """Coordinates multiple per-shard ``GraphStore`` instances.
+    Parameters
+    ----------
+    read_only:
+        Passed through to each ``GraphStore``.
+    num_shards:
+        Override for the shard count.  Defaults to ``SETTINGS.num_shards``.
+    shards_dir:
+        Override for the shards base directory.
+    """
+    def __init__(
+        self,
+        read_only: bool = False,
+        num_shards: int | None = None,
+        shards_dir: str | None = None,
+    ) -> None:
+        self.read_only = read_only
+        self.router = ShardRouter(
+            num_shards=num_shards or SETTINGS.num_shards,
+            shards_dir=shards_dir or SETTINGS.shards_dir,
+        )
+        self._pool: dict[int, GraphStore] = {}
+        self._lock = threading.Lock()
+        self._migrated = False
+    # ------------------------------------------------------------------
+    # Core routing
+    # ------------------------------------------------------------------
+    def shard(self, project_id: str) -> GraphStore:
+        """Return (or open) the GraphStore for this project's shard.
+        In write mode this always returns a valid store (creating the DB if
+        needed).  In read-only mode, if the shard DB has never been written to,
+        this also creates it (returning an empty-but-valid store) so that
+        callers can safely query it without crashing.
+        """
+        idx = self.router.shard_for(project_id)
+        store = self._get_shard(idx)
+        if store is None:
+            # Fallback: open read-only against an empty path so callers get an
+            # empty result set rather than a crash.  This happens when the
+            # shard DB doesn't exist yet.
+            with self._lock:
+                if idx not in self._pool:
+                    db_path = self.router.db_path(idx)
+                    snap_path = self.router.snapshot_path(idx)
+                    os.makedirs(os.path.dirname(db_path), exist_ok=True)
+                    self._pool[idx] = GraphStore(
+                        read_only=False,  # create empty DB
+                        db_path_override=db_path,
+                        snapshot_path_override=snap_path,
+                    )
+                store = self._pool[idx]
+        return store
+    def _get_shard(self, idx: int) -> GraphStore | None:
+        """Return the GraphStore for shard *idx*, or None if it doesn't exist
+        yet and we're in read-only mode (nothing to read there)."""
+        with self._lock:
+            if idx not in self._pool:
+                self._maybe_migrate(idx)
+                db_path = self.router.db_path(idx)
+                snap_path = self.router.snapshot_path(idx)
+                # In read-only mode, skip shards whose DB hasn't been created
+                # yet — Kuzu refuses to create an empty DB under read_only=True.
+                if self.read_only and not os.path.exists(db_path) and not os.path.exists(snap_path):
+                    return None
+                # Ensure parent directory exists before Kuzu opens it.
+                os.makedirs(os.path.dirname(db_path), exist_ok=True)
+                self._pool[idx] = GraphStore(
+                    read_only=self.read_only,
+                    db_path_override=db_path,
+                    snapshot_path_override=snap_path,
+                )
+            return self._pool[idx]
+    def _maybe_migrate(self, idx: int) -> None:
+        """One-time migration: copy legacy ~/.codespine_db → shard 0 DB path.
+        Guard: only triggers when the shards_dir matches the compiled-in
+        default (SETTINGS.shards_dir).  Custom / test shards_dir values are
+        never eligible for migration so that test isolation is preserved.
+        """
+        if self._migrated or idx != 0:
+            return
+        self._migrated = True
+        # Safety guard: never auto-migrate when using a non-default shards dir.
+        # This prevents test code that passes a temp dir from accidentally
+        # touching production data.
+        if os.path.realpath(self.router.shards_dir) != os.path.realpath(SETTINGS.shards_dir):
+            return
+        legacy = SETTINGS.db_path  # ~/.codespine_db
+        target = self.router.db_path(0)
+        if not os.path.exists(legacy):
+            return
+        if os.path.exists(target):
+            # Sharded layout already initialised — don't overwrite.
+            return
+        LOGGER.info(
+            "Migrating legacy DB %s → shard 0 path %s", legacy, target
+        )
+        try:
+            os.makedirs(os.path.dirname(target), exist_ok=True)
+            # Copy first, delete after — if the copy fails the original is safe.
+            if os.path.isdir(legacy):
+                shutil.copytree(legacy, target)
+            else:
+                shutil.copy2(legacy, target)
+            shutil.rmtree(legacy, ignore_errors=True)
+            # Also migrate read replica if present.
+            legacy_snap = SETTINGS.db_snapshot_path
+            target_snap = self.router.snapshot_path(0)
+            if os.path.exists(legacy_snap) and not os.path.exists(target_snap):
+                if os.path.isdir(legacy_snap):
+                    shutil.copytree(legacy_snap, target_snap)
+                else:
+                    shutil.copy2(legacy_snap, target_snap)
+                shutil.rmtree(legacy_snap, ignore_errors=True)
+        except OSError as exc:
+            LOGGER.warning("Migration from legacy path failed: %s — starting fresh", exc)
+    def all_shards(self) -> list[GraphStore]:
+        """Open and return all physical shards that exist (for fan-out reads).
+        Non-existent shards are skipped in read-only mode to avoid Kuzu
+        errors when opening empty paths under read_only=True.
+        """
+        stores = []
+        for i in self.router.all_shards():
+            s = self._get_shard(i)
+            if s is not None:
+                stores.append(s)
+        return stores
+    def open_shards(self) -> list[GraphStore]:
+        """Return only shards that have already been opened (no I/O)."""
+        with self._lock:
+            return [s for s in self._pool.values() if s is not None]
+    # ------------------------------------------------------------------
+    # Delegated project-scoped operations
+    # ------------------------------------------------------------------
+    def upsert_project(self, project_id: str, path: str) -> None:
+        self.shard(project_id).upsert_project(project_id, path)
+    def clear_project(self, project_id: str) -> None:
+        self.shard(project_id).clear_project(project_id)
+    def get_project_metadata(self, project_id: str) -> dict[str, Any] | None:
+        return self.shard(project_id).get_project_metadata(project_id)
+    def set_project_overlay_dirty(self, project_id: str, dirty: bool) -> None:
+        self.shard(project_id).set_project_overlay_dirty(project_id, dirty)
+    def set_project_indexed_commit(self, project_id: str, commit: str) -> None:
+        self.shard(project_id).set_project_indexed_commit(project_id, commit)
+    def project_file_hashes(self, project_id: str) -> dict[str, dict[str, str]]:
+        return self.shard(project_id).project_file_hashes(project_id)
+    def project_has_embeddings(self, project_id: str) -> bool:
+        return self.shard(project_id).project_has_embeddings(project_id)
+    def upsert_file_from_entry(self, entry: dict, project_path: str) -> None:
+        project_id = entry.get("project_id", "")
+        self.shard(project_id).upsert_file_from_entry(entry, project_path)
+    def clear_file_by_path(self, project_id: str, project_path: str, file_path: str) -> None:
+        self.shard(project_id).clear_file_by_path(project_id, project_path, file_path)
+    # ------------------------------------------------------------------
+    # Fan-out global reads
+    # ------------------------------------------------------------------
+    def list_project_metadata(self) -> list[dict[str, Any]]:
+        """Aggregate project list across all shards."""
+        results: list[dict[str, Any]] = []
+        seen: set[str] = set()
+        for store in self.all_shards():
+            for rec in store.list_project_metadata():
+                pid = rec.get("id", "")
+                if pid and pid not in seen:
+                    seen.add(pid)
+                    results.append(rec)
+        results.sort(key=lambda r: r.get("id", ""))
+        return results
+    def query_records(
+        self,
+        query: str,
+        params: dict[str, Any] | None = None,
+        *,
+        project_id: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Execute a Cypher query.
+        If ``project_id`` is given, the query runs only on that project's
+        shard (fast path).  Otherwise the query fans out to all shards and
+        results are concatenated.
+        Note: fan-out only makes sense for queries whose results are
+        independent per shard (e.g. listing nodes).  Queries that aggregate
+        across shards (e.g. global COUNT) will return per-shard subtotals.
+        """
+        if project_id:
+            return self.shard(project_id).query_records(query, params)
+        merged: list[dict[str, Any]] = []
+        for store in self.all_shards():
+            merged.extend(store.query_records(query, params))
+        return merged
+    # ------------------------------------------------------------------
+    # Snapshot all shards
+    # ------------------------------------------------------------------
+    def snapshot_all(self, background: bool = False) -> None:
+        """Snapshot every open shard.
+        In background mode all snapshots run concurrently in daemon threads
+        (one per shard) rather than sequentially.
+        """
+        for store in self.open_shards():
+            store.snapshot_to_read_replica(background=background)
+    # ------------------------------------------------------------------
+    # Global reset / status
+    # ------------------------------------------------------------------
+    def force_delete_all_data(self) -> list[str]:
+        """Delete all shards' data files.  Equivalent to force_delete per shard."""
+        removed: list[str] = []
+        for store in self.all_shards():
+            removed.extend(store.force_delete_all_data())
+        return removed
+    def describe(self) -> dict:
+        """Return a human-readable description of the shard topology."""
+        shard_info = []
+        for idx in self.router.all_shards():
+            db_path = self.router.db_path(idx)
+            shard_info.append({
+                "index": idx,
+                "db_path": db_path,
+                "exists": os.path.exists(db_path),
+                "open": idx in self._pool,
+            })
+        return {
+            **self.router.describe(),
+            "shards": shard_info,
+        }
+    # ------------------------------------------------------------------
+    # Backward-compat helpers — delegate to shard 0 for single-project use
+    # ------------------------------------------------------------------
+    @property
+    def overlay_store(self):
+        """Expose the overlay store from shard 0 for backward compat."""
+        return self._get_shard(0).overlay_store
+    @staticmethod
+    def stable_id(*parts: str) -> str:
+        """Stable SHA1-based identifier (shard-independent)."""
+        return GraphStore.stable_id(*parts)

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codespine
-Version: 0.9.5
+Version: 0.9.6
 Summary: Local Java code intelligence indexer backed by a graph database
 Author: CodeSpine contributors
 License: MIT License

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/SOURCES.txt RENAMED Viewed

@@ -45,6 +45,9 @@ codespine/search/fuzzy.py
 codespine/search/hybrid.py
 codespine/search/rrf.py
 codespine/search/vector.py
+codespine/sharding/__init__.py
+codespine/sharding/router.py
+codespine/sharding/store.py
 codespine/watch/__init__.py
 codespine/watch/git_hook.py
 codespine/watch/watcher.py

{codespine-0.9.5 → codespine-0.9.6}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "codespine"
-version = "0.9.5"
+version = "0.9.6"
 description = "Local Java code intelligence indexer backed by a graph database"
 readme = "README.md"
 requires-python = ">=3.10"

{codespine-0.9.5 → codespine-0.9.6}/LICENSE RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/README.md RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/community.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/context.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/coupling.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/crossmodule.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/deadcode.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/flow.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/analysis/impact.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/db/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/db/schema.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/diff/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/diff/branch_diff.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/guide.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/call_resolver.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/di_resolver.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/engine.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/java_parser.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/indexer/symbol_builder.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/mcp/server.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/noise/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/noise/blocklist.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/git_state.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/merge.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/overlay/store.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/bm25.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/fuzzy.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/hybrid.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/rrf.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/search/vector.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/watch/__init__.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/watch/git_hook.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine/watch/watcher.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/entry_points.txt RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/requires.txt RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/codespine.egg-info/top_level.txt RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/gindex.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/setup.cfg RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_branch_diff_normalize.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_call_resolver.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_community_detection.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_deadcode.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_index_and_hybrid.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_java_parser.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_multimodule_index.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_overlay.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_search_ranking.py RENAMED Viewed

File without changes

{codespine-0.9.5 → codespine-0.9.6}/tests/test_store_recovery.py RENAMED Viewed

File without changes

codespine 0.9.5__tar.gz → 0.9.6__tar.gz

codespine 0.9.5tar.gz → 0.9.6tar.gz