PyPI - codespine - Versions diffs - 0.9.4__tar.gz → 0.9.6__tar.gz - Mend

codespine 0.9.4__tar.gz → 0.9.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{codespine-0.9.4 → codespine-0.9.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: codespine
-Version: 0.9.4
+Version: 0.9.6
 Summary: Local Java code intelligence indexer backed by a graph database
 Author: CodeSpine contributors
 License: MIT License

{codespine-0.9.4 → codespine-0.9.6}/codespine/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 """CodeSpine package."""
 __all__ = ["__version__"]
-__version__ = "0.9.4"
+__version__ = "0.9.6"

{codespine-0.9.4 → codespine-0.9.6}/codespine/cli.py RENAMED Viewed

@@ -20,6 +20,7 @@ from codespine.analysis.flow import trace_execution_flows
 from codespine.analysis.impact import analyze_impact
 from codespine.config import SETTINGS
 from codespine.db.store import GraphStore
+from codespine.sharding import ShardedGraphStore, ShardRouter
 from codespine.diff.branch_diff import compare_branches
 from codespine.indexer.engine import JavaIndexer
 from codespine.mcp.server import build_mcp_server
@@ -90,6 +91,54 @@ def _spinner_char() -> str:
     return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
+def _show_shard_topology(as_json: bool) -> None:
+    """Display the current shard routing topology and imbalance metrics."""
+    router = ShardRouter()
+    sg = ShardedGraphStore(read_only=True)
+    topology = sg.describe()
+    # Gather project → shard mapping from all shards.
+    shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
+    for p in sg.list_project_metadata():
+        pid = p.get("id", "")
+        idx = router.shard_for(pid)
+        shard_project_counts[idx].append(pid)
+    counts = [len(v) for v in shard_project_counts.values()]
+    total = sum(counts)
+    median = sorted(counts)[len(counts) // 2] if counts else 0
+    max_count = max(counts) if counts else 0
+    imbalance = (max_count / median) if median else 1.0
+    if as_json:
+        _echo_json({
+            "topology": topology,
+            "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
+            "imbalance_ratio": round(imbalance, 2),
+        }, as_json=True)
+        return
+    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
+    click.echo(f"  Directory : {router.shards_dir}")
+    click.echo(f"  Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {150})")
+    click.echo(f"  Projects  : {total} total, imbalance ratio {imbalance:.2f}x")
+    click.echo()
+    header = f"{'Shard':>6}  {'Projects':>9}  {'DB exists':>10}  Path"
+    click.secho(header, fg="cyan")
+    click.echo("-" * 60)
+    for i, info in enumerate(topology.get("shards", [])):
+        plist = shard_project_counts.get(i, [])
+        exists_str = "yes" if info.get("exists") else "no"
+        click.echo(f"{i:>6}  {len(plist):>9}  {exists_str:>10}  {info.get('db_path', '')}")
+        for pid in plist:
+            click.echo(f"{'':>6}  {'':>9}  {'':>10}    {pid}")
+    if imbalance > 2.0:
+        click.secho(
+            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
+            fg="yellow",
+        )
 @click.group()
 def main() -> None:
     """CodeSpine CLI."""
@@ -130,8 +179,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
                 fg="yellow",
             )
-    store = GraphStore(read_only=False)
-    indexer = JavaIndexer(store)
+    # ShardedGraphStore routes each project to its dedicated DB shard.
+    # For single-project analysis this is transparent — shard() always
+    # returns a GraphStore pointing to the correct shard path.
+    sg = ShardedGraphStore(read_only=False)
+    # The indexer is initialised per-module below with the right shard store.
+    # We keep a single ShardedGraphStore to fan-out cross-module linking later.
     # --- Workspace → project → module detection ---
     # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -241,9 +294,16 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     last_result = None
     for idx, (module_path, project_id) in enumerate(modules_with_ids):
         if is_multi:
+            shard_idx = sg.router.shard_for(project_id)
             click.echo()
-            click.secho(f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}", fg="cyan")
+            click.secho(
+                f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id}  (shard {shard_idx})",
+                fg="cyan",
+            )
         _reset_state()
+        # Use the shard store for this project so data lands in the right DB.
+        shard_store = sg.shard(project_id)
+        indexer = JavaIndexer(shard_store)
         last_result = indexer.index_project(
             module_path, full=full, progress=_progress, project_id=project_id, embed=embed
         )
@@ -264,13 +324,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         """Finalise an in-place phase line and move to the next line."""
         click.echo(f"\r✓ {label:<28} {result:<48}")
+    # For cross-module operations (cross-module linking, deep analysis, stats)
+    # we use the shard store for the root project (all modules share one shard).
+    root_project_id = last_result.project_id if last_result else root_basename
+    root_shard_store = sg.shard(root_project_id)
     # ── Cross-module call linking ──────────────────────────────────────
     if is_multi and len(modules_with_ids) > 1:
         xmod_label = "Cross-module linking..."
         _live_phase(xmod_label, "running")
         xmod_pids = [pid for _, pid in modules_with_ids]
         xmod_edges = link_cross_module_calls(
-            store, project_ids=xmod_pids,
+            root_shard_store, project_ids=xmod_pids,
             progress=lambda s: _live_phase(xmod_label, s),
         )
         _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
@@ -287,7 +352,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         comm_label = "Detecting communities..."
         _live_phase(comm_label, "running")
         communities = detect_communities(
-            store,
+            root_shard_store,
             progress=lambda s: _live_phase(comm_label, s),
         )
         _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +360,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         flow_label = "Detecting execution flows..."
         _live_phase(flow_label, "running")
         flows = trace_execution_flows(
-            store,
+            root_shard_store,
             progress=lambda s: _live_phase(flow_label, s),
         )
         _finish_phase(flow_label, f"{len(flows)} processes found")
         dead_label = "Finding dead code..."
         _live_phase(dead_label, "running")
-        dead = detect_dead_code(store, limit=500)
+        dead = detect_dead_code(root_shard_store, limit=500)
         _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
         coup_label = "Analyzing git history..."
         _live_phase(coup_label, "running")
-        store.clear_coupling()
+        root_shard_store.clear_coupling()
         coupling_root = abs_path
         coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
         coupling_pairs = compute_coupling(
-            store,
+            root_shard_store,
             coupling_root,
             coupling_project,
             days=SETTINGS.default_coupling_days,
@@ -329,7 +394,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         flow_label = "Detecting execution flows..."
         _live_phase(flow_label, "running (lightweight)")
         try:
-            flows = trace_execution_flows(store, max_depth=3)
+            flows = trace_execution_flows(root_shard_store, max_depth=3)
         except Exception:
             flows = []
         _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")
@@ -337,14 +402,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         dead_label = "Finding dead code..."
         _live_phase(dead_label, "running (lightweight)")
         try:
-            dead = detect_dead_code(store, limit=100)
+            dead = detect_dead_code(root_shard_store, limit=100)
         except Exception:
             dead = []
         _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
         _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
-    vector_count = store.query_records(
+    vector_count = root_shard_store.query_records(
         """
         MATCH (s:Symbol)
         WHERE s.embedding IS NOT NULL
@@ -355,8 +420,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
     _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
-    symbol_count = store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
-    edge_count = store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
+    symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
+    edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
     symbols = int(symbol_count[0]["count"]) if symbol_count else 0
     edges = int(edge_count[0]["count"]) if edge_count else 0
     elapsed = time.perf_counter() - started
@@ -376,7 +441,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     # Detect unresolved imports → hint about unindexed sibling projects
     try:
-        unresolved = JavaIndexer.detect_unresolved_imports(store)
+        unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
         if unresolved:
             click.echo()
             click.secho("⚠  Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +452,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     # Publish a read replica so MCP and read-only CLI commands (search, stats…)
     # run against an isolated snapshot rather than competing with the write
-    # process's buffer pool.  The MCP daemon detects the sentinel file and
-    # hot-reloads without restarting.
+    # process's buffer pool.  Snapshot all open shards concurrently.
     snap_label = "Publishing read replica..."
     _live_phase(snap_label, "copying")
-    store._recycle_conn()
-    snapped = GraphStore.snapshot_to_read_replica()
-    _finish_phase(snap_label, "MCP will reload automatically" if snapped else "skipped (source DB not found)")
+    root_shard_store._recycle_conn()
+    sg.snapshot_all(background=False)
+    _finish_phase(snap_label, "MCP will reload automatically")
 @main.command()
@@ -523,10 +587,21 @@ def diff(range_spec: str, as_json: bool) -> None:
 @main.command()
 @click.option("--json", "as_json", is_flag=True)
-def stats(as_json: bool) -> None:
+@click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
+def stats(as_json: bool, show_shards: bool) -> None:
     """Show per-project and aggregate graph statistics."""
-    store = GraphStore(read_only=True)
-    projects = store.query_records("MATCH (p:Project) RETURN p.id as id, p.path as path ORDER BY p.id")
+    if show_shards:
+        _show_shard_topology(as_json)
+        return
+    # Fan-out across all shards so stats covers every project in the cluster.
+    sg = ShardedGraphStore(read_only=True)
+    all_projects_meta = sg.list_project_metadata()
+    # For detailed stats we need the per-project shard store.
+    def _project_store(pid: str):
+        return sg.shard(pid)
     if not projects:
         click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
         return
@@ -534,10 +609,12 @@ def stats(as_json: bool) -> None:
     rows = []
     for p in projects:
         pid = p["id"]
-        files = store.query_records(
+        # Route each query to the project's owning shard.
+        ps = _project_store(pid)
+        files = ps.query_records(
             "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
         )
-        classes = store.query_records(
+        classes = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -546,7 +623,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        methods = store.query_records(
+        methods = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -557,7 +634,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        calls = store.query_records(
+        calls = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -568,7 +645,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        emb = store.query_records(
+        emb = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f
@@ -580,6 +657,7 @@ def stats(as_json: bool) -> None:
         rows.append({
             "project": pid,
             "path": p["path"],
+            "shard": sg.router.shard_for(pid),
             "files": files[0]["n"] if files else 0,
             "classes": classes[0]["n"] if classes else 0,
             "methods": methods[0]["n"] if methods else 0,
@@ -592,13 +670,13 @@ def stats(as_json: bool) -> None:
         return
     col_w = max(len(r["project"]) for r in rows)
-    header = f"{'Project':<{col_w}}  {'Files':>6}  {'Classes':>8}  {'Methods':>8}  {'Calls':>7}  {'Emb':>6}  Path"
+    header = f"{'Project':<{col_w}}  {'Shard':>5}  {'Files':>6}  {'Classes':>8}  {'Methods':>8}  {'Calls':>7}  {'Emb':>6}  Path"
     click.secho(header, fg="cyan")
     click.echo("-" * len(header))
     total_files = total_classes = total_methods = total_calls = total_emb = 0
     for r in rows:
         click.echo(
-            f"{r['project']:<{col_w}}  {r['files']:>6}  {r['classes']:>8}  {r['methods']:>8}  {r['calls_out']:>7}  {r['embeddings']:>6}  {r['path']}"
+            f"{r['project']:<{col_w}}  {r.get('shard', 0):>5}  {r['files']:>6}  {r['classes']:>8}  {r['methods']:>8}  {r['calls_out']:>7}  {r['embeddings']:>6}  {r['path']}"
         )
         total_files += r["files"]
         total_classes += r["classes"]
@@ -608,7 +686,7 @@ def stats(as_json: bool) -> None:
     if len(rows) > 1:
         click.echo("-" * len(header))
         click.secho(
-            f"{'TOTAL':<{col_w}}  {total_files:>6}  {total_classes:>8}  {total_methods:>8}  {total_calls:>7}  {total_emb:>6}",
+            f"{'TOTAL':<{col_w}}  {'':>5}  {total_files:>6}  {total_classes:>8}  {total_methods:>8}  {total_calls:>7}  {total_emb:>6}",
             fg="green",
         )

{codespine-0.9.4 → codespine-0.9.6}/codespine/config.py RENAMED Viewed

@@ -4,8 +4,17 @@ from dataclasses import dataclass
 @dataclass(frozen=True)
 class Settings:
+    # Legacy single-DB paths — kept for backward compat and as defaults when
+    # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
     db_path: str = os.path.expanduser("~/.codespine_db")
     db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
+    # Sharding — new layout stores each shard under shards_dir/{N}/db
+    # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
+    # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
+    num_shards: int = 4
+    shards_dir: str = os.path.expanduser("~/.codespine/shards")
     pid_file: str = os.path.expanduser("~/.codespine.pid")
     log_file: str = os.path.expanduser("~/.codespine.log")
     embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")

{codespine-0.9.4 → codespine-0.9.6}/codespine/db/store.py RENAMED Viewed

@@ -8,7 +8,7 @@ import shutil
 import threading
 import time
 from contextlib import contextmanager
-from dataclasses import dataclass
+from dataclasses import InitVar, dataclass
 from typing import Any
 import kuzu
@@ -39,8 +39,26 @@ _RECOVERABLE_DB_ERROR_MARKERS = (
 @dataclass
 class GraphStore:
     read_only: bool = False
+    # Optional path overrides — when provided, the store uses these paths
+    # instead of the global SETTINGS values.  The ShardedGraphStore uses
+    # this to give each shard its own isolated KùzuDB directory.
+    db_path_override: InitVar[str | None] = None
+    snapshot_path_override: InitVar[str | None] = None
+    def __post_init__(
+        self,
+        db_path_override: str | None,
+        snapshot_path_override: str | None,
+    ) -> None:
+        # Resolve effective paths — per-shard overrides win over global SETTINGS.
+        self._db_path: str = db_path_override or SETTINGS.db_path
+        self._snapshot_path: str = snapshot_path_override or SETTINGS.db_snapshot_path
+        # Per-instance snapshot synchronisation (not class-level) so that
+        # multiple shards can snapshot concurrently without a shared bottleneck.
+        self._inst_snapshot_lock: threading.Lock = threading.Lock()
+        self._inst_snapshot_pending: threading.Event = threading.Event()
-    def __post_init__(self) -> None:
         self._tls: threading.local = threading.local()
         from codespine.overlay.store import OverlayStore
@@ -48,10 +66,10 @@ class GraphStore:
         # Read-only callers (MCP, CLI reads) use the read replica when available.
         # This isolates them from the write process's buffer pool and WAL churn.
-        if self.read_only and os.path.exists(SETTINGS.db_snapshot_path):
-            db_path = SETTINGS.db_snapshot_path
+        if self.read_only and os.path.exists(self._snapshot_path):
+            db_path = self._snapshot_path
         else:
-            db_path = SETTINGS.db_path
+            db_path = self._db_path
         try:
             self.db = self._open_with_recovery(db_path)
@@ -97,7 +115,7 @@ class GraphStore:
         try:
             ensure_schema(self._conn())
         except Exception as exc:
-            path = getattr(self.db, "database_path", SETTINGS.db_path)
+            path = getattr(self.db, "database_path", self._db_path)
             if not self._is_recoverable_db_error(exc):
                 raise
             LOGGER.warning("Rebuilding corrupted or incompatible Kuzu DB at %s during schema init: %s", path, exc)
@@ -527,15 +545,27 @@ class GraphStore:
         rows = [{"source_id": r["source_id"], "target_id": r["target_id"],
                   "confidence": float(r["confidence"]), "reason": r["reason"]}
                  for r in records]
-        op = "CREATE" if create_mode else "MERGE"
-        self.execute(
-            f"""
-            UNWIND $rows AS row
-            MATCH (src:Method {{id: row.source_id}}), (dst:Method {{id: row.target_id}})
-            {op} (src)-[:CALLS {{confidence: row.confidence, reason: row.reason}}]->(dst)
-            """,
-            {"rows": rows},
-        )
+        if create_mode:
+            self.execute(
+                """
+                UNWIND $rows AS row
+                MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
+                CREATE (src)-[:CALLS {confidence: row.confidence, reason: row.reason}]->(dst)
+                """,
+                {"rows": rows},
+            )
+        else:
+            # Properties are SET, not part of the MERGE pattern — ensures at most
+            # one CALLS edge per (src, dst) pair regardless of confidence value.
+            self.execute(
+                """
+                UNWIND $rows AS row
+                MATCH (src:Method {id: row.source_id}), (dst:Method {id: row.target_id})
+                MERGE (src)-[r:CALLS]->(dst)
+                SET r.confidence = row.confidence, r.reason = row.reason
+                """,
+                {"rows": rows},
+            )
     def add_reference(self, rel: str, src_label: str, src_id: str, dst_label: str, dst_id: str, confidence: float) -> None:
         if rel not in {"REFERENCES_TYPE", "IMPLEMENTS", "OVERRIDES"}:
@@ -756,8 +786,7 @@ class GraphStore:
         self.clear_flows()
         self.clear_coupling()
-    @staticmethod
-    def force_delete_all_data() -> list[str]:
+    def force_delete_all_data(self) -> list[str]:
         """Delete all CodeSpine data files without touching the Kuzu engine.
         This is the nuclear option for OOM recovery: when the buffer pool is
@@ -767,12 +796,14 @@ class GraphStore:
         Returns the list of paths that were removed.
         """
+        db_path = self._db_path
+        snapshot_path = self._snapshot_path
         removed: list[str] = []
         for path in [
-            SETTINGS.db_path,
-            SETTINGS.db_snapshot_path,
-            SETTINGS.db_snapshot_path + ".updated",
-            SETTINGS.db_snapshot_path + ".tmp",
+            db_path,
+            snapshot_path,
+            snapshot_path + ".updated",
+            snapshot_path + ".tmp",
             SETTINGS.embedding_cache_path,
             SETTINGS.overlay_dir,
             SETTINGS.index_meta_dir,
@@ -789,7 +820,7 @@ class GraphStore:
                 pass
         # Also remove any stale WAL files next to the DB
         for suffix in (".wal", ".lock"):
-            wal_path = SETTINGS.db_path + suffix
+            wal_path = db_path + suffix
             if os.path.exists(wal_path):
                 try:
                     os.remove(wal_path)
@@ -800,7 +831,7 @@ class GraphStore:
     def rebuild_empty_db(self) -> None:
         self._recycle_conn()
-        path = SETTINGS.db_path
+        path = self._db_path
         # Remove the DB directory AND any stale WAL / lock files
         self._remove_db_path(path)
         for suffix in (".wal", ".lock"):
@@ -813,11 +844,8 @@ class GraphStore:
         # Also remove the read replica so that read-only callers (stats, MCP)
         # don't continue to see stale data from before the wipe.
-        for stale in [
-            SETTINGS.db_snapshot_path,
-            SETTINGS.db_snapshot_path + ".tmp",
-            SETTINGS.db_snapshot_path + ".updated",
-        ]:
+        snap = self._snapshot_path
+        for stale in [snap, snap + ".tmp", snap + ".updated"]:
             self._remove_db_path(stale)
         # Kuzu may retain stale internal state from a previous failed open of
@@ -914,18 +942,15 @@ class GraphStore:
             },
         )
-    # Lock and flag for background snapshot coalescing.
-    # Only one snapshot runs at a time; a pending request supersedes queued ones.
-    _snapshot_lock: threading.Lock = threading.Lock()
-    _snapshot_pending: threading.Event = threading.Event()
-    @staticmethod
-    def snapshot_to_read_replica(background: bool = False) -> bool:
+    def snapshot_to_read_replica(self, background: bool = False) -> bool:
         """Atomically copy the write DB to the read-replica path.
         The read replica is used by the MCP daemon and all read-only CLI
         commands so they never contend with the write process's buffer pool.
+        Each GraphStore instance has its own snapshot lock so that multiple
+        shards can snapshot concurrently without serialising on a class lock.
         Parameters
         ----------
         background:
@@ -938,36 +963,38 @@ class GraphStore:
         Returns True on success (or when dispatched to background), False if
         the source DB does not exist.
         """
-        src = SETTINGS.db_path
+        src = self._db_path
         if not os.path.exists(src):
             return False
         if background:
             # Signal that a snapshot is wanted, then ensure a worker is running.
-            GraphStore._snapshot_pending.set()
+            self._inst_snapshot_pending.set()
+            inst = self  # capture for closure
             def _worker() -> None:
-                while GraphStore._snapshot_pending.is_set():
-                    GraphStore._snapshot_pending.clear()
-                    with GraphStore._snapshot_lock:
-                        GraphStore._do_snapshot()
+                while inst._inst_snapshot_pending.is_set():
+                    inst._inst_snapshot_pending.clear()
+                    with inst._inst_snapshot_lock:
+                        inst._do_snapshot()
-            if not GraphStore._snapshot_lock.locked():
+            if not self._inst_snapshot_lock.locked():
                 t = threading.Thread(target=_worker, daemon=True, name="codespine-snapshot")
                 t.start()
             return True
         # Foreground (blocking) path — used by CLI analyse and tests.
-        with GraphStore._snapshot_lock:
-            return GraphStore._do_snapshot()
+        with self._inst_snapshot_lock:
+            return self._do_snapshot()
-    @staticmethod
-    def _do_snapshot() -> bool:
-        """Perform the actual copy.  Must be called with _snapshot_lock held."""
-        src = SETTINGS.db_path
-        dst = SETTINGS.db_snapshot_path
+    def _do_snapshot(self) -> bool:
+        """Perform the actual copy.  Must be called with the instance snapshot lock held."""
+        src = self._db_path
+        dst = self._snapshot_path
         if not os.path.exists(src):
             return False
+        # Ensure the parent directory for the replica exists (shards layout).
+        os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
         tmp = dst + ".tmp"
         try:
             if os.path.exists(tmp):
@@ -975,7 +1002,6 @@ class GraphStore:
             if os.path.isdir(src):
                 shutil.copytree(src, tmp)
             else:
-                os.makedirs(os.path.dirname(dst) or ".", exist_ok=True)
                 shutil.copy2(src, tmp)
             if os.path.exists(dst):
                 shutil.rmtree(dst, ignore_errors=True)

{codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/call_resolver.py RENAMED Viewed

@@ -5,7 +5,7 @@ from typing import Iterator
 from codespine.noise.blocklist import MIN_FUZZY_NAME_LEN, NOISE_METHOD_NAMES
-MAX_FUZZY_TARGETS = 12
+MAX_FUZZY_TARGETS = 6  # reduced from 12 — keeps precision, halves low-confidence edge fan-out
 def _simple_type_name(type_name: str | None) -> str:

{codespine-0.9.4 → codespine-0.9.6}/codespine/indexer/engine.py RENAMED Viewed

@@ -221,7 +221,7 @@ class JavaIndexer:
         calls_resolved = 0
         type_relationships = 0
         file_batch_size = max(1, int(getattr(SETTINGS, "index_file_batch_size", 64)))
-        edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size", 2000)))
+        edge_batch_size = max(1, int(getattr(SETTINGS, "edge_write_batch_size", 5000)))
         if not full:
             method_catalog, class_catalog, fqcn_to_class_ids, class_methods = (
@@ -480,22 +480,36 @@ class JavaIndexer:
                 self.store._recycle_conn()
         self._emit(progress, "resolve_calls_start")
-        call_rows: list[dict] = []
-        for src, dst, confidence, reason in resolve_calls(method_catalog, method_calls, method_context, class_catalog):
-            call_rows.append(
-                {
-                    "source_id": src,
-                    "target_id": dst,
-                    "confidence": confidence,
-                    "reason": reason,
-                }
+        # Deduplicate (src, dst) pairs — the same pair can appear many times when
+        # a method calls another method multiple times at different call sites.
+        # Keep the highest-confidence resolution to avoid N writes per pair.
+        best_calls: dict[tuple[str, str], tuple[float, str]] = {}
+        for src, dst, confidence, reason in resolve_calls(
+            method_catalog, method_calls, method_context, class_catalog
+        ):
+            key = (src, dst)
+            if key not in best_calls or confidence > best_calls[key][0]:
+                best_calls[key] = (confidence, reason)
+        # Stream writes in batches — never hold the full set in RAM.
+        call_buf: list[dict] = []
+        for (src, dst), (confidence, reason) in best_calls.items():
+            call_buf.append(
+                {"source_id": src, "target_id": dst,
+                 "confidence": confidence, "reason": reason}
             )
-        for call_chunk in self._chunked(call_rows, edge_batch_size):
+            if len(call_buf) >= edge_batch_size:
+                with self.store.transaction():
+                    self.store.add_calls_batch(call_buf)
+                calls_resolved += len(call_buf)
+                self.store._recycle_conn()
+                self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
+                call_buf = []
+        if call_buf:
             with self.store.transaction():
-                self.store.add_calls_batch(call_chunk)
-            calls_resolved += len(call_chunk)
+                self.store.add_calls_batch(call_buf)
+            calls_resolved += len(call_buf)
             self.store._recycle_conn()
-            self._emit(progress, "resolve_calls_progress", calls_resolved=calls_resolved)
         self._emit(progress, "resolve_calls_done", calls_resolved=calls_resolved)
         self._emit(progress, "resolve_types_start")

codespine-0.9.6/codespine/sharding/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""CodeSpine sharding package.
+Exposes the consistent-hash router and the ShardedGraphStore facade.
+"""
+from codespine.sharding.router import ShardRouter
+from codespine.sharding.store import ShardedGraphStore
+__all__ = ["ShardRouter", "ShardedGraphStore"]

codespine 0.9.4__tar.gz → 0.9.6__tar.gz

codespine 0.9.4__tar.gz → 0.9.6__tar.gz