codespine 0.9.5.tar.gz → 0.9.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.9.5 → codespine-0.9.7}/PKG-INFO +1 -1
- {codespine-0.9.5 → codespine-0.9.7}/codespine/__init__.py +1 -1
- {codespine-0.9.5 → codespine-0.9.7}/codespine/cli.py +306 -116
- {codespine-0.9.5 → codespine-0.9.7}/codespine/config.py +9 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/db/store.py +126 -70
- codespine-0.9.7/codespine/sharding/__init__.py +9 -0
- codespine-0.9.7/codespine/sharding/router.py +123 -0
- codespine-0.9.7/codespine/sharding/store.py +312 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/SOURCES.txt +4 -0
- {codespine-0.9.5 → codespine-0.9.7}/pyproject.toml +1 -1
- codespine-0.9.7/tests/test_sharding.py +200 -0
- {codespine-0.9.5 → codespine-0.9.7}/LICENSE +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/README.md +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/community.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/context.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/coupling.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/flow.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/analysis/impact.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/db/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/db/schema.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/diff/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/guide.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/di_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/engine.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/mcp/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/mcp/server.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/noise/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/noise/blocklist.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/git_state.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/merge.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/overlay/store.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/bm25.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/fuzzy.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/hybrid.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/rrf.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/search/vector.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/__init__.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/git_hook.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine/watch/watcher.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/gindex.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/setup.cfg +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_call_resolver.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_community_detection.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_deadcode.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_java_parser.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_multimodule_index.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_overlay.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_search_ranking.py +0 -0
- {codespine-0.9.5 → codespine-0.9.7}/tests/test_store_recovery.py +0 -0

{codespine-0.9.5 → codespine-0.9.7}/codespine/cli.py

@@ -6,7 +6,10 @@ import os
 import signal
 import subprocess
 import sys
+import threading
 import time
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 import click
 import psutil

@@ -20,6 +23,7 @@ from codespine.analysis.flow import trace_execution_flows
 from codespine.analysis.impact import analyze_impact
 from codespine.config import SETTINGS
 from codespine.db.store import GraphStore
+from codespine.sharding import ShardedGraphStore, ShardRouter
 from codespine.diff.branch_diff import compare_branches
 from codespine.indexer.engine import JavaIndexer
 from codespine.mcp.server import build_mcp_server
@@ -90,6 +94,197 @@ def _spinner_char() -> str:
     return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
 
 
+def _index_shard_group(
+    shard_idx: int,
+    modules: list[tuple[str, str]],
+    sg,
+    full: bool,
+    embed: bool,
+    output_lock: threading.Lock,
+    parallel: bool,
+) -> tuple[int, list, int]:
+    """Index one group of modules that share a shard.
+
+    Modules within the group are always indexed sequentially (same KùzuDB).
+    Multiple groups can run concurrently in different threads when they own
+    different shards.
+
+    Returns (total_files_found, all_results, shard_idx).
+    """
+    results = []
+    total_files = 0
+
+    def _locked_echo(*args, **kwargs) -> None:
+        """Thread-safe click.echo."""
+        with output_lock:
+            click.echo(*args, **kwargs)
+
+    def _locked_secho(*args, **kwargs) -> None:
+        with output_lock:
+            click.secho(*args, **kwargs)
+
+    prefix = f"[S{shard_idx}] " if parallel else ""
+
+    for mod_path, project_id in modules:
+        # Per-module progress state (local — no shared mutation).
+        parse_state: dict = {"shown": False, "indexed": 0, "total": 0,
+                             "last_ts": 0.0, "printed_zero": False}
+        call_state: dict = {"shown": False, "count": 0, "last_ts": 0.0,
+                            "started_at": 0.0}
+
+        def _progress(event: str, payload: dict) -> None:
+            now = time.perf_counter()
+            if event == "scan_done":
+                with output_lock:
+                    _phase(f"{prefix}Walking files...", f"{int(payload.get('files_found', 0))} files found")
+                return
+            if event == "plan_done":
+                to_index = int(payload.get("files_to_index", 0))
+                deleted = int(payload.get("deleted_files", 0))
+                mode = str(payload.get("mode", "incremental"))
+                parse_state["total"] = to_index
+                with output_lock:
+                    _phase(f"{prefix}Index mode...", f"{mode} ({to_index} files, {deleted} deleted)")
+                if to_index == 0:
+                    with output_lock:
+                        _phase(f"{prefix}Parsing code...", "0/0")
+                    parse_state["printed_zero"] = True
+                return
+            if event == "parse_progress":
+                indexed = int(payload.get("indexed", 0))
+                total = int(payload.get("total", 0))
+                parse_state["indexed"] = indexed
+                parse_state["total"] = total
+                if total == 0:
+                    return
+                if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
+                    if not parallel:
+                        # In-place progress bar only makes sense in serial mode.
+                        click.echo(
+                            f"\r{prefix}Parsing code... {_bar(indexed, total)} {indexed}/{total} ",
+                            nl=False,
+                        )
+                    else:
+                        with output_lock:
+                            click.echo(
+                                f"\r{prefix}Parsing {indexed}/{total} ",
+                                nl=False,
+                            )
+                    parse_state["shown"] = True
+                    parse_state["last_ts"] = now
+                return
+            if event in ("resolve_calls_start",):
+                if parse_state["shown"]:
+                    with output_lock:
+                        click.echo()
+                    parse_state["shown"] = False
+                call_state["started_at"] = now
+                with output_lock:
+                    _phase(f"{prefix}Tracing calls...", "starting...")
+                return
+            if event == "resolve_calls_progress":
+                call_state["count"] = int(payload.get("calls_resolved", 0))
+                if (now - call_state["last_ts"]) >= 0.25:
+                    elapsed_s = now - call_state["started_at"]
+                    if not parallel:
+                        click.echo(
+                            f"\r{_spinner_char()} {prefix}Tracing calls... "
+                            f"{call_state['count']:>6} resolved {elapsed_s:.1f}s ",
+                            nl=False,
+                        )
+                    else:
+                        with output_lock:
+                            click.echo(
+                                f"\r{prefix}Calls: {call_state['count']} ({elapsed_s:.0f}s) ",
+                                nl=False,
+                            )
+                    call_state["shown"] = True
+                    call_state["last_ts"] = now
+                return
+            if event == "resolve_calls_done":
+                if call_state["shown"]:
+                    with output_lock:
+                        click.echo()
+                    call_state["shown"] = False
+                elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
+                n = int(payload.get("calls_resolved", 0))
+                with output_lock:
+                    _phase(f"{prefix}Tracing calls...", f"{n} calls resolved ({elapsed_s:.1f}s)")
+                return
+            if event == "resolve_types_start":
+                with output_lock:
+                    _phase(f"{prefix}Analyzing types...", "running")
+                return
+            if event == "resolve_types_done":
+                n = int(payload.get("type_relationships", 0))
+                with output_lock:
+                    _phase(f"{prefix}Analyzing types...", f"{n} type relationships")
+                return
+
+        shard_store = sg.shard(project_id)
+        indexer = JavaIndexer(shard_store)
+        result = indexer.index_project(
+            mod_path, full=full, progress=_progress, project_id=project_id, embed=embed
+        )
+        results.append(result)
+        total_files += result.files_found
+
+    # Flush any dangling progress line.
+    if parse_state["shown"]:
+        with output_lock:
+            click.echo()
+
+    return shard_idx, results, total_files
+
+
+def _show_shard_topology(as_json: bool) -> None:
+    """Display the current shard routing topology and imbalance metrics."""
+    router = ShardRouter()
+    sg = ShardedGraphStore(read_only=True)
+    topology = sg.describe()
+
+    # Gather project → shard mapping from all shards.
+    shard_project_counts: dict[int, list[str]] = {i: [] for i in range(router.num_shards)}
+    for p in sg.list_project_metadata():
+        pid = p.get("id", "")
+        idx = router.shard_for(pid)
+        shard_project_counts[idx].append(pid)
+
+    counts = [len(v) for v in shard_project_counts.values()]
+    total = sum(counts)
+    median = sorted(counts)[len(counts) // 2] if counts else 0
+    max_count = max(counts) if counts else 0
+    imbalance = (max_count / median) if median else 1.0
+
+    if as_json:
+        _echo_json({
+            "topology": topology,
+            "project_distribution": {str(k): v for k, v in shard_project_counts.items()},
+            "imbalance_ratio": round(imbalance, 2),
+        }, as_json=True)
+        return
+
+    click.secho(f"Shard topology ({router.num_shards} shards)", fg="cyan")
+    click.echo(f" Directory : {router.shards_dir}")
+    click.echo(f" Ring size : {len(router._ring)} virtual nodes ({router.num_shards} × {150})")
+    click.echo(f" Projects : {total} total, imbalance ratio {imbalance:.2f}x")
+    click.echo()
+    header = f"{'Shard':>6} {'Projects':>9} {'DB exists':>10} Path"
+    click.secho(header, fg="cyan")
+    click.echo("-" * 60)
+    for i, info in enumerate(topology.get("shards", [])):
+        plist = shard_project_counts.get(i, [])
+        exists_str = "yes" if info.get("exists") else "no"
+        click.echo(f"{i:>6} {len(plist):>9} {exists_str:>10} {info.get('db_path', '')}")
+        for pid in plist:
+            click.echo(f"{'':>6} {'':>9} {'':>10} {pid}")
+    if imbalance > 2.0:
+        click.secho(
+            f"\nWarning: imbalance ratio {imbalance:.1f}x. Consider re-indexing to redistribute projects.",
+            fg="yellow",
+        )
+
+
 @click.group()
 def main() -> None:
     """CodeSpine CLI."""
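Note: codespine/sharding/router.py itself is not expanded in this diff, so the only things the CLI reveals about ShardRouter are shard_for(), num_shards, shards_dir, and a _ring of roughly num_shards × 150 virtual nodes. Below is a minimal, self-contained sketch of that kind of consistent-hash routing; the class name, hash function, and ring layout are illustrative assumptions, not the package's actual implementation.

    # Hypothetical sketch — NOT codespine's real ShardRouter (router.py is not shown
    # in this diff). It only illustrates consistent hashing with 150 virtual nodes
    # per shard, matching what the CLI prints about the ring.
    import bisect
    import hashlib
    import os


    class DemoShardRouter:
        """Toy stand-in for ShardRouter; real field names may differ."""

        def __init__(self, num_shards: int = 4, vnodes: int = 150,
                     shards_dir: str = os.path.expanduser("~/.codespine/shards")):
            # CODESPINE_SHARDS override mirrors the comment added in config.py.
            self.num_shards = int(os.environ.get("CODESPINE_SHARDS", num_shards))
            self.shards_dir = shards_dir
            # Build the ring: num_shards × vnodes points, each mapping back to a shard index.
            self._ring = sorted(
                (self._hash(f"shard-{s}-vnode-{v}"), s)
                for s in range(self.num_shards)
                for v in range(vnodes)
            )
            self._keys = [h for h, _ in self._ring]

        @staticmethod
        def _hash(key: str) -> int:
            return int.from_bytes(hashlib.sha1(key.encode()).digest()[:8], "big")

        def shard_for(self, project_id: str) -> int:
            """Walk clockwise from the project's hash to the next virtual node."""
            pos = bisect.bisect(self._keys, self._hash(project_id)) % len(self._ring)
            return self._ring[pos][1]


    if __name__ == "__main__":
        r = DemoShardRouter()
        print(r.shard_for("my-service"))  # stable shard index in [0, num_shards)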
@@ -130,8 +325,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
             fg="yellow",
         )
 
-
-
+    # ShardedGraphStore routes each project to its dedicated DB shard.
+    # For single-project analysis this is transparent — shard() always
+    # returns a GraphStore pointing to the correct shard path.
+    sg = ShardedGraphStore(read_only=False)
+    # The indexer is initialised per-module below with the right shard store.
+    # We keep a single ShardedGraphStore to fan-out cross-module linking later.
 
     # --- Workspace → project → module detection ---
     # Level 1: workspace (e.g. ~/IdeaProjects/) may contain independent projects.
@@ -164,96 +363,69 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
 
     root_basename = os.path.basename(abs_path)
 
-    #
-
-
-
-
-
-
-
-
-    def _progress(event: str, payload: dict) -> None:
-        now = time.perf_counter()
-        if event == "scan_done":
-            _phase("Walking files...", f"{int(payload.get('files_found', 0))} files found")
-            return
-        if event == "plan_done":
-            to_index = int(payload.get("files_to_index", 0))
-            deleted = int(payload.get("deleted_files", 0))
-            mode = str(payload.get("mode", "incremental"))
-            parse_state["total"] = to_index
-            _phase("Index mode...", f"{mode} ({to_index} files to index, {deleted} deleted)")
-            if to_index == 0:
-                _phase("Parsing code...", "0/0")
-                parse_state["printed_zero"] = True
-            return
-        if event == "parse_progress":
-            indexed = int(payload.get("indexed", 0))
-            total = int(payload.get("total", 0))
-            parse_state["indexed"] = indexed
-            parse_state["total"] = total
-            if total == 0:
-                return
-            if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
-                click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
-                parse_state["shown"] = True
-                parse_state["last_ts"] = now
-            return
-        if event == "resolve_calls_start" and parse_state["shown"]:
-            click.echo()
-            parse_state["shown"] = False
-            call_state["started_at"] = now
-            _phase("Tracing calls...", "starting...")
-            return
-        if event == "resolve_calls_start":
-            call_state["started_at"] = now
-            _phase("Tracing calls...", "starting...")
-            return
-        if event == "resolve_calls_progress":
-            call_state["count"] = int(payload.get("calls_resolved", 0))
-            if (now - call_state["last_ts"]) >= 0.25:
-                elapsed_s = now - call_state["started_at"]
-                click.echo(
-                    f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
-                    nl=False,
-                )
-                call_state["shown"] = True
-                call_state["last_ts"] = now
-            return
-        if event == "resolve_calls_done":
-            if call_state["shown"]:
-                click.echo()
-                call_state["shown"] = False
-            elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
-            _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
-            return
-        if event == "resolve_types_start":
-            _phase("Analyzing types...", "running")
-            return
-        if event == "resolve_types_done":
-            _phase("Analyzing types...", f"{int(payload.get('type_relationships', 0))} type relationships")
-            return
-
-    # --- Index each module ---
+    # ── Group modules by target shard ─────────────────────────────────
+    # Modules that hash to different shards own separate KùzuDBs and can
+    # be indexed in parallel. Modules in the same shard (same project
+    # root for multi-module projects) are always indexed sequentially.
+    shard_groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
+    for mod_path, pid in modules_with_ids:
+        shard_groups[sg.router.shard_for(pid)].append((mod_path, pid))
+
     is_multi = len(modules_with_ids) > 1
+    parallel_mode = len(shard_groups) > 1  # ≥2 shards → true parallelism
+    output_lock = threading.Lock()
+
+    if parallel_mode:
+        click.secho(
+            f"Parallel mode: {len(shard_groups)} shards will be indexed concurrently.",
+            fg="cyan",
+        )
+
+    # Print which shard each module lands on (multi-module only).
+    if is_multi:
+        for s_idx, group in sorted(shard_groups.items()):
+            for _, pid in group:
+                click.secho(f" {pid:<40} → shard {s_idx}", fg="cyan")
+
+    # ── Dispatch to shards ────────────────────────────────────────────
     total_files_found = 0
+    all_results: list = []
     last_result = None
-
-
-
-
-
-
-
+
+    if parallel_mode:
+        max_workers = min(len(shard_groups), 4)
+        click.echo()
+        futures_map = {}
+        with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="codespine-shard") as ex:
+            for s_idx, group in shard_groups.items():
+                f = ex.submit(
+                    _index_shard_group,
+                    s_idx, group, sg, full, embed, output_lock, True,
+                )
+                futures_map[f] = s_idx
+
+            for future in as_completed(futures_map):
+                s_idx = futures_map[future]
+                try:
+                    ret_idx, results, n_files = future.result()
+                    all_results.extend(results)
+                    total_files_found += n_files
+                    if results:
+                        last_result = results[-1]
+                    with output_lock:
+                        click.secho(f" Shard {ret_idx} done ({n_files} files)", fg="green")
+                except Exception as exc:  # noqa: BLE001
+                    with output_lock:
+                        click.secho(f" Shard {s_idx} FAILED: {exc}", fg="red")
+    else:
+        # Serial path — single shard (or single module). Full progress UX.
+        only_shard_idx = next(iter(shard_groups))
+        only_group = shard_groups[only_shard_idx]
+        _, all_results, total_files_found = _index_shard_group(
+            only_shard_idx, only_group, sg, full, embed, output_lock, False,
         )
-
-
-    click.echo()
-    if parse_state["total"] == 0 and not parse_state["printed_zero"]:
-        _phase("Parsing code...", "0/0")
-    elif parse_state["indexed"] < parse_state["total"]:
-        _phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
+    if all_results:
+        last_result = all_results[-1]
 
     # ── Helper for in-place progress updates ────────────────────────────
     def _live_phase(label: str, status: str) -> None:
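Note: the dispatch logic in this hunk is a group-then-fan-out pattern — bucket modules by the shard that owns their project, run each bucket in its own thread, and keep same-shard work sequential because a single KùzuDB only tolerates one writer. A runnable toy sketch of just that pattern follows; index_group and shard_for here are stand-ins for illustration, not codespine APIs.

    # Toy sketch of "group by shard, fan out one thread per shard".
    # index_group stands in for _index_shard_group; shard_for for ShardRouter.shard_for.
    import threading
    from collections import defaultdict
    from concurrent.futures import ThreadPoolExecutor, as_completed


    def index_group(shard_idx: int, group: list[tuple[str, str]], lock: threading.Lock) -> tuple[int, int]:
        # Modules in one group share a shard (one DB), so they run sequentially here.
        done = 0
        for mod_path, project_id in group:
            done += 1  # real code would index mod_path into the shard's store
        with lock:
            print(f"[S{shard_idx}] indexed {done} module(s)")
        return shard_idx, done


    def shard_for(project_id: str, num_shards: int = 4) -> int:
        return hash(project_id) % num_shards  # simplistic stand-in, not consistent hashing


    modules = [("/ws/app-core", "app-core"), ("/ws/app-api", "app-api"), ("/ws/billing", "billing")]
    groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
    for path, pid in modules:
        groups[shard_for(pid)].append((path, pid))

    lock = threading.Lock()
    if len(groups) > 1:  # ≥2 shards → true parallelism, mirroring parallel_mode above
        with ThreadPoolExecutor(max_workers=min(len(groups), 4)) as ex:
            futures = {ex.submit(index_group, idx, grp, lock): idx for idx, grp in groups.items()}
            for fut in as_completed(futures):
                fut.result()
    else:
        for idx, grp in groups.items():
            index_group(idx, grp, lock)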
@@ -264,13 +436,18 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
         """Finalise an in-place phase line and move to the next line."""
         click.echo(f"\r✓ {label:<28} {result:<48}")
 
+    # For cross-module operations (cross-module linking, deep analysis, stats)
+    # we use the shard store for the root project (all modules share one shard).
+    root_project_id = last_result.project_id if last_result else root_basename
+    root_shard_store = sg.shard(root_project_id)
+
     # ── Cross-module call linking ──────────────────────────────────────
     if is_multi and len(modules_with_ids) > 1:
         xmod_label = "Cross-module linking..."
         _live_phase(xmod_label, "running")
         xmod_pids = [pid for _, pid in modules_with_ids]
         xmod_edges = link_cross_module_calls(
-
+            root_shard_store, project_ids=xmod_pids,
             progress=lambda s: _live_phase(xmod_label, s),
         )
         _finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")

@@ -287,7 +464,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     comm_label = "Detecting communities..."
     _live_phase(comm_label, "running")
     communities = detect_communities(
-
+        root_shard_store,
         progress=lambda s: _live_phase(comm_label, s),
     )
     _finish_phase(comm_label, f"{len(communities)} clusters found")
@@ -295,23 +472,23 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     flow_label = "Detecting execution flows..."
     _live_phase(flow_label, "running")
     flows = trace_execution_flows(
-
+        root_shard_store,
         progress=lambda s: _live_phase(flow_label, s),
     )
     _finish_phase(flow_label, f"{len(flows)} processes found")
 
     dead_label = "Finding dead code..."
     _live_phase(dead_label, "running")
-    dead = detect_dead_code(
+    dead = detect_dead_code(root_shard_store, limit=500)
     _finish_phase(dead_label, f"{_dead_result_count(dead)} unreachable symbols")
 
     coup_label = "Analyzing git history..."
     _live_phase(coup_label, "running")
-
+    root_shard_store.clear_coupling()
     coupling_root = abs_path
     coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
     coupling_pairs = compute_coupling(
-
+        root_shard_store,
         coupling_root,
         coupling_project,
         days=SETTINGS.default_coupling_days,
@@ -329,7 +506,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     flow_label = "Detecting execution flows..."
     _live_phase(flow_label, "running (lightweight)")
     try:
-        flows = trace_execution_flows(
+        flows = trace_execution_flows(root_shard_store, max_depth=3)
     except Exception:
         flows = []
     _finish_phase(flow_label, f"{len(flows)} flows (lightweight; rerun with --deep for full)")

@@ -337,14 +514,14 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     dead_label = "Finding dead code..."
     _live_phase(dead_label, "running (lightweight)")
     try:
-        dead = detect_dead_code(
+        dead = detect_dead_code(root_shard_store, limit=100)
     except Exception:
         dead = []
     _finish_phase(dead_label, f"{_dead_result_count(dead)} candidates (lightweight; rerun with --deep for full)")
 
     _phase("Analyzing git history...", "skipped (large repo; rerun with --deep)")
 
-    vector_count =
+    vector_count = root_shard_store.query_records(
         """
         MATCH (s:Symbol)
         WHERE s.embedding IS NOT NULL
@@ -355,8 +532,8 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
     vectors_stored = int(vector_count[0]["count"]) if vector_count else embeddings_generated
     _phase("Generating embeddings...", f"{vectors_stored} vectors stored")
 
-    symbol_count =
-    edge_count =
+    symbol_count = root_shard_store.query_records("MATCH (s:Symbol) RETURN count(s) as count")
+    edge_count = root_shard_store.query_records("MATCH ()-[r]->() RETURN count(r) as count")
     symbols = int(symbol_count[0]["count"]) if symbol_count else 0
     edges = int(edge_count[0]["count"]) if edge_count else 0
     elapsed = time.perf_counter() - started

@@ -376,7 +553,7 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
 
     # Detect unresolved imports → hint about unindexed sibling projects
     try:
-        unresolved = JavaIndexer.detect_unresolved_imports(
+        unresolved = JavaIndexer.detect_unresolved_imports(root_shard_store)
         if unresolved:
             click.echo()
             click.secho("⚠ Unresolved imports — consider indexing these projects:", fg="yellow")
@@ -387,13 +564,12 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
 
     # Publish a read replica so MCP and read-only CLI commands (search, stats…)
     # run against an isolated snapshot rather than competing with the write
-    # process's buffer pool.
-    # hot-reloads without restarting.
+    # process's buffer pool. Snapshot all open shards concurrently.
     snap_label = "Publishing read replica..."
     _live_phase(snap_label, "copying")
-
-
-    _finish_phase(snap_label, "MCP will reload automatically"
+    root_shard_store._recycle_conn()
+    sg.snapshot_all(background=False)
+    _finish_phase(snap_label, "MCP will reload automatically")
 
 
 @main.command()
@@ -523,10 +699,21 @@ def diff(range_spec: str, as_json: bool) -> None:
 
 @main.command()
 @click.option("--json", "as_json", is_flag=True)
-def stats(as_json: bool) -> None:
+@click.option("--shards", "show_shards", is_flag=True, help="Show shard topology and load distribution.")
+def stats(as_json: bool, show_shards: bool) -> None:
     """Show per-project and aggregate graph statistics."""
-
-
+    if show_shards:
+        _show_shard_topology(as_json)
+        return
+
+    # Fan-out across all shards so stats covers every project in the cluster.
+    sg = ShardedGraphStore(read_only=True)
+    all_projects_meta = sg.list_project_metadata()
+
+    # For detailed stats we need the per-project shard store.
+    def _project_store(pid: str):
+        return sg.shard(pid)
+
     if not projects:
         click.secho("No projects indexed yet. Run 'codespine analyse <path>'.", fg="yellow")
         return
@@ -534,10 +721,12 @@ def stats(as_json: bool) -> None:
     rows = []
     for p in projects:
         pid = p["id"]
-
+        # Route each query to the project's owning shard.
+        ps = _project_store(pid)
+        files = ps.query_records(
             "MATCH (f:File) WHERE f.project_id = $pid RETURN count(f) as n", {"pid": pid}
         )
-        classes =
+        classes = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f

@@ -546,7 +735,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        methods =
+        methods = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f

@@ -557,7 +746,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        calls =
+        calls = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f

@@ -568,7 +757,7 @@ def stats(as_json: bool) -> None:
             """,
             {"pid": pid},
         )
-        emb =
+        emb = ps.query_records(
             """
             MATCH (f:File) WHERE f.project_id = $pid
             WITH f

@@ -580,6 +769,7 @@ def stats(as_json: bool) -> None:
         rows.append({
             "project": pid,
             "path": p["path"],
+            "shard": sg.router.shard_for(pid),
             "files": files[0]["n"] if files else 0,
             "classes": classes[0]["n"] if classes else 0,
             "methods": methods[0]["n"] if methods else 0,

@@ -592,13 +782,13 @@ def stats(as_json: bool) -> None:
         return
 
     col_w = max(len(r["project"]) for r in rows)
-    header = f"{'Project':<{col_w}} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
+    header = f"{'Project':<{col_w}} {'Shard':>5} {'Files':>6} {'Classes':>8} {'Methods':>8} {'Calls':>7} {'Emb':>6} Path"
     click.secho(header, fg="cyan")
     click.echo("-" * len(header))
     total_files = total_classes = total_methods = total_calls = total_emb = 0
     for r in rows:
         click.echo(
-            f"{r['project']:<{col_w}} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
+            f"{r['project']:<{col_w}} {r.get('shard', 0):>5} {r['files']:>6} {r['classes']:>8} {r['methods']:>8} {r['calls_out']:>7} {r['embeddings']:>6} {r['path']}"
         )
         total_files += r["files"]
         total_classes += r["classes"]

@@ -608,7 +798,7 @@ def stats(as_json: bool) -> None:
     if len(rows) > 1:
         click.echo("-" * len(header))
         click.secho(
-            f"{'TOTAL':<{col_w}} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
+            f"{'TOTAL':<{col_w}} {'':>5} {total_files:>6} {total_classes:>8} {total_methods:>8} {total_calls:>7} {total_emb:>6}",
             fg="green",
         )
 

{codespine-0.9.5 → codespine-0.9.7}/codespine/config.py

@@ -4,8 +4,17 @@ from dataclasses import dataclass
 
 @dataclass(frozen=True)
 class Settings:
+    # Legacy single-DB paths — kept for backward compat and as defaults when
+    # sharding is disabled (num_shards == 1 or CODESPINE_SHARDS not set).
     db_path: str = os.path.expanduser("~/.codespine_db")
     db_snapshot_path: str = os.path.expanduser("~/.codespine_db_read")
+
+    # Sharding — new layout stores each shard under shards_dir/{N}/db
+    # num_shards: int, overridable via CODESPINE_SHARDS env var at runtime.
+    # ShardRouter reads CODESPINE_SHARDS directly; this field is the compiled default.
+    num_shards: int = 4
+    shards_dir: str = os.path.expanduser("~/.codespine/shards")
+
     pid_file: str = os.path.expanduser("~/.codespine.pid")
     log_file: str = os.path.expanduser("~/.codespine.log")
     embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")