codespine 0.9.6__tar.gz → 0.9.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.9.6 → codespine-0.9.7}/PKG-INFO +1 -1
- {codespine-0.9.6 → codespine-0.9.7}/codespine/__init__.py +1 -1
- {codespine-0.9.6 → codespine-0.9.7}/codespine/cli.py +205 -93
- {codespine-0.9.6 → codespine-0.9.7}/codespine/db/store.py +71 -29
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/SOURCES.txt +1 -0
- {codespine-0.9.6 → codespine-0.9.7}/pyproject.toml +1 -1
- codespine-0.9.7/tests/test_sharding.py +200 -0
- {codespine-0.9.6 → codespine-0.9.7}/LICENSE +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/README.md +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/community.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/context.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/coupling.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/crossmodule.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/flow.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/impact.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/config.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/db/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/db/schema.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/diff/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/guide.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/di_resolver.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/engine.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/mcp/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/mcp/server.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/noise/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/noise/blocklist.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/git_state.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/merge.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/store.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/bm25.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/fuzzy.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/hybrid.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/rrf.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/search/vector.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/router.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/store.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/__init__.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/git_hook.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/watcher.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/gindex.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/setup.cfg +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_call_resolver.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_community_detection.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_deadcode.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_java_parser.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_multimodule_index.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_overlay.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_search_ranking.py +0 -0
- {codespine-0.9.6 → codespine-0.9.7}/tests/test_store_recovery.py +0 -0
|
@@ -6,7 +6,10 @@ import os
|
|
|
6
6
|
import signal
|
|
7
7
|
import subprocess
|
|
8
8
|
import sys
|
|
9
|
+
import threading
|
|
9
10
|
import time
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
13
|
|
|
11
14
|
import click
|
|
12
15
|
import psutil
|
|
@@ -91,6 +94,149 @@ def _spinner_char() -> str:
|
|
|
91
94
|
return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
|
|
92
95
|
|
|
93
96
|
|
|
97
|
+
def _index_shard_group(
|
|
98
|
+
shard_idx: int,
|
|
99
|
+
modules: list[tuple[str, str]],
|
|
100
|
+
sg,
|
|
101
|
+
full: bool,
|
|
102
|
+
embed: bool,
|
|
103
|
+
output_lock: threading.Lock,
|
|
104
|
+
parallel: bool,
|
|
105
|
+
) -> tuple[int, list, int]:
|
|
106
|
+
"""Index one group of modules that share a shard.
|
|
107
|
+
|
|
108
|
+
Modules within the group are always indexed sequentially (same KùzuDB).
|
|
109
|
+
Multiple groups can run concurrently in different threads when they own
|
|
110
|
+
different shards.
|
|
111
|
+
|
|
112
|
+
Returns (total_files_found, all_results, shard_idx).
|
|
113
|
+
"""
|
|
114
|
+
results = []
|
|
115
|
+
total_files = 0
|
|
116
|
+
|
|
117
|
+
def _locked_echo(*args, **kwargs) -> None:
|
|
118
|
+
"""Thread-safe click.echo."""
|
|
119
|
+
with output_lock:
|
|
120
|
+
click.echo(*args, **kwargs)
|
|
121
|
+
|
|
122
|
+
def _locked_secho(*args, **kwargs) -> None:
|
|
123
|
+
with output_lock:
|
|
124
|
+
click.secho(*args, **kwargs)
|
|
125
|
+
|
|
126
|
+
prefix = f"[S{shard_idx}] " if parallel else ""
|
|
127
|
+
|
|
128
|
+
for mod_path, project_id in modules:
|
|
129
|
+
# Per-module progress state (local — no shared mutation).
|
|
130
|
+
parse_state: dict = {"shown": False, "indexed": 0, "total": 0,
|
|
131
|
+
"last_ts": 0.0, "printed_zero": False}
|
|
132
|
+
call_state: dict = {"shown": False, "count": 0, "last_ts": 0.0,
|
|
133
|
+
"started_at": 0.0}
|
|
134
|
+
|
|
135
|
+
def _progress(event: str, payload: dict) -> None:
|
|
136
|
+
now = time.perf_counter()
|
|
137
|
+
if event == "scan_done":
|
|
138
|
+
with output_lock:
|
|
139
|
+
_phase(f"{prefix}Walking files...", f"{int(payload.get('files_found', 0))} files found")
|
|
140
|
+
return
|
|
141
|
+
if event == "plan_done":
|
|
142
|
+
to_index = int(payload.get("files_to_index", 0))
|
|
143
|
+
deleted = int(payload.get("deleted_files", 0))
|
|
144
|
+
mode = str(payload.get("mode", "incremental"))
|
|
145
|
+
parse_state["total"] = to_index
|
|
146
|
+
with output_lock:
|
|
147
|
+
_phase(f"{prefix}Index mode...", f"{mode} ({to_index} files, {deleted} deleted)")
|
|
148
|
+
if to_index == 0:
|
|
149
|
+
with output_lock:
|
|
150
|
+
_phase(f"{prefix}Parsing code...", "0/0")
|
|
151
|
+
parse_state["printed_zero"] = True
|
|
152
|
+
return
|
|
153
|
+
if event == "parse_progress":
|
|
154
|
+
indexed = int(payload.get("indexed", 0))
|
|
155
|
+
total = int(payload.get("total", 0))
|
|
156
|
+
parse_state["indexed"] = indexed
|
|
157
|
+
parse_state["total"] = total
|
|
158
|
+
if total == 0:
|
|
159
|
+
return
|
|
160
|
+
if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
|
|
161
|
+
if not parallel:
|
|
162
|
+
# In-place progress bar only makes sense in serial mode.
|
|
163
|
+
click.echo(
|
|
164
|
+
f"\r{prefix}Parsing code... {_bar(indexed, total)} {indexed}/{total} ",
|
|
165
|
+
nl=False,
|
|
166
|
+
)
|
|
167
|
+
else:
|
|
168
|
+
with output_lock:
|
|
169
|
+
click.echo(
|
|
170
|
+
f"\r{prefix}Parsing {indexed}/{total} ",
|
|
171
|
+
nl=False,
|
|
172
|
+
)
|
|
173
|
+
parse_state["shown"] = True
|
|
174
|
+
parse_state["last_ts"] = now
|
|
175
|
+
return
|
|
176
|
+
if event in ("resolve_calls_start",):
|
|
177
|
+
if parse_state["shown"]:
|
|
178
|
+
with output_lock:
|
|
179
|
+
click.echo()
|
|
180
|
+
parse_state["shown"] = False
|
|
181
|
+
call_state["started_at"] = now
|
|
182
|
+
with output_lock:
|
|
183
|
+
_phase(f"{prefix}Tracing calls...", "starting...")
|
|
184
|
+
return
|
|
185
|
+
if event == "resolve_calls_progress":
|
|
186
|
+
call_state["count"] = int(payload.get("calls_resolved", 0))
|
|
187
|
+
if (now - call_state["last_ts"]) >= 0.25:
|
|
188
|
+
elapsed_s = now - call_state["started_at"]
|
|
189
|
+
if not parallel:
|
|
190
|
+
click.echo(
|
|
191
|
+
f"\r{_spinner_char()} {prefix}Tracing calls... "
|
|
192
|
+
f"{call_state['count']:>6} resolved {elapsed_s:.1f}s ",
|
|
193
|
+
nl=False,
|
|
194
|
+
)
|
|
195
|
+
else:
|
|
196
|
+
with output_lock:
|
|
197
|
+
click.echo(
|
|
198
|
+
f"\r{prefix}Calls: {call_state['count']} ({elapsed_s:.0f}s) ",
|
|
199
|
+
nl=False,
|
|
200
|
+
)
|
|
201
|
+
call_state["shown"] = True
|
|
202
|
+
call_state["last_ts"] = now
|
|
203
|
+
return
|
|
204
|
+
if event == "resolve_calls_done":
|
|
205
|
+
if call_state["shown"]:
|
|
206
|
+
with output_lock:
|
|
207
|
+
click.echo()
|
|
208
|
+
call_state["shown"] = False
|
|
209
|
+
elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
|
|
210
|
+
n = int(payload.get("calls_resolved", 0))
|
|
211
|
+
with output_lock:
|
|
212
|
+
_phase(f"{prefix}Tracing calls...", f"{n} calls resolved ({elapsed_s:.1f}s)")
|
|
213
|
+
return
|
|
214
|
+
if event == "resolve_types_start":
|
|
215
|
+
with output_lock:
|
|
216
|
+
_phase(f"{prefix}Analyzing types...", "running")
|
|
217
|
+
return
|
|
218
|
+
if event == "resolve_types_done":
|
|
219
|
+
n = int(payload.get("type_relationships", 0))
|
|
220
|
+
with output_lock:
|
|
221
|
+
_phase(f"{prefix}Analyzing types...", f"{n} type relationships")
|
|
222
|
+
return
|
|
223
|
+
|
|
224
|
+
shard_store = sg.shard(project_id)
|
|
225
|
+
indexer = JavaIndexer(shard_store)
|
|
226
|
+
result = indexer.index_project(
|
|
227
|
+
mod_path, full=full, progress=_progress, project_id=project_id, embed=embed
|
|
228
|
+
)
|
|
229
|
+
results.append(result)
|
|
230
|
+
total_files += result.files_found
|
|
231
|
+
|
|
232
|
+
# Flush any dangling progress line.
|
|
233
|
+
if parse_state["shown"]:
|
|
234
|
+
with output_lock:
|
|
235
|
+
click.echo()
|
|
236
|
+
|
|
237
|
+
return shard_idx, results, total_files
|
|
238
|
+
|
|
239
|
+
|
|
94
240
|
def _show_shard_topology(as_json: bool) -> None:
|
|
95
241
|
"""Display the current shard routing topology and imbalance metrics."""
|
|
96
242
|
router = ShardRouter()
|
|
@@ -217,103 +363,69 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
217
363
|
|
|
218
364
|
root_basename = os.path.basename(abs_path)
|
|
219
365
|
|
|
220
|
-
#
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def _progress(event: str, payload: dict) -> None:
|
|
230
|
-
now = time.perf_counter()
|
|
231
|
-
if event == "scan_done":
|
|
232
|
-
_phase("Walking files...", f"{int(payload.get('files_found', 0))} files found")
|
|
233
|
-
return
|
|
234
|
-
if event == "plan_done":
|
|
235
|
-
to_index = int(payload.get("files_to_index", 0))
|
|
236
|
-
deleted = int(payload.get("deleted_files", 0))
|
|
237
|
-
mode = str(payload.get("mode", "incremental"))
|
|
238
|
-
parse_state["total"] = to_index
|
|
239
|
-
_phase("Index mode...", f"{mode} ({to_index} files to index, {deleted} deleted)")
|
|
240
|
-
if to_index == 0:
|
|
241
|
-
_phase("Parsing code...", "0/0")
|
|
242
|
-
parse_state["printed_zero"] = True
|
|
243
|
-
return
|
|
244
|
-
if event == "parse_progress":
|
|
245
|
-
indexed = int(payload.get("indexed", 0))
|
|
246
|
-
total = int(payload.get("total", 0))
|
|
247
|
-
parse_state["indexed"] = indexed
|
|
248
|
-
parse_state["total"] = total
|
|
249
|
-
if total == 0:
|
|
250
|
-
return
|
|
251
|
-
if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
|
|
252
|
-
click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
|
|
253
|
-
parse_state["shown"] = True
|
|
254
|
-
parse_state["last_ts"] = now
|
|
255
|
-
return
|
|
256
|
-
if event == "resolve_calls_start" and parse_state["shown"]:
|
|
257
|
-
click.echo()
|
|
258
|
-
parse_state["shown"] = False
|
|
259
|
-
call_state["started_at"] = now
|
|
260
|
-
_phase("Tracing calls...", "starting...")
|
|
261
|
-
return
|
|
262
|
-
if event == "resolve_calls_start":
|
|
263
|
-
call_state["started_at"] = now
|
|
264
|
-
_phase("Tracing calls...", "starting...")
|
|
265
|
-
return
|
|
266
|
-
if event == "resolve_calls_progress":
|
|
267
|
-
call_state["count"] = int(payload.get("calls_resolved", 0))
|
|
268
|
-
if (now - call_state["last_ts"]) >= 0.25:
|
|
269
|
-
elapsed_s = now - call_state["started_at"]
|
|
270
|
-
click.echo(
|
|
271
|
-
f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
|
|
272
|
-
nl=False,
|
|
273
|
-
)
|
|
274
|
-
call_state["shown"] = True
|
|
275
|
-
call_state["last_ts"] = now
|
|
276
|
-
return
|
|
277
|
-
if event == "resolve_calls_done":
|
|
278
|
-
if call_state["shown"]:
|
|
279
|
-
click.echo()
|
|
280
|
-
call_state["shown"] = False
|
|
281
|
-
elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
|
|
282
|
-
_phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
|
|
283
|
-
return
|
|
284
|
-
if event == "resolve_types_start":
|
|
285
|
-
_phase("Analyzing types...", "running")
|
|
286
|
-
return
|
|
287
|
-
if event == "resolve_types_done":
|
|
288
|
-
_phase("Analyzing types...", f"{int(payload.get('type_relationships', 0))} type relationships")
|
|
289
|
-
return
|
|
290
|
-
|
|
291
|
-
# --- Index each module ---
|
|
366
|
+
# ── Group modules by target shard ─────────────────────────────────
|
|
367
|
+
# Modules that hash to different shards own separate KùzuDBs and can
|
|
368
|
+
# be indexed in parallel. Modules in the same shard (same project
|
|
369
|
+
# root for multi-module projects) are always indexed sequentially.
|
|
370
|
+
shard_groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
|
|
371
|
+
for mod_path, pid in modules_with_ids:
|
|
372
|
+
shard_groups[sg.router.shard_for(pid)].append((mod_path, pid))
|
|
373
|
+
|
|
292
374
|
is_multi = len(modules_with_ids) > 1
|
|
375
|
+
parallel_mode = len(shard_groups) > 1 # ≥2 shards → true parallelism
|
|
376
|
+
output_lock = threading.Lock()
|
|
377
|
+
|
|
378
|
+
if parallel_mode:
|
|
379
|
+
click.secho(
|
|
380
|
+
f"Parallel mode: {len(shard_groups)} shards will be indexed concurrently.",
|
|
381
|
+
fg="cyan",
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
# Print which shard each module lands on (multi-module only).
|
|
385
|
+
if is_multi:
|
|
386
|
+
for s_idx, group in sorted(shard_groups.items()):
|
|
387
|
+
for _, pid in group:
|
|
388
|
+
click.secho(f" {pid:<40} → shard {s_idx}", fg="cyan")
|
|
389
|
+
|
|
390
|
+
# ── Dispatch to shards ────────────────────────────────────────────
|
|
293
391
|
total_files_found = 0
|
|
392
|
+
all_results: list = []
|
|
294
393
|
last_result = None
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
394
|
+
|
|
395
|
+
if parallel_mode:
|
|
396
|
+
max_workers = min(len(shard_groups), 4)
|
|
397
|
+
click.echo()
|
|
398
|
+
futures_map = {}
|
|
399
|
+
with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="codespine-shard") as ex:
|
|
400
|
+
for s_idx, group in shard_groups.items():
|
|
401
|
+
f = ex.submit(
|
|
402
|
+
_index_shard_group,
|
|
403
|
+
s_idx, group, sg, full, embed, output_lock, True,
|
|
404
|
+
)
|
|
405
|
+
futures_map[f] = s_idx
|
|
406
|
+
|
|
407
|
+
for future in as_completed(futures_map):
|
|
408
|
+
s_idx = futures_map[future]
|
|
409
|
+
try:
|
|
410
|
+
ret_idx, results, n_files = future.result()
|
|
411
|
+
all_results.extend(results)
|
|
412
|
+
total_files_found += n_files
|
|
413
|
+
if results:
|
|
414
|
+
last_result = results[-1]
|
|
415
|
+
with output_lock:
|
|
416
|
+
click.secho(f" Shard {ret_idx} done ({n_files} files)", fg="green")
|
|
417
|
+
except Exception as exc: # noqa: BLE001
|
|
418
|
+
with output_lock:
|
|
419
|
+
click.secho(f" Shard {s_idx} FAILED: {exc}", fg="red")
|
|
420
|
+
else:
|
|
421
|
+
# Serial path — single shard (or single module). Full progress UX.
|
|
422
|
+
only_shard_idx = next(iter(shard_groups))
|
|
423
|
+
only_group = shard_groups[only_shard_idx]
|
|
424
|
+
_, all_results, total_files_found = _index_shard_group(
|
|
425
|
+
only_shard_idx, only_group, sg, full, embed, output_lock, False,
|
|
309
426
|
)
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
click.echo()
|
|
313
|
-
if parse_state["total"] == 0 and not parse_state["printed_zero"]:
|
|
314
|
-
_phase("Parsing code...", "0/0")
|
|
315
|
-
elif parse_state["indexed"] < parse_state["total"]:
|
|
316
|
-
_phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
|
|
427
|
+
if all_results:
|
|
428
|
+
last_result = all_results[-1]
|
|
317
429
|
|
|
318
430
|
# ── Helper for in-place progress updates ────────────────────────────
|
|
319
431
|
def _live_phase(label: str, status: str) -> None:
|
|
@@ -495,35 +495,77 @@ class GraphStore:
|
|
|
495
495
|
def upsert_symbols_batch(self, records: list[dict[str, Any]], create_mode: bool = False) -> None:
    """Write a batch of Symbol nodes and their ``File -DECLARES-> Symbol`` edges.

    Args:
        records: One dict per symbol. Each must provide ``id``, ``kind``,
            ``name``, ``fqname``, ``file_id``, ``line`` and ``col``; an
            ``embedding`` entry is optional. ``line`` and ``col`` are coerced
            with ``int()`` before being sent.
        create_mode: When ``True`` the node and edge are written with
            ``CREATE``; otherwise the node is ``MERGE``d on ``id``, its
            properties are ``SET``, and the edge is ``MERGE``d.

    Returns:
        None. An empty ``records`` list issues no query at all.
    """
    if not records:
        return

    # Split into rows with and without embeddings.
    # Kuzu's UNWIND parameter type inference treats None as STRING, which
    # conflicts with the FLOAT[384] column type. Keeping the two groups
    # separate avoids the type-mismatch error on fresh DBs.
    rows_emb: list[dict] = []
    rows_no_emb: list[dict] = []
    for r in records:
        emb = r.get("embedding")
        base = {"id": r["id"], "kind": r["kind"], "name": r["name"],
                "fqname": r["fqname"], "file_id": r["file_id"],
                "line": int(r["line"]), "col": int(r["col"])}
        if emb is not None:
            rows_emb.append({**base, "embedding": emb})
        else:
            rows_no_emb.append(base)

    # Rows without an embedding are written first, then rows carrying one.
    if rows_no_emb:
        if create_mode:
            self.execute(
                """
                UNWIND $rows AS row
                MATCH (f:File {id: row.file_id})
                CREATE (s:Symbol {id: row.id, kind: row.kind, name: row.name,
                                  fqname: row.fqname, file_id: row.file_id,
                                  line: row.line, col: row.col})
                CREATE (f)-[:DECLARES]->(s)
                """,
                {"rows": rows_no_emb},
            )
        else:
            self.execute(
                """
                UNWIND $rows AS row
                MATCH (f:File {id: row.file_id})
                MERGE (s:Symbol {id: row.id})
                SET s.kind = row.kind, s.name = row.name, s.fqname = row.fqname,
                    s.file_id = row.file_id, s.line = row.line, s.col = row.col
                MERGE (f)-[:DECLARES]->(s)
                """,
                {"rows": rows_no_emb},
            )

    if rows_emb:
        if create_mode:
            self.execute(
                """
                UNWIND $rows AS row
                MATCH (f:File {id: row.file_id})
                CREATE (s:Symbol {id: row.id, kind: row.kind, name: row.name,
                                  fqname: row.fqname, file_id: row.file_id,
                                  line: row.line, col: row.col, embedding: row.embedding})
                CREATE (f)-[:DECLARES]->(s)
                """,
                {"rows": rows_emb},
            )
        else:
            self.execute(
                """
                UNWIND $rows AS row
                MATCH (f:File {id: row.file_id})
                MERGE (s:Symbol {id: row.id})
                SET s.kind = row.kind, s.name = row.name, s.fqname = row.fqname,
                    s.file_id = row.file_id, s.line = row.line, s.col = row.col,
                    s.embedding = row.embedding
                MERGE (f)-[:DECLARES]->(s)
                """,
                {"rows": rows_emb},
            )
|
|
527
569
|
|
|
528
570
|
def add_call(self, source_id: str, target_id: str, confidence: float, reason: str) -> None:
|
|
529
571
|
self.execute(
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""Tests for sharding infrastructure: ShardRouter + ShardedGraphStore."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
pytest.importorskip("kuzu")
|
|
10
|
+
pytest.importorskip("tree_sitter_java")
|
|
11
|
+
|
|
12
|
+
from codespine.sharding.router import ShardRouter
|
|
13
|
+
from codespine.sharding.store import ShardedGraphStore
|
|
14
|
+
from codespine.indexer.engine import JavaIndexer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# ShardRouter
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_router_co_location():
    """Each ``myapp::<module>`` id must route to the same shard as ``myapp``."""
    router = ShardRouter(num_shards=8)
    expected_shard = router.shard_for("myapp")
    for module_id in ("myapp::module-a", "myapp::module-b", "myapp::module-c"):
        assert router.shard_for(module_id) == expected_shard
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_router_single_shard_always_zero():
    """A router configured with a single shard sends every id to shard 0."""
    router = ShardRouter(num_shards=1)
    project_ids = ("alpha", "beta", "gamma::sub", "delta")
    assert all(router.shard_for(project_id) == 0 for project_id in project_ids)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_router_distribution(tmp_path: Path):
    """Fifty distinct project ids over 4 shards must hit at least 2 shards."""
    router = ShardRouter(num_shards=4)
    used_shards = set()
    for index in range(50):
        used_shards.add(router.shard_for(f"project-{index}"))
    assert len(used_shards) >= 2, "Poor distribution — expected multiple shards to be used"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_router_deterministic():
    """Two routers built with the same shard count agree on every id."""
    first = ShardRouter(num_shards=4)
    second = ShardRouter(num_shards=4)
    project_ids = ("foo", "bar::baz", "qux")
    assert all(
        first.shard_for(project_id) == second.shard_for(project_id)
        for project_id in project_ids
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_router_paths(tmp_path: Path):
    """Shard DB and snapshot paths end in ``<shard>/db`` and ``<shard>/db_read``.

    The trailing components are compared via ``Path.parts`` rather than a
    hard-coded ``/`` suffix, so the check does not depend on the platform's
    path separator.
    """
    r = ShardRouter(num_shards=3, shards_dir=str(tmp_path / "shards"))
    assert Path(r.db_path(0)).parts[-2:] == ("0", "db")
    assert Path(r.snapshot_path(1)).parts[-2:] == ("1", "db_read")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
# ShardedGraphStore — basic routing
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _write_java(path: Path, content: str) -> None:
|
|
66
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
67
|
+
path.write_text(content, encoding="utf-8")
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def test_sharded_store_routes_modules_to_same_shard(tmp_path: Path):
    """Three modules of ``myapp`` must resolve to one and the same store object."""
    sharded = ShardedGraphStore(num_shards=4, shards_dir=str(tmp_path / "shards"))
    first, second, third = (
        sharded.shard(f"myapp::module-{suffix}") for suffix in ("a", "b", "c")
    )
    assert first is second
    assert second is third
    assert first._db_path == second._db_path
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_sharded_store_different_projects_may_differ(tmp_path: Path):
    """Projects routed to different shards get distinct stores and DB paths."""
    sharded = ShardedGraphStore(num_shards=8, shards_dir=str(tmp_path / "shards"))
    shard_of = sharded.router.shard_for

    # "project-0" is the anchor; look for the first later id on another shard.
    anchor = "project-0"
    other = next(
        (
            f"project-{index}"
            for index in range(1, 200)
            if shard_of(f"project-{index}") != shard_of(anchor)
        ),
        None,
    )
    if other is None:
        pytest.skip("All 200 projects happened to hash to the same shard — skip")

    anchor_store = sharded.shard(anchor)
    other_store = sharded.shard(other)
    assert anchor_store is not other_store
    assert anchor_store._db_path != other_store._db_path
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_sharded_store_list_projects_empty(tmp_path: Path):
    """With nothing indexed, the project metadata listing is an empty list."""
    shards_dir = str(tmp_path / "shards")
    sharded = ShardedGraphStore(read_only=False, num_shards=2, shards_dir=shards_dir)
    listing = sharded.list_project_metadata()
    assert listing == []
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
# Indexing via ShardedGraphStore
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def test_index_single_project_via_sharded_store(tmp_path: Path):
    """Index one Java class through a shard store and read it back."""
    java_file = tmp_path / "src/main/java/com/example/Hello.java"
    _write_java(
        java_file,
        """
package com.example;
public class Hello {
public String greet(String name) { return "Hi " + name; }
}
""",
    )

    sharded = ShardedGraphStore(num_shards=2, shards_dir=str(tmp_path / "shards"))
    project_id = "hello-project"
    store = sharded.shard(project_id)
    indexer = JavaIndexer(store)
    outcome = indexer.index_project(str(tmp_path), full=True, project_id=project_id)

    assert outcome.files_indexed == 1
    assert outcome.classes_indexed >= 1
    assert outcome.methods_indexed >= 1

    # The class must be queryable directly from the shard that indexed it.
    rows = store.query_records(
        "MATCH (c:Class) WHERE c.fqcn = $fqcn RETURN c.name as name",
        {"fqcn": "com.example.Hello"},
    )
    assert rows, "Class not found in shard DB"
    assert rows[0]["name"] == "Hello"

    # The store-wide metadata listing must include the indexed project.
    listed = sharded.list_project_metadata()
    assert any(entry["id"] == project_id for entry in listed)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_two_projects_indexed_into_separate_shards(tmp_path: Path):
    """Two projects on different shards keep their methods isolated."""
    root_a = tmp_path / "proj-a"
    root_b = tmp_path / "proj-b"
    _write_java(
        root_a / "src" / "main" / "java" / "a" / "A.java",
        "package a; public class A { public void alpha() {} }",
    )
    _write_java(
        root_b / "src" / "main" / "java" / "b" / "B.java",
        "package b; public class B { public void beta() {} }",
    )

    sharded = ShardedGraphStore(num_shards=8, shards_dir=str(tmp_path / "shards"))
    shard_of = sharded.router.shard_for

    # Pick a second project id that routes to a different shard than "proj-a".
    pid_a = "proj-a"
    pid_b = next(
        (
            f"project-alt-{index}"
            for index in range(100)
            if shard_of(f"project-alt-{index}") != shard_of(pid_a)
        ),
        None,
    )
    if pid_b is None:
        pytest.skip("Could not find two IDs hashing to different shards")

    JavaIndexer(sharded.shard(pid_a)).index_project(str(root_a), full=True, project_id=pid_a)
    JavaIndexer(sharded.shard(pid_b)).index_project(str(root_b), full=True, project_id=pid_b)

    store_a = sharded.shard(pid_a)
    store_b = sharded.shard(pid_b)
    assert store_a is not store_b

    # Collect the method names visible in each shard.
    method_query = "MATCH (m:Method) RETURN m.name as name"
    names_a = {row["name"] for row in store_a.query_records(method_query)}
    names_b = {row["name"] for row in store_b.query_records(method_query)}
    assert "alpha" in names_a
    assert "beta" in names_b
    # Cross-shard isolation: neither shard sees the other's method.
    assert "beta" not in names_a
    assert "alpha" not in names_b

    # The store-wide listing must report both projects.
    listed_ids = {entry["id"] for entry in sharded.list_project_metadata()}
    assert pid_a in listed_ids
    assert pid_b in listed_ids
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|