codespine 0.9.6__tar.gz → 0.9.7__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {codespine-0.9.6 → codespine-0.9.7}/PKG-INFO +1 -1
  2. {codespine-0.9.6 → codespine-0.9.7}/codespine/__init__.py +1 -1
  3. {codespine-0.9.6 → codespine-0.9.7}/codespine/cli.py +205 -93
  4. {codespine-0.9.6 → codespine-0.9.7}/codespine/db/store.py +71 -29
  5. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/PKG-INFO +1 -1
  6. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/SOURCES.txt +1 -0
  7. {codespine-0.9.6 → codespine-0.9.7}/pyproject.toml +1 -1
  8. codespine-0.9.7/tests/test_sharding.py +200 -0
  9. {codespine-0.9.6 → codespine-0.9.7}/LICENSE +0 -0
  10. {codespine-0.9.6 → codespine-0.9.7}/README.md +0 -0
  11. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/__init__.py +0 -0
  12. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/community.py +0 -0
  13. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/context.py +0 -0
  14. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/coupling.py +0 -0
  15. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/crossmodule.py +0 -0
  16. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/deadcode.py +0 -0
  17. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/flow.py +0 -0
  18. {codespine-0.9.6 → codespine-0.9.7}/codespine/analysis/impact.py +0 -0
  19. {codespine-0.9.6 → codespine-0.9.7}/codespine/config.py +0 -0
  20. {codespine-0.9.6 → codespine-0.9.7}/codespine/db/__init__.py +0 -0
  21. {codespine-0.9.6 → codespine-0.9.7}/codespine/db/schema.py +0 -0
  22. {codespine-0.9.6 → codespine-0.9.7}/codespine/diff/__init__.py +0 -0
  23. {codespine-0.9.6 → codespine-0.9.7}/codespine/diff/branch_diff.py +0 -0
  24. {codespine-0.9.6 → codespine-0.9.7}/codespine/guide.py +0 -0
  25. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/__init__.py +0 -0
  26. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/call_resolver.py +0 -0
  27. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/di_resolver.py +0 -0
  28. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/engine.py +0 -0
  29. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/java_parser.py +0 -0
  30. {codespine-0.9.6 → codespine-0.9.7}/codespine/indexer/symbol_builder.py +0 -0
  31. {codespine-0.9.6 → codespine-0.9.7}/codespine/mcp/__init__.py +0 -0
  32. {codespine-0.9.6 → codespine-0.9.7}/codespine/mcp/server.py +0 -0
  33. {codespine-0.9.6 → codespine-0.9.7}/codespine/noise/__init__.py +0 -0
  34. {codespine-0.9.6 → codespine-0.9.7}/codespine/noise/blocklist.py +0 -0
  35. {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/__init__.py +0 -0
  36. {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/git_state.py +0 -0
  37. {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/merge.py +0 -0
  38. {codespine-0.9.6 → codespine-0.9.7}/codespine/overlay/store.py +0 -0
  39. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/__init__.py +0 -0
  40. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/bm25.py +0 -0
  41. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/fuzzy.py +0 -0
  42. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/hybrid.py +0 -0
  43. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/rrf.py +0 -0
  44. {codespine-0.9.6 → codespine-0.9.7}/codespine/search/vector.py +0 -0
  45. {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/__init__.py +0 -0
  46. {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/router.py +0 -0
  47. {codespine-0.9.6 → codespine-0.9.7}/codespine/sharding/store.py +0 -0
  48. {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/__init__.py +0 -0
  49. {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/git_hook.py +0 -0
  50. {codespine-0.9.6 → codespine-0.9.7}/codespine/watch/watcher.py +0 -0
  51. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/dependency_links.txt +0 -0
  52. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/entry_points.txt +0 -0
  53. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/requires.txt +0 -0
  54. {codespine-0.9.6 → codespine-0.9.7}/codespine.egg-info/top_level.txt +0 -0
  55. {codespine-0.9.6 → codespine-0.9.7}/gindex.py +0 -0
  56. {codespine-0.9.6 → codespine-0.9.7}/setup.cfg +0 -0
  57. {codespine-0.9.6 → codespine-0.9.7}/tests/test_branch_diff_normalize.py +0 -0
  58. {codespine-0.9.6 → codespine-0.9.7}/tests/test_call_resolver.py +0 -0
  59. {codespine-0.9.6 → codespine-0.9.7}/tests/test_community_detection.py +0 -0
  60. {codespine-0.9.6 → codespine-0.9.7}/tests/test_deadcode.py +0 -0
  61. {codespine-0.9.6 → codespine-0.9.7}/tests/test_index_and_hybrid.py +0 -0
  62. {codespine-0.9.6 → codespine-0.9.7}/tests/test_java_parser.py +0 -0
  63. {codespine-0.9.6 → codespine-0.9.7}/tests/test_multimodule_index.py +0 -0
  64. {codespine-0.9.6 → codespine-0.9.7}/tests/test_overlay.py +0 -0
  65. {codespine-0.9.6 → codespine-0.9.7}/tests/test_search_ranking.py +0 -0
  66. {codespine-0.9.6 → codespine-0.9.7}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.6
3
+ Version: 0.9.7
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.9.6"
4
+ __version__ = "0.9.7"
@@ -6,7 +6,10 @@ import os
6
6
  import signal
7
7
  import subprocess
8
8
  import sys
9
+ import threading
9
10
  import time
11
+ from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
13
 
11
14
  import click
12
15
  import psutil
@@ -91,6 +94,149 @@ def _spinner_char() -> str:
91
94
  return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
92
95
 
93
96
 
97
+ def _index_shard_group(
98
+ shard_idx: int,
99
+ modules: list[tuple[str, str]],
100
+ sg,
101
+ full: bool,
102
+ embed: bool,
103
+ output_lock: threading.Lock,
104
+ parallel: bool,
105
+ ) -> tuple[int, list, int]:
106
+ """Index one group of modules that share a shard.
107
+
108
+ Modules within the group are always indexed sequentially (same KùzuDB).
109
+ Multiple groups can run concurrently in different threads when they own
110
+ different shards.
111
+
112
+ Returns (shard_idx, all_results, total_files_found).
113
+ """
114
+ results = []
115
+ total_files = 0
116
+
117
+ def _locked_echo(*args, **kwargs) -> None:
118
+ """Thread-safe click.echo."""
119
+ with output_lock:
120
+ click.echo(*args, **kwargs)
121
+
122
+ def _locked_secho(*args, **kwargs) -> None:
123
+ with output_lock:
124
+ click.secho(*args, **kwargs)
125
+
126
+ prefix = f"[S{shard_idx}] " if parallel else ""
127
+
128
+ for mod_path, project_id in modules:
129
+ # Per-module progress state (local — no shared mutation).
130
+ parse_state: dict = {"shown": False, "indexed": 0, "total": 0,
131
+ "last_ts": 0.0, "printed_zero": False}
132
+ call_state: dict = {"shown": False, "count": 0, "last_ts": 0.0,
133
+ "started_at": 0.0}
134
+
135
+ def _progress(event: str, payload: dict) -> None:
136
+ now = time.perf_counter()
137
+ if event == "scan_done":
138
+ with output_lock:
139
+ _phase(f"{prefix}Walking files...", f"{int(payload.get('files_found', 0))} files found")
140
+ return
141
+ if event == "plan_done":
142
+ to_index = int(payload.get("files_to_index", 0))
143
+ deleted = int(payload.get("deleted_files", 0))
144
+ mode = str(payload.get("mode", "incremental"))
145
+ parse_state["total"] = to_index
146
+ with output_lock:
147
+ _phase(f"{prefix}Index mode...", f"{mode} ({to_index} files, {deleted} deleted)")
148
+ if to_index == 0:
149
+ with output_lock:
150
+ _phase(f"{prefix}Parsing code...", "0/0")
151
+ parse_state["printed_zero"] = True
152
+ return
153
+ if event == "parse_progress":
154
+ indexed = int(payload.get("indexed", 0))
155
+ total = int(payload.get("total", 0))
156
+ parse_state["indexed"] = indexed
157
+ parse_state["total"] = total
158
+ if total == 0:
159
+ return
160
+ if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
161
+ if not parallel:
162
+ # In-place progress bar only makes sense in serial mode.
163
+ click.echo(
164
+ f"\r{prefix}Parsing code... {_bar(indexed, total)} {indexed}/{total} ",
165
+ nl=False,
166
+ )
167
+ else:
168
+ with output_lock:
169
+ click.echo(
170
+ f"\r{prefix}Parsing {indexed}/{total} ",
171
+ nl=False,
172
+ )
173
+ parse_state["shown"] = True
174
+ parse_state["last_ts"] = now
175
+ return
176
+ if event in ("resolve_calls_start",):
177
+ if parse_state["shown"]:
178
+ with output_lock:
179
+ click.echo()
180
+ parse_state["shown"] = False
181
+ call_state["started_at"] = now
182
+ with output_lock:
183
+ _phase(f"{prefix}Tracing calls...", "starting...")
184
+ return
185
+ if event == "resolve_calls_progress":
186
+ call_state["count"] = int(payload.get("calls_resolved", 0))
187
+ if (now - call_state["last_ts"]) >= 0.25:
188
+ elapsed_s = now - call_state["started_at"]
189
+ if not parallel:
190
+ click.echo(
191
+ f"\r{_spinner_char()} {prefix}Tracing calls... "
192
+ f"{call_state['count']:>6} resolved {elapsed_s:.1f}s ",
193
+ nl=False,
194
+ )
195
+ else:
196
+ with output_lock:
197
+ click.echo(
198
+ f"\r{prefix}Calls: {call_state['count']} ({elapsed_s:.0f}s) ",
199
+ nl=False,
200
+ )
201
+ call_state["shown"] = True
202
+ call_state["last_ts"] = now
203
+ return
204
+ if event == "resolve_calls_done":
205
+ if call_state["shown"]:
206
+ with output_lock:
207
+ click.echo()
208
+ call_state["shown"] = False
209
+ elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
210
+ n = int(payload.get("calls_resolved", 0))
211
+ with output_lock:
212
+ _phase(f"{prefix}Tracing calls...", f"{n} calls resolved ({elapsed_s:.1f}s)")
213
+ return
214
+ if event == "resolve_types_start":
215
+ with output_lock:
216
+ _phase(f"{prefix}Analyzing types...", "running")
217
+ return
218
+ if event == "resolve_types_done":
219
+ n = int(payload.get("type_relationships", 0))
220
+ with output_lock:
221
+ _phase(f"{prefix}Analyzing types...", f"{n} type relationships")
222
+ return
223
+
224
+ shard_store = sg.shard(project_id)
225
+ indexer = JavaIndexer(shard_store)
226
+ result = indexer.index_project(
227
+ mod_path, full=full, progress=_progress, project_id=project_id, embed=embed
228
+ )
229
+ results.append(result)
230
+ total_files += result.files_found
231
+
232
+ # Flush any dangling progress line.
233
+ if parse_state["shown"]:
234
+ with output_lock:
235
+ click.echo()
236
+
237
+ return shard_idx, results, total_files
238
+
239
+
94
240
  def _show_shard_topology(as_json: bool) -> None:
95
241
  """Display the current shard routing topology and imbalance metrics."""
96
242
  router = ShardRouter()
@@ -217,103 +363,69 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
217
363
 
218
364
  root_basename = os.path.basename(abs_path)
219
365
 
220
- # Shared progress state (reset per module)
221
- parse_state = {"shown": False, "indexed": 0, "total": 0, "last_ts": 0.0, "printed_zero": False}
222
- call_state = {"shown": False, "count": 0, "last_ts": 0.0, "started_at": 0.0}
223
-
224
- def _reset_state() -> None:
225
- for k in list(parse_state):
226
- parse_state[k] = False if isinstance(parse_state[k], bool) else (0.0 if isinstance(parse_state[k], float) else 0)
227
- parse_state["last_ts"] = 0.0
228
-
229
- def _progress(event: str, payload: dict) -> None:
230
- now = time.perf_counter()
231
- if event == "scan_done":
232
- _phase("Walking files...", f"{int(payload.get('files_found', 0))} files found")
233
- return
234
- if event == "plan_done":
235
- to_index = int(payload.get("files_to_index", 0))
236
- deleted = int(payload.get("deleted_files", 0))
237
- mode = str(payload.get("mode", "incremental"))
238
- parse_state["total"] = to_index
239
- _phase("Index mode...", f"{mode} ({to_index} files to index, {deleted} deleted)")
240
- if to_index == 0:
241
- _phase("Parsing code...", "0/0")
242
- parse_state["printed_zero"] = True
243
- return
244
- if event == "parse_progress":
245
- indexed = int(payload.get("indexed", 0))
246
- total = int(payload.get("total", 0))
247
- parse_state["indexed"] = indexed
248
- parse_state["total"] = total
249
- if total == 0:
250
- return
251
- if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
252
- click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
253
- parse_state["shown"] = True
254
- parse_state["last_ts"] = now
255
- return
256
- if event == "resolve_calls_start" and parse_state["shown"]:
257
- click.echo()
258
- parse_state["shown"] = False
259
- call_state["started_at"] = now
260
- _phase("Tracing calls...", "starting...")
261
- return
262
- if event == "resolve_calls_start":
263
- call_state["started_at"] = now
264
- _phase("Tracing calls...", "starting...")
265
- return
266
- if event == "resolve_calls_progress":
267
- call_state["count"] = int(payload.get("calls_resolved", 0))
268
- if (now - call_state["last_ts"]) >= 0.25:
269
- elapsed_s = now - call_state["started_at"]
270
- click.echo(
271
- f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
272
- nl=False,
273
- )
274
- call_state["shown"] = True
275
- call_state["last_ts"] = now
276
- return
277
- if event == "resolve_calls_done":
278
- if call_state["shown"]:
279
- click.echo()
280
- call_state["shown"] = False
281
- elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
282
- _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
283
- return
284
- if event == "resolve_types_start":
285
- _phase("Analyzing types...", "running")
286
- return
287
- if event == "resolve_types_done":
288
- _phase("Analyzing types...", f"{int(payload.get('type_relationships', 0))} type relationships")
289
- return
290
-
291
- # --- Index each module ---
366
+ # ── Group modules by target shard ─────────────────────────────────
367
+ # Modules that hash to different shards own separate KùzuDBs and can
368
+ # be indexed in parallel. Modules in the same shard (same project
369
+ # root for multi-module projects) are always indexed sequentially.
370
+ shard_groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
371
+ for mod_path, pid in modules_with_ids:
372
+ shard_groups[sg.router.shard_for(pid)].append((mod_path, pid))
373
+
292
374
  is_multi = len(modules_with_ids) > 1
375
+ parallel_mode = len(shard_groups) > 1 # ≥2 shards → true parallelism
376
+ output_lock = threading.Lock()
377
+
378
+ if parallel_mode:
379
+ click.secho(
380
+ f"Parallel mode: {len(shard_groups)} shards will be indexed concurrently.",
381
+ fg="cyan",
382
+ )
383
+
384
+ # Print which shard each module lands on (multi-module only).
385
+ if is_multi:
386
+ for s_idx, group in sorted(shard_groups.items()):
387
+ for _, pid in group:
388
+ click.secho(f" {pid:<40} → shard {s_idx}", fg="cyan")
389
+
390
+ # ── Dispatch to shards ────────────────────────────────────────────
293
391
  total_files_found = 0
392
+ all_results: list = []
294
393
  last_result = None
295
- for idx, (module_path, project_id) in enumerate(modules_with_ids):
296
- if is_multi:
297
- shard_idx = sg.router.shard_for(project_id)
298
- click.echo()
299
- click.secho(
300
- f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
301
- fg="cyan",
302
- )
303
- _reset_state()
304
- # Use the shard store for this project so data lands in the right DB.
305
- shard_store = sg.shard(project_id)
306
- indexer = JavaIndexer(shard_store)
307
- last_result = indexer.index_project(
308
- module_path, full=full, progress=_progress, project_id=project_id, embed=embed
394
+
395
+ if parallel_mode:
396
+ max_workers = min(len(shard_groups), 4)
397
+ click.echo()
398
+ futures_map = {}
399
+ with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="codespine-shard") as ex:
400
+ for s_idx, group in shard_groups.items():
401
+ f = ex.submit(
402
+ _index_shard_group,
403
+ s_idx, group, sg, full, embed, output_lock, True,
404
+ )
405
+ futures_map[f] = s_idx
406
+
407
+ for future in as_completed(futures_map):
408
+ s_idx = futures_map[future]
409
+ try:
410
+ ret_idx, results, n_files = future.result()
411
+ all_results.extend(results)
412
+ total_files_found += n_files
413
+ if results:
414
+ last_result = results[-1]
415
+ with output_lock:
416
+ click.secho(f" Shard {ret_idx} done ({n_files} files)", fg="green")
417
+ except Exception as exc: # noqa: BLE001
418
+ with output_lock:
419
+ click.secho(f" Shard {s_idx} FAILED: {exc}", fg="red")
420
+ else:
421
+ # Serial path — single shard (or single module). Full progress UX.
422
+ only_shard_idx = next(iter(shard_groups))
423
+ only_group = shard_groups[only_shard_idx]
424
+ _, all_results, total_files_found = _index_shard_group(
425
+ only_shard_idx, only_group, sg, full, embed, output_lock, False,
309
426
  )
310
- total_files_found += last_result.files_found
311
- if parse_state["shown"]:
312
- click.echo()
313
- if parse_state["total"] == 0 and not parse_state["printed_zero"]:
314
- _phase("Parsing code...", "0/0")
315
- elif parse_state["indexed"] < parse_state["total"]:
316
- _phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
427
+ if all_results:
428
+ last_result = all_results[-1]
317
429
 
318
430
  # ── Helper for in-place progress updates ────────────────────────────
319
431
  def _live_phase(label: str, status: str) -> None:
@@ -495,35 +495,77 @@ class GraphStore:
495
495
  def upsert_symbols_batch(self, records: list[dict[str, Any]], create_mode: bool = False) -> None:
496
496
  if not records:
497
497
  return
498
- rows = [{"id": r["id"], "kind": r["kind"], "name": r["name"],
499
- "fqname": r["fqname"], "file_id": r["file_id"],
500
- "line": int(r["line"]), "col": int(r["col"]),
501
- "embedding": r.get("embedding")} for r in records]
502
- if create_mode:
503
- self.execute(
504
- """
505
- UNWIND $rows AS row
506
- MATCH (f:File {id: row.file_id})
507
- CREATE (s:Symbol {id: row.id, kind: row.kind, name: row.name,
508
- fqname: row.fqname, file_id: row.file_id,
509
- line: row.line, col: row.col, embedding: row.embedding})
510
- CREATE (f)-[:DECLARES]->(s)
511
- """,
512
- {"rows": rows},
513
- )
514
- else:
515
- self.execute(
516
- """
517
- UNWIND $rows AS row
518
- MATCH (f:File {id: row.file_id})
519
- MERGE (s:Symbol {id: row.id})
520
- SET s.kind = row.kind, s.name = row.name, s.fqname = row.fqname,
521
- s.file_id = row.file_id, s.line = row.line, s.col = row.col,
522
- s.embedding = row.embedding
523
- MERGE (f)-[:DECLARES]->(s)
524
- """,
525
- {"rows": rows},
526
- )
498
+ # Split into rows with and without embeddings.
499
+ # Kuzu's UNWIND parameter type inference treats None as STRING, which
500
+ # conflicts with the FLOAT[384] column type. Keeping the two groups
501
+ # separate avoids the type-mismatch error on fresh DBs.
502
+ rows_emb: list[dict] = []
503
+ rows_no_emb: list[dict] = []
504
+ for r in records:
505
+ emb = r.get("embedding")
506
+ base = {"id": r["id"], "kind": r["kind"], "name": r["name"],
507
+ "fqname": r["fqname"], "file_id": r["file_id"],
508
+ "line": int(r["line"]), "col": int(r["col"])}
509
+ if emb is not None:
510
+ rows_emb.append({**base, "embedding": emb})
511
+ else:
512
+ rows_no_emb.append(base)
513
+
514
+ op = "CREATE" if create_mode else "MERGE"
515
+ edge_op = "CREATE" if create_mode else "MERGE"
516
+
517
+ if rows_no_emb:
518
+ if create_mode:
519
+ self.execute(
520
+ """
521
+ UNWIND $rows AS row
522
+ MATCH (f:File {id: row.file_id})
523
+ CREATE (s:Symbol {id: row.id, kind: row.kind, name: row.name,
524
+ fqname: row.fqname, file_id: row.file_id,
525
+ line: row.line, col: row.col})
526
+ CREATE (f)-[:DECLARES]->(s)
527
+ """,
528
+ {"rows": rows_no_emb},
529
+ )
530
+ else:
531
+ self.execute(
532
+ """
533
+ UNWIND $rows AS row
534
+ MATCH (f:File {id: row.file_id})
535
+ MERGE (s:Symbol {id: row.id})
536
+ SET s.kind = row.kind, s.name = row.name, s.fqname = row.fqname,
537
+ s.file_id = row.file_id, s.line = row.line, s.col = row.col
538
+ MERGE (f)-[:DECLARES]->(s)
539
+ """,
540
+ {"rows": rows_no_emb},
541
+ )
542
+
543
+ if rows_emb:
544
+ if create_mode:
545
+ self.execute(
546
+ """
547
+ UNWIND $rows AS row
548
+ MATCH (f:File {id: row.file_id})
549
+ CREATE (s:Symbol {id: row.id, kind: row.kind, name: row.name,
550
+ fqname: row.fqname, file_id: row.file_id,
551
+ line: row.line, col: row.col, embedding: row.embedding})
552
+ CREATE (f)-[:DECLARES]->(s)
553
+ """,
554
+ {"rows": rows_emb},
555
+ )
556
+ else:
557
+ self.execute(
558
+ """
559
+ UNWIND $rows AS row
560
+ MATCH (f:File {id: row.file_id})
561
+ MERGE (s:Symbol {id: row.id})
562
+ SET s.kind = row.kind, s.name = row.name, s.fqname = row.fqname,
563
+ s.file_id = row.file_id, s.line = row.line, s.col = row.col,
564
+ s.embedding = row.embedding
565
+ MERGE (f)-[:DECLARES]->(s)
566
+ """,
567
+ {"rows": rows_emb},
568
+ )
527
569
 
528
570
  def add_call(self, source_id: str, target_id: str, confidence: float, reason: str) -> None:
529
571
  self.execute(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.6
3
+ Version: 0.9.7
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -60,4 +60,5 @@ tests/test_java_parser.py
60
60
  tests/test_multimodule_index.py
61
61
  tests/test_overlay.py
62
62
  tests/test_search_ranking.py
63
+ tests/test_sharding.py
63
64
  tests/test_store_recovery.py
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "codespine"
7
- version = "0.9.6"
7
+ version = "0.9.7"
8
8
  description = "Local Java code intelligence indexer backed by a graph database"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -0,0 +1,200 @@
1
+ """Tests for sharding infrastructure: ShardRouter + ShardedGraphStore."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ pytest.importorskip("kuzu")
10
+ pytest.importorskip("tree_sitter_java")
11
+
12
+ from codespine.sharding.router import ShardRouter
13
+ from codespine.sharding.store import ShardedGraphStore
14
+ from codespine.indexer.engine import JavaIndexer
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # ShardRouter
19
+ # ---------------------------------------------------------------------------
20
+
21
+
22
+ def test_router_co_location():
23
+ """All modules of the same project must hash to the same shard."""
24
+ r = ShardRouter(num_shards=8)
25
+ root_shard = r.shard_for("myapp")
26
+ assert r.shard_for("myapp::module-a") == root_shard
27
+ assert r.shard_for("myapp::module-b") == root_shard
28
+ assert r.shard_for("myapp::module-c") == root_shard
29
+
30
+
31
+ def test_router_single_shard_always_zero():
32
+ """With num_shards=1 every project must land on shard 0."""
33
+ r = ShardRouter(num_shards=1)
34
+ for pid in ["alpha", "beta", "gamma::sub", "delta"]:
35
+ assert r.shard_for(pid) == 0
36
+
37
+
38
+ def test_router_distribution(tmp_path: Path):
39
+ """With 4 shards and many distinct projects, at least 2 shards get used."""
40
+ r = ShardRouter(num_shards=4)
41
+ projects = [f"project-{i}" for i in range(50)]
42
+ used_shards = {r.shard_for(p) for p in projects}
43
+ assert len(used_shards) >= 2, "Poor distribution — expected multiple shards to be used"
44
+
45
+
46
+ def test_router_deterministic():
47
+ """Same project_id must always map to the same shard across instances."""
48
+ r1 = ShardRouter(num_shards=4)
49
+ r2 = ShardRouter(num_shards=4)
50
+ for pid in ["foo", "bar::baz", "qux"]:
51
+ assert r1.shard_for(pid) == r2.shard_for(pid)
52
+
53
+
54
+ def test_router_paths(tmp_path: Path):
55
+ r = ShardRouter(num_shards=3, shards_dir=str(tmp_path / "shards"))
56
+ assert r.db_path(0).endswith("/0/db")
57
+ assert r.snapshot_path(1).endswith("/1/db_read")
58
+
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # ShardedGraphStore — basic routing
62
+ # ---------------------------------------------------------------------------
63
+
64
+
65
+ def _write_java(path: Path, content: str) -> None:
66
+ path.parent.mkdir(parents=True, exist_ok=True)
67
+ path.write_text(content, encoding="utf-8")
68
+
69
+
70
+ def test_sharded_store_routes_modules_to_same_shard(tmp_path: Path):
71
+ """All modules of the same project must use the same GraphStore instance."""
72
+ sg = ShardedGraphStore(num_shards=4, shards_dir=str(tmp_path / "shards"))
73
+ store_a = sg.shard("myapp::module-a")
74
+ store_b = sg.shard("myapp::module-b")
75
+ store_c = sg.shard("myapp::module-c")
76
+ assert store_a is store_b
77
+ assert store_b is store_c
78
+ assert store_a._db_path == store_b._db_path
79
+
80
+
81
+ def test_sharded_store_different_projects_may_differ(tmp_path: Path):
82
+ """Two unrelated projects may (and likely will) land on different stores."""
83
+ sg = ShardedGraphStore(num_shards=8, shards_dir=str(tmp_path / "shards"))
84
+ # Find two project IDs that hash to different shards.
85
+ router = sg.router
86
+ p1, p2 = None, None
87
+ for i in range(200):
88
+ pid = f"project-{i}"
89
+ if p1 is None:
90
+ p1 = pid
91
+ elif router.shard_for(pid) != router.shard_for(p1):
92
+ p2 = pid
93
+ break
94
+ if p2 is None:
95
+ pytest.skip("All 200 projects happened to hash to the same shard — skip")
96
+ store_1 = sg.shard(p1)
97
+ store_2 = sg.shard(p2)
98
+ assert store_1 is not store_2
99
+ assert store_1._db_path != store_2._db_path
100
+
101
+
102
+ def test_sharded_store_list_projects_empty(tmp_path: Path):
103
+ sg = ShardedGraphStore(read_only=False, num_shards=2, shards_dir=str(tmp_path / "shards"))
104
+ # Force open all shards by reading — they're empty so result should be []
105
+ projects = sg.list_project_metadata()
106
+ assert projects == []
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Indexing via ShardedGraphStore
111
+ # ---------------------------------------------------------------------------
112
+
113
+
114
+ def test_index_single_project_via_sharded_store(tmp_path: Path):
115
+ """Indexing through ShardedGraphStore should produce queryable results."""
116
+ _write_java(
117
+ tmp_path / "src/main/java/com/example/Hello.java",
118
+ """
119
+ package com.example;
120
+ public class Hello {
121
+ public String greet(String name) { return "Hi " + name; }
122
+ }
123
+ """,
124
+ )
125
+
126
+ sg = ShardedGraphStore(num_shards=2, shards_dir=str(tmp_path / "shards"))
127
+ project_id = "hello-project"
128
+ shard_store = sg.shard(project_id)
129
+ result = JavaIndexer(shard_store).index_project(
130
+ str(tmp_path), full=True, project_id=project_id
131
+ )
132
+
133
+ assert result.files_indexed == 1
134
+ assert result.classes_indexed >= 1
135
+ assert result.methods_indexed >= 1
136
+
137
+ classes = shard_store.query_records(
138
+ "MATCH (c:Class) WHERE c.fqcn = $fqcn RETURN c.name as name",
139
+ {"fqcn": "com.example.Hello"},
140
+ )
141
+ assert classes, "Class not found in shard DB"
142
+ assert classes[0]["name"] == "Hello"
143
+
144
+ # list_project_metadata fan-out should find the project.
145
+ all_projects = sg.list_project_metadata()
146
+ assert any(p["id"] == project_id for p in all_projects)
147
+
148
+
149
+ def test_two_projects_indexed_into_separate_shards(tmp_path: Path):
150
+ """When two projects land on different shards they're stored independently."""
151
+ _write_java(
152
+ tmp_path / "proj-a" / "src" / "main" / "java" / "a" / "A.java",
153
+ "package a; public class A { public void alpha() {} }",
154
+ )
155
+ _write_java(
156
+ tmp_path / "proj-b" / "src" / "main" / "java" / "b" / "B.java",
157
+ "package b; public class B { public void beta() {} }",
158
+ )
159
+
160
+ sg = ShardedGraphStore(num_shards=8, shards_dir=str(tmp_path / "shards"))
161
+ router = sg.router
162
+
163
+ # Find project IDs that will use different shards.
164
+ pid_a = "proj-a"
165
+ pid_b = None
166
+ for candidate in [f"project-alt-{i}" for i in range(100)]:
167
+ if router.shard_for(candidate) != router.shard_for(pid_a):
168
+ pid_b = candidate
169
+ break
170
+ if pid_b is None:
171
+ pytest.skip("Could not find two IDs hashing to different shards")
172
+
173
+ JavaIndexer(sg.shard(pid_a)).index_project(
174
+ str(tmp_path / "proj-a"), full=True, project_id=pid_a
175
+ )
176
+ JavaIndexer(sg.shard(pid_b)).index_project(
177
+ str(tmp_path / "proj-b"), full=True, project_id=pid_b
178
+ )
179
+
180
+ # The two projects must be backed by distinct shard stores.
181
+ store_a = sg.shard(pid_a)
182
+ store_b = sg.shard(pid_b)
183
+ assert store_a is not store_b
184
+
185
+ # Methods are visible only in their owning shard.
186
+ methods_a = store_a.query_records("MATCH (m:Method) RETURN m.name as name")
187
+ methods_b = store_b.query_records("MATCH (m:Method) RETURN m.name as name")
188
+ names_a = {m["name"] for m in methods_a}
189
+ names_b = {m["name"] for m in methods_b}
190
+ assert "alpha" in names_a
191
+ assert "beta" in names_b
192
+ # Cross-shard isolation: alpha not in shard B, beta not in shard A.
193
+ assert "beta" not in names_a
194
+ assert "alpha" not in names_b
195
+
196
+ # Fan-out list should see both.
197
+ all_projects = sg.list_project_metadata()
198
+ all_ids = {p["id"] for p in all_projects}
199
+ assert pid_a in all_ids
200
+ assert pid_b in all_ids
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes