codespine 0.9.6__tar.gz → 0.9.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {codespine-0.9.6 → codespine-0.9.8}/PKG-INFO +4 -1
  2. {codespine-0.9.6 → codespine-0.9.8}/codespine/__init__.py +1 -1
  3. {codespine-0.9.6 → codespine-0.9.8}/codespine/cli.py +205 -93
  4. {codespine-0.9.6 → codespine-0.9.8}/codespine/config.py +5 -1
  5. codespine-0.9.8/codespine/db/duckdb_store.py +882 -0
  6. {codespine-0.9.6 → codespine-0.9.8}/codespine/db/store.py +72 -29
  7. {codespine-0.9.6 → codespine-0.9.8}/codespine/sharding/store.py +37 -17
  8. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/PKG-INFO +4 -1
  9. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/SOURCES.txt +3 -0
  10. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/requires.txt +4 -0
  11. {codespine-0.9.6 → codespine-0.9.8}/pyproject.toml +6 -2
  12. codespine-0.9.8/tests/test_duckdb_store.py +401 -0
  13. codespine-0.9.8/tests/test_sharding.py +200 -0
  14. {codespine-0.9.6 → codespine-0.9.8}/LICENSE +0 -0
  15. {codespine-0.9.6 → codespine-0.9.8}/README.md +0 -0
  16. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/__init__.py +0 -0
  17. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/community.py +0 -0
  18. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/context.py +0 -0
  19. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/coupling.py +0 -0
  20. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/crossmodule.py +0 -0
  21. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/deadcode.py +0 -0
  22. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/flow.py +0 -0
  23. {codespine-0.9.6 → codespine-0.9.8}/codespine/analysis/impact.py +0 -0
  24. {codespine-0.9.6 → codespine-0.9.8}/codespine/db/__init__.py +0 -0
  25. {codespine-0.9.6 → codespine-0.9.8}/codespine/db/schema.py +0 -0
  26. {codespine-0.9.6 → codespine-0.9.8}/codespine/diff/__init__.py +0 -0
  27. {codespine-0.9.6 → codespine-0.9.8}/codespine/diff/branch_diff.py +0 -0
  28. {codespine-0.9.6 → codespine-0.9.8}/codespine/guide.py +0 -0
  29. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/__init__.py +0 -0
  30. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/call_resolver.py +0 -0
  31. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/di_resolver.py +0 -0
  32. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/engine.py +0 -0
  33. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/java_parser.py +0 -0
  34. {codespine-0.9.6 → codespine-0.9.8}/codespine/indexer/symbol_builder.py +0 -0
  35. {codespine-0.9.6 → codespine-0.9.8}/codespine/mcp/__init__.py +0 -0
  36. {codespine-0.9.6 → codespine-0.9.8}/codespine/mcp/server.py +0 -0
  37. {codespine-0.9.6 → codespine-0.9.8}/codespine/noise/__init__.py +0 -0
  38. {codespine-0.9.6 → codespine-0.9.8}/codespine/noise/blocklist.py +0 -0
  39. {codespine-0.9.6 → codespine-0.9.8}/codespine/overlay/__init__.py +0 -0
  40. {codespine-0.9.6 → codespine-0.9.8}/codespine/overlay/git_state.py +0 -0
  41. {codespine-0.9.6 → codespine-0.9.8}/codespine/overlay/merge.py +0 -0
  42. {codespine-0.9.6 → codespine-0.9.8}/codespine/overlay/store.py +0 -0
  43. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/__init__.py +0 -0
  44. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/bm25.py +0 -0
  45. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/fuzzy.py +0 -0
  46. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/hybrid.py +0 -0
  47. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/rrf.py +0 -0
  48. {codespine-0.9.6 → codespine-0.9.8}/codespine/search/vector.py +0 -0
  49. {codespine-0.9.6 → codespine-0.9.8}/codespine/sharding/__init__.py +0 -0
  50. {codespine-0.9.6 → codespine-0.9.8}/codespine/sharding/router.py +0 -0
  51. {codespine-0.9.6 → codespine-0.9.8}/codespine/watch/__init__.py +0 -0
  52. {codespine-0.9.6 → codespine-0.9.8}/codespine/watch/git_hook.py +0 -0
  53. {codespine-0.9.6 → codespine-0.9.8}/codespine/watch/watcher.py +0 -0
  54. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/dependency_links.txt +0 -0
  55. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/entry_points.txt +0 -0
  56. {codespine-0.9.6 → codespine-0.9.8}/codespine.egg-info/top_level.txt +0 -0
  57. {codespine-0.9.6 → codespine-0.9.8}/gindex.py +0 -0
  58. {codespine-0.9.6 → codespine-0.9.8}/setup.cfg +0 -0
  59. {codespine-0.9.6 → codespine-0.9.8}/tests/test_branch_diff_normalize.py +0 -0
  60. {codespine-0.9.6 → codespine-0.9.8}/tests/test_call_resolver.py +0 -0
  61. {codespine-0.9.6 → codespine-0.9.8}/tests/test_community_detection.py +0 -0
  62. {codespine-0.9.6 → codespine-0.9.8}/tests/test_deadcode.py +0 -0
  63. {codespine-0.9.6 → codespine-0.9.8}/tests/test_index_and_hybrid.py +0 -0
  64. {codespine-0.9.6 → codespine-0.9.8}/tests/test_java_parser.py +0 -0
  65. {codespine-0.9.6 → codespine-0.9.8}/tests/test_multimodule_index.py +0 -0
  66. {codespine-0.9.6 → codespine-0.9.8}/tests/test_overlay.py +0 -0
  67. {codespine-0.9.6 → codespine-0.9.8}/tests/test_search_ranking.py +0 -0
  68. {codespine-0.9.6 → codespine-0.9.8}/tests/test_store_recovery.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 0.9.6
3
+ Version: 0.9.8
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -55,11 +55,14 @@ Requires-Dist: numpy; extra == "ml"
55
55
  Provides-Extra: community
56
56
  Requires-Dist: igraph; extra == "community"
57
57
  Requires-Dist: leidenalg; extra == "community"
58
+ Provides-Extra: duckdb
59
+ Requires-Dist: duckdb>=0.10.0; extra == "duckdb"
58
60
  Provides-Extra: full
59
61
  Requires-Dist: sentence-transformers; extra == "full"
60
62
  Requires-Dist: numpy; extra == "full"
61
63
  Requires-Dist: igraph; extra == "full"
62
64
  Requires-Dist: leidenalg; extra == "full"
65
+ Requires-Dist: duckdb>=0.10.0; extra == "full"
63
66
  Dynamic: license-file
64
67
 
65
68
  # CodeSpine
@@ -1,4 +1,4 @@
1
1
  """CodeSpine package."""
2
2
 
3
3
  __all__ = ["__version__"]
4
- __version__ = "0.9.6"
4
+ __version__ = "0.9.8"
@@ -6,7 +6,10 @@ import os
6
6
  import signal
7
7
  import subprocess
8
8
  import sys
9
+ import threading
9
10
  import time
11
+ from collections import defaultdict
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
13
 
11
14
  import click
12
15
  import psutil
@@ -91,6 +94,149 @@ def _spinner_char() -> str:
91
94
  return "⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"[int(time.perf_counter() * 8) % 10]
92
95
 
93
96
 
97
def _index_shard_group(
    shard_idx: int,
    modules: list[tuple[str, str]],
    sg,
    full: bool,
    embed: bool,
    output_lock: threading.Lock,
    parallel: bool,
) -> tuple[int, list, int]:
    """Index one group of modules that share a shard.

    Modules within the group are always indexed sequentially (same KùzuDB).
    Multiple groups can run concurrently in different threads when they own
    different shards.

    Args:
        shard_idx: Index of the shard this group belongs to; used for the
            ``[S<n>]`` output prefix in parallel mode and echoed back in the
            return value.
        modules: ``(module_path, project_id)`` pairs to index, in order.
        sg: Object exposing ``shard(project_id)``, which yields the store
            handed to ``JavaIndexer``.
        full: Forwarded to ``JavaIndexer.index_project``.
        embed: Forwarded to ``JavaIndexer.index_project``.
        output_lock: Lock serialising console writes across worker threads.
            It is acquired once per write and never nested, so a plain
            (non-reentrant) lock is sufficient.
        parallel: When true, progress lines are prefixed with the shard tag
            and use the compact format instead of the bar/spinner.

    Returns:
        ``(shard_idx, all_results, total_files_found)`` -- the shard index,
        the list of per-module results returned by ``index_project``, and the
        sum of their ``files_found`` attributes.
    """
    results = []
    total_files = 0
    prefix = f"[S{shard_idx}] " if parallel else ""

    def _locked_echo(*args, **kwargs) -> None:
        """Thread-safe click.echo."""
        with output_lock:
            click.echo(*args, **kwargs)

    def _locked_phase(label: str, status: str) -> None:
        """Thread-safe _phase."""
        with output_lock:
            _phase(label, status)

    for mod_path, project_id in modules:
        # Per-module progress state (local -- no shared mutation).
        parse_state: dict = {"shown": False, "indexed": 0, "total": 0,
                             "last_ts": 0.0, "printed_zero": False}
        call_state: dict = {"shown": False, "count": 0, "last_ts": 0.0,
                            "started_at": 0.0}

        def _progress(event: str, payload: dict) -> None:
            """Render one indexer progress event to the console."""
            now = time.perf_counter()

            if event == "scan_done":
                _locked_phase(
                    f"{prefix}Walking files...",
                    f"{int(payload.get('files_found', 0))} files found",
                )
                return

            if event == "plan_done":
                to_index = int(payload.get("files_to_index", 0))
                deleted = int(payload.get("deleted_files", 0))
                mode = str(payload.get("mode", "incremental"))
                parse_state["total"] = to_index
                _locked_phase(
                    f"{prefix}Index mode...",
                    f"{mode} ({to_index} files, {deleted} deleted)",
                )
                if to_index == 0:
                    _locked_phase(f"{prefix}Parsing code...", "0/0")
                    parse_state["printed_zero"] = True
                return

            if event == "parse_progress":
                indexed = int(payload.get("indexed", 0))
                total = int(payload.get("total", 0))
                parse_state["indexed"] = indexed
                parse_state["total"] = total
                if total == 0:
                    return
                # Throttle redraws to ~5/s, but always draw the final tick.
                if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
                    if not parallel:
                        # In-place progress bar only makes sense in serial mode.
                        line = f"\r{prefix}Parsing code... {_bar(indexed, total)} {indexed}/{total} "
                    else:
                        line = f"\r{prefix}Parsing {indexed}/{total} "
                    _locked_echo(line, nl=False)
                    parse_state["shown"] = True
                    parse_state["last_ts"] = now
                return

            if event == "resolve_calls_start":
                if parse_state["shown"]:
                    # Terminate the in-place parse line before a new phase.
                    _locked_echo()
                    parse_state["shown"] = False
                call_state["started_at"] = now
                _locked_phase(f"{prefix}Tracing calls...", "starting...")
                return

            if event == "resolve_calls_progress":
                call_state["count"] = int(payload.get("calls_resolved", 0))
                if (now - call_state["last_ts"]) >= 0.25:
                    # Guard against a missing start event, mirroring the
                    # "resolve_calls_done" handler below.
                    elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
                    if not parallel:
                        line = (
                            f"\r{_spinner_char()} {prefix}Tracing calls... "
                            f"{call_state['count']:>6} resolved {elapsed_s:.1f}s "
                        )
                    else:
                        line = f"\r{prefix}Calls: {call_state['count']} ({elapsed_s:.0f}s) "
                    _locked_echo(line, nl=False)
                    call_state["shown"] = True
                    call_state["last_ts"] = now
                return

            if event == "resolve_calls_done":
                if call_state["shown"]:
                    _locked_echo()
                    call_state["shown"] = False
                elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
                n = int(payload.get("calls_resolved", 0))
                _locked_phase(f"{prefix}Tracing calls...", f"{n} calls resolved ({elapsed_s:.1f}s)")
                return

            if event == "resolve_types_start":
                _locked_phase(f"{prefix}Analyzing types...", "running")
                return

            if event == "resolve_types_done":
                n = int(payload.get("type_relationships", 0))
                _locked_phase(f"{prefix}Analyzing types...", f"{n} type relationships")
                return

        shard_store = sg.shard(project_id)
        indexer = JavaIndexer(shard_store)
        result = indexer.index_project(
            mod_path, full=full, progress=_progress, project_id=project_id, embed=embed
        )
        results.append(result)
        total_files += result.files_found

        # Flush any dangling in-place progress line (parse or call tracing)
        # so the next module's output starts on a fresh line.
        if parse_state["shown"] or call_state["shown"]:
            _locked_echo()

    return shard_idx, results, total_files
238
+
239
+
94
240
  def _show_shard_topology(as_json: bool) -> None:
95
241
  """Display the current shard routing topology and imbalance metrics."""
96
242
  router = ShardRouter()
@@ -217,103 +363,69 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
217
363
 
218
364
  root_basename = os.path.basename(abs_path)
219
365
 
220
- # Shared progress state (reset per module)
221
- parse_state = {"shown": False, "indexed": 0, "total": 0, "last_ts": 0.0, "printed_zero": False}
222
- call_state = {"shown": False, "count": 0, "last_ts": 0.0, "started_at": 0.0}
223
-
224
- def _reset_state() -> None:
225
- for k in list(parse_state):
226
- parse_state[k] = False if isinstance(parse_state[k], bool) else (0.0 if isinstance(parse_state[k], float) else 0)
227
- parse_state["last_ts"] = 0.0
228
-
229
- def _progress(event: str, payload: dict) -> None:
230
- now = time.perf_counter()
231
- if event == "scan_done":
232
- _phase("Walking files...", f"{int(payload.get('files_found', 0))} files found")
233
- return
234
- if event == "plan_done":
235
- to_index = int(payload.get("files_to_index", 0))
236
- deleted = int(payload.get("deleted_files", 0))
237
- mode = str(payload.get("mode", "incremental"))
238
- parse_state["total"] = to_index
239
- _phase("Index mode...", f"{mode} ({to_index} files to index, {deleted} deleted)")
240
- if to_index == 0:
241
- _phase("Parsing code...", "0/0")
242
- parse_state["printed_zero"] = True
243
- return
244
- if event == "parse_progress":
245
- indexed = int(payload.get("indexed", 0))
246
- total = int(payload.get("total", 0))
247
- parse_state["indexed"] = indexed
248
- parse_state["total"] = total
249
- if total == 0:
250
- return
251
- if indexed == total or (now - parse_state["last_ts"]) >= 0.2:
252
- click.echo(f"\rParsing code... {_bar(indexed, total)} {indexed}/{total} ", nl=False)
253
- parse_state["shown"] = True
254
- parse_state["last_ts"] = now
255
- return
256
- if event == "resolve_calls_start" and parse_state["shown"]:
257
- click.echo()
258
- parse_state["shown"] = False
259
- call_state["started_at"] = now
260
- _phase("Tracing calls...", "starting...")
261
- return
262
- if event == "resolve_calls_start":
263
- call_state["started_at"] = now
264
- _phase("Tracing calls...", "starting...")
265
- return
266
- if event == "resolve_calls_progress":
267
- call_state["count"] = int(payload.get("calls_resolved", 0))
268
- if (now - call_state["last_ts"]) >= 0.25:
269
- elapsed_s = now - call_state["started_at"]
270
- click.echo(
271
- f"\r{_spinner_char()} Tracing calls... {call_state['count']:>6} resolved {elapsed_s:.1f}s ",
272
- nl=False,
273
- )
274
- call_state["shown"] = True
275
- call_state["last_ts"] = now
276
- return
277
- if event == "resolve_calls_done":
278
- if call_state["shown"]:
279
- click.echo()
280
- call_state["shown"] = False
281
- elapsed_s = (now - call_state["started_at"]) if call_state["started_at"] else 0.0
282
- _phase("Tracing calls...", f"{int(payload.get('calls_resolved', 0))} calls resolved ({elapsed_s:.1f}s)")
283
- return
284
- if event == "resolve_types_start":
285
- _phase("Analyzing types...", "running")
286
- return
287
- if event == "resolve_types_done":
288
- _phase("Analyzing types...", f"{int(payload.get('type_relationships', 0))} type relationships")
289
- return
290
-
291
- # --- Index each module ---
366
+ # ── Group modules by target shard ─────────────────────────────────
367
+ # Modules that hash to different shards own separate KùzuDBs and can
368
+ # be indexed in parallel. Modules in the same shard (same project
369
+ # root for multi-module projects) are always indexed sequentially.
370
+ shard_groups: dict[int, list[tuple[str, str]]] = defaultdict(list)
371
+ for mod_path, pid in modules_with_ids:
372
+ shard_groups[sg.router.shard_for(pid)].append((mod_path, pid))
373
+
292
374
  is_multi = len(modules_with_ids) > 1
375
+ parallel_mode = len(shard_groups) > 1 # ≥2 shards → true parallelism
376
+ output_lock = threading.Lock()
377
+
378
+ if parallel_mode:
379
+ click.secho(
380
+ f"Parallel mode: {len(shard_groups)} shards will be indexed concurrently.",
381
+ fg="cyan",
382
+ )
383
+
384
+ # Print which shard each module lands on (multi-module only).
385
+ if is_multi:
386
+ for s_idx, group in sorted(shard_groups.items()):
387
+ for _, pid in group:
388
+ click.secho(f" {pid:<40} → shard {s_idx}", fg="cyan")
389
+
390
+ # ── Dispatch to shards ────────────────────────────────────────────
293
391
  total_files_found = 0
392
+ all_results: list = []
294
393
  last_result = None
295
- for idx, (module_path, project_id) in enumerate(modules_with_ids):
296
- if is_multi:
297
- shard_idx = sg.router.shard_for(project_id)
298
- click.echo()
299
- click.secho(
300
- f"[{idx + 1}/{len(modules_with_ids)}] Indexing: {project_id} (shard {shard_idx})",
301
- fg="cyan",
302
- )
303
- _reset_state()
304
- # Use the shard store for this project so data lands in the right DB.
305
- shard_store = sg.shard(project_id)
306
- indexer = JavaIndexer(shard_store)
307
- last_result = indexer.index_project(
308
- module_path, full=full, progress=_progress, project_id=project_id, embed=embed
394
+
395
+ if parallel_mode:
396
+ max_workers = min(len(shard_groups), 4)
397
+ click.echo()
398
+ futures_map = {}
399
+ with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="codespine-shard") as ex:
400
+ for s_idx, group in shard_groups.items():
401
+ f = ex.submit(
402
+ _index_shard_group,
403
+ s_idx, group, sg, full, embed, output_lock, True,
404
+ )
405
+ futures_map[f] = s_idx
406
+
407
+ for future in as_completed(futures_map):
408
+ s_idx = futures_map[future]
409
+ try:
410
+ ret_idx, results, n_files = future.result()
411
+ all_results.extend(results)
412
+ total_files_found += n_files
413
+ if results:
414
+ last_result = results[-1]
415
+ with output_lock:
416
+ click.secho(f" Shard {ret_idx} done ({n_files} files)", fg="green")
417
+ except Exception as exc: # noqa: BLE001
418
+ with output_lock:
419
+ click.secho(f" Shard {s_idx} FAILED: {exc}", fg="red")
420
+ else:
421
+ # Serial path — single shard (or single module). Full progress UX.
422
+ only_shard_idx = next(iter(shard_groups))
423
+ only_group = shard_groups[only_shard_idx]
424
+ _, all_results, total_files_found = _index_shard_group(
425
+ only_shard_idx, only_group, sg, full, embed, output_lock, False,
309
426
  )
310
- total_files_found += last_result.files_found
311
- if parse_state["shown"]:
312
- click.echo()
313
- if parse_state["total"] == 0 and not parse_state["printed_zero"]:
314
- _phase("Parsing code...", "0/0")
315
- elif parse_state["indexed"] < parse_state["total"]:
316
- _phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
427
+ if all_results:
428
+ last_result = all_results[-1]
317
429
 
318
430
  # ── Helper for in-place progress updates ────────────────────────────
319
431
  def _live_phase(label: str, status: str) -> None:
@@ -1,5 +1,5 @@
1
1
  import os
2
- from dataclasses import dataclass
2
+ from dataclasses import dataclass, field
3
3
 
4
4
 
5
5
  @dataclass(frozen=True)
@@ -15,6 +15,10 @@ class Settings:
15
15
  num_shards: int = 4
16
16
  shards_dir: str = os.path.expanduser("~/.codespine/shards")
17
17
 
18
+ # Storage backend: "kuzu" (default, property-graph) or "duckdb" (relational).
19
+ # Override at runtime via CODESPINE_BACKEND env var before starting the process.
20
+ backend: str = field(default_factory=lambda: os.environ.get("CODESPINE_BACKEND", "kuzu"))
21
+
18
22
  pid_file: str = os.path.expanduser("~/.codespine.pid")
19
23
  log_file: str = os.path.expanduser("~/.codespine.log")
20
24
  embedding_cache_path: str = os.path.expanduser("~/.codespine_embedding_cache.json")