java-codebase-rag 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
java_codebase_rag/cli.py CHANGED
@@ -21,13 +21,14 @@ from java_codebase_rag.config import (
21
21
  index_dir_has_existing_artifacts,
22
22
  resolve_operator_config,
23
23
  )
24
+ from java_codebase_rag._fdlimit import raise_fd_limit
24
25
  from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update, run_incremental_graph
25
26
  from java_ontology import VALID_UNRESOLVED_CALL_REASONS
26
27
 
27
- KUZU_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"
28
+ LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"
28
29
 
29
30
  _INCREMENT_WARNING_LINES = (
30
- "WARNING: AST graph (Kuzu) incremental rebuild is not yet implemented.",
31
+ "WARNING: AST graph (LadybugDB) incremental rebuild is not yet implemented.",
31
32
  "The graph reflects the index state from the last `init` or `reprocess`,",
32
33
  "which means `find`, `neighbors`, and `describe` may return stale results",
33
34
  "for files changed since then.",
@@ -37,8 +38,8 @@ _INCREMENT_WARNING_LINES = (
37
38
  "For an up-to-date graph, run:",
38
39
  " java-codebase-rag reprocess",
39
40
  "",
40
- "Track progress on Kuzu incremental rebuild:",
41
- f" {KUZU_INCREMENTAL_TRACKING_ISSUE_URL}",
41
+ "Track progress on LadybugDB incremental rebuild:",
42
+ f" {LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL}",
42
43
  )
43
44
 
44
45
  _REFRESH_DEPRECATION = (
@@ -47,7 +48,7 @@ _REFRESH_DEPRECATION = (
47
48
  )
48
49
 
49
50
  _REPROCESS_DRIFT_VECTORS_ONLY = (
50
- "java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.kuzu) was NOT rebuilt "
51
+ "java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.lbug) was NOT rebuilt "
51
52
  "and may now reflect a stale source snapshot."
52
53
  )
53
54
 
@@ -178,7 +179,7 @@ def _emit(value: Any) -> None:
178
179
  print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None))
179
180
 
180
181
 
181
- def _emit_increment_kuzu_warning() -> None:
182
+ def _emit_increment_ladybug_warning() -> None:
182
183
  for line in _INCREMENT_WARNING_LINES:
183
184
  print(line, file=sys.stderr)
184
185
 
@@ -289,7 +290,7 @@ def _cmd_init(args: argparse.Namespace) -> int:
289
290
  print(file=sys.stderr, flush=True)
290
291
  g = run_build_ast_graph(
291
292
  source_root=cfg.source_root,
292
- kuzu_path=cfg.kuzu_path,
293
+ ladybug_path=cfg.ladybug_path,
293
294
  verbose=verbose,
294
295
  quiet=bool(args.quiet),
295
296
  env=env,
@@ -319,7 +320,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
319
320
  # Check for --vectors-only flag
320
321
  vectors_only = bool(getattr(args, "vectors_only", False))
321
322
  if vectors_only:
322
- _emit_increment_kuzu_warning()
323
+ _emit_increment_ladybug_warning()
323
324
 
324
325
  def work() -> int:
325
326
  env = cfg.subprocess_env()
@@ -350,7 +351,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
350
351
  # Run incremental graph update
351
352
  g = run_incremental_graph(
352
353
  source_root=cfg.source_root,
353
- kuzu_path=cfg.kuzu_path,
354
+ ladybug_path=cfg.ladybug_path,
354
355
  verbose=bool(args.verbose),
355
356
  quiet=bool(args.quiet),
356
357
  env=env,
@@ -437,7 +438,7 @@ def _cmd_reprocess(args: argparse.Namespace) -> int:
437
438
  if graph_only:
438
439
  g = run_build_ast_graph(
439
440
  source_root=cfg.source_root,
440
- kuzu_path=cfg.kuzu_path,
441
+ ladybug_path=cfg.ladybug_path,
441
442
  verbose=verbose,
442
443
  quiet=bool(args.quiet),
443
444
  env=env,
@@ -509,7 +510,7 @@ def _cmd_erase(args: argparse.Namespace) -> int:
509
510
  cfg = _resolved_from_ns(args)
510
511
  _startup_hints(cfg)
511
512
  cfg.apply_to_os_environ()
512
- to_describe: list[Path] = [cfg.kuzu_path, cfg.cocoindex_db]
513
+ to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db]
513
514
  if cfg.index_dir.is_dir():
514
515
  try:
515
516
  import lancedb
@@ -546,8 +547,8 @@ def _cmd_erase(args: argparse.Namespace) -> int:
546
547
  )
547
548
  elif drop.returncode != 0:
548
549
  print(clip(drop.stderr, 4000), file=sys.stderr)
549
- if cfg.kuzu_path.exists():
550
- shutil.rmtree(cfg.kuzu_path, ignore_errors=True)
550
+ if cfg.ladybug_path.exists():
551
+ shutil.rmtree(cfg.ladybug_path, ignore_errors=True)
551
552
  if cfg.cocoindex_db.exists():
552
553
  try:
553
554
  cfg.cocoindex_db.unlink()
@@ -577,17 +578,17 @@ def _cmd_meta(args: argparse.Namespace) -> int:
577
578
  cfg = _resolved_from_ns(args)
578
579
  _startup_hints(cfg)
579
580
  cfg.apply_to_os_environ()
580
- from kuzu_queries import KuzuGraph # lazy
581
+ from ladybug_queries import LadybugGraph # lazy
581
582
 
582
- KuzuGraph._instance = None
583
- KuzuGraph._instance_path = None
583
+ LadybugGraph._instance = None
584
+ LadybugGraph._instance_path = None
584
585
  payload = server._graph_meta_output().model_dump()
585
586
  payload["embedding_model"] = cfg.embedding_model
586
587
  payload["embedding_device"] = cfg.embedding_device
587
588
  payload["embedding_model_source"] = cfg.embedding_model_source
588
589
  payload["embedding_device_source"] = cfg.embedding_device_source
589
590
  payload["index_dir"] = str(cfg.index_dir.resolve())
590
- payload["kuzu_path"] = str(cfg.kuzu_path.resolve())
591
+ payload["ladybug_path"] = str(cfg.ladybug_path.resolve())
591
592
  payload["index_dir_source"] = cfg.index_dir_source
592
593
  payload["hints_enabled"] = cfg.hints_enabled
593
594
  payload["hints_enabled_source"] = cfg.hints_enabled_source
@@ -637,12 +638,12 @@ def _cmd_unresolved_calls_list(args: argparse.Namespace) -> int:
637
638
  cfg = _resolved_from_ns(args)
638
639
  _startup_hints(cfg)
639
640
  cfg.apply_to_os_environ()
640
- from kuzu_queries import KuzuGraph # lazy
641
+ from ladybug_queries import LadybugGraph # lazy
641
642
 
642
- if not KuzuGraph.exists():
643
+ if not LadybugGraph.exists():
643
644
  _emit({"success": False, "message": "Kuzu graph not found"})
644
645
  return 1
645
- graph = KuzuGraph.get()
646
+ graph = LadybugGraph.get()
646
647
  rows = graph.list_unresolved_call_sites(
647
648
  method_id=args.method_id,
648
649
  reason=args.reason,
@@ -658,12 +659,12 @@ def _cmd_unresolved_calls_stats(args: argparse.Namespace) -> int:
658
659
  cfg = _resolved_from_ns(args)
659
660
  _startup_hints(cfg)
660
661
  cfg.apply_to_os_environ()
661
- from kuzu_queries import KuzuGraph # lazy
662
+ from ladybug_queries import LadybugGraph # lazy
662
663
 
663
- if not KuzuGraph.exists():
664
+ if not LadybugGraph.exists():
664
665
  _emit({"success": False, "message": "Kuzu graph not found"})
665
666
  return 1
666
- graph = KuzuGraph.get()
667
+ graph = LadybugGraph.get()
667
668
  buckets = graph.stats_unresolved_call_sites(by=args.by)
668
669
  total = sum(int(r.get("n") or 0) for r in buckets)
669
670
  _emit({"success": True, "total": total, "by": args.by, "buckets": buckets})
@@ -683,12 +684,12 @@ def _cmd_analyze_pr(args: argparse.Namespace) -> int:
683
684
  _emit({"success": False, "message": "Diff is empty"})
684
685
  return 1
685
686
  import pr_analysis # lazy
686
- from kuzu_queries import KuzuGraph # lazy
687
+ from ladybug_queries import LadybugGraph # lazy
687
688
 
688
- if not KuzuGraph.exists():
689
+ if not LadybugGraph.exists():
689
690
  _emit({"success": False, "message": "Kuzu graph not found"})
690
691
  return 1
691
- graph = KuzuGraph.get()
692
+ graph = LadybugGraph.get()
692
693
  report = pr_analysis.analyze_pr_pipeline(graph, diff_text)
693
694
  _emit(pr_analysis.pr_report_to_dict(report))
694
695
  return 0
@@ -774,8 +775,9 @@ def build_parser() -> argparse.ArgumentParser:
774
775
  help="Refresh shipped artifacts (skill, agent, MCP entry) after pip upgrade.",
775
776
  description=(
776
777
  "Post-upgrade refresh: overwrites skill and agent files with the latest "
777
- "shipped versions and updates the MCP command path. Use --dry-run to "
778
- "preview changes without writing. Requires a prior `install` run."
778
+ "shipped versions and updates the MCP command path. If an index exists, "
779
+ "also runs an incremental Lance + graph catch-up (same as `increment`). "
780
+ "Use --dry-run to preview changes without writing. Requires a prior `install` run."
779
781
  ),
780
782
  )
781
783
  update.add_argument(
@@ -902,6 +904,7 @@ def build_parser() -> argparse.ArgumentParser:
902
904
 
903
905
 
904
906
  def main(argv: list[str] | None = None) -> int:
907
+ raise_fd_limit()
905
908
  raw = list(argv if argv is not None else sys.argv[1:])
906
909
  if raw and raw[0] == "refresh":
907
910
  print(_REFRESH_DEPRECATION, file=sys.stderr)
@@ -25,6 +25,27 @@ ENV_SOURCE_ROOT = "JAVA_CODEBASE_RAG_SOURCE_ROOT"
25
25
  ENV_DEBUG_CONTEXT = "JAVA_CODEBASE_RAG_DEBUG_CONTEXT"
26
26
  ENV_RUN_HEAVY = "JAVA_CODEBASE_RAG_RUN_HEAVY"
27
27
 
28
# CocoIndex inflight-component throttle.
#
# CocoIndex defaults to 1024 inflight components
# (cocoindex/_internal/app.py: ``_ENV_MAX_INFLIGHT_COMPONENTS``). That many
# concurrent LanceDB merge-inserts exhausts OS file descriptors under default
# ulimits and fails with "Too many open files (os error 24)".
#
# NOTE: this is the env var CocoIndex actually reads. An earlier fix (#293) set
# ``COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS``, which does not exist — CocoIndex never
# reads it, so that fix was a no-op and the EMFILE error recurred (#306).
COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV = "COCOINDEX_MAX_INFLIGHT_COMPONENTS"
COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS = "256"


def cocoindex_subprocess_env_defaults() -> dict[str, str]:
    """Return the env defaults that bound CocoIndex subprocess concurrency.

    The mapping is meant to be merged with ``env.setdefault(...)`` so that a
    value already supplied by the caller (the operator) takes precedence.
    See :issue:`306`.

    Returns:
        A new dict on every call, mapping the inflight-components env var name
        to its default value (as a string).
    """
    defaults: dict[str, str] = {}
    defaults[COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV] = (
        COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS
    )
    return defaults
48
+
28
49
  _DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
29
50
 
30
51
  # Matches either $VAR or ${VAR} (POSIX shell variable syntax).
@@ -67,10 +88,9 @@ def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
67
88
  # Legacy env keys: never honored; detection-only hints name the replacement (if any).
68
89
  _LEGACY_ENV_HINTS: tuple[tuple[str, str], ...] = (
69
90
  ("LANCEDB_URI", "JAVA_CODEBASE_RAG_INDEX_DIR"),
70
- ("KUZU_DB_PATH", "JAVA_CODEBASE_RAG_INDEX_DIR (Kuzu lives at <index_dir>/code_graph.kuzu)"),
71
91
  ("LANCEDB_MCP_PROJECT_ROOT", "cwd or --source-root (no env replacement)"),
72
92
  ("LANCEDB_MCP_ALLOW_REFRESH", "(removed; use init / increment / reprocess / erase)"),
73
- ("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.kuzu exists)"),
93
+ ("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.lbug exists)"),
74
94
  ("LANCEDB_MCP_MICROSERVICE_ROOTS", "microservice_roots: in .java-codebase-rag.yml"),
75
95
  ("LANCEDB_MCP_DEBUG_CONTEXT", ENV_DEBUG_CONTEXT),
76
96
  ("LANCEDB_MCP_RUN_HEAVY", ENV_RUN_HEAVY),
@@ -182,7 +202,7 @@ def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
182
202
  class ResolvedOperatorConfig:
183
203
  source_root: Path
184
204
  index_dir: Path
185
- kuzu_path: Path
205
+ ladybug_path: Path
186
206
  cocoindex_db: Path
187
207
  embedding_model: str
188
208
  embedding_device: str | None
@@ -193,7 +213,7 @@ class ResolvedOperatorConfig:
193
213
  hints_enabled_source: SettingSource
194
214
 
195
215
  def apply_to_os_environ(self) -> None:
196
- """Make downstream modules (server, kuzu_queries, flows) see a consistent environment.
216
+ """Make downstream modules (server, ladybug_queries, flows) see a consistent environment.
197
217
 
198
218
  When ``embedding_device`` is unset, ``SBERT_DEVICE`` is not removed from ``os.environ`` so
199
219
  a long-lived host process is not mutated for unrelated callers; subprocesses still use
@@ -369,12 +389,12 @@ def resolve_operator_config(
369
389
  yaml_path=("hints", "enabled"),
370
390
  default=True,
371
391
  )
372
- ku = index_dir / "code_graph.kuzu"
392
+ ku = index_dir / "code_graph.lbug"
373
393
  coco = index_dir / "cocoindex.db"
374
394
  return ResolvedOperatorConfig(
375
395
  source_root=root,
376
396
  index_dir=index_dir,
377
- kuzu_path=ku,
397
+ ladybug_path=ku,
378
398
  cocoindex_db=coco,
379
399
  embedding_model=model,
380
400
  embedding_device=device,
@@ -387,9 +407,9 @@ def resolve_operator_config(
387
407
 
388
408
 
389
409
  def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
390
- """True if Kuzu graph dir or any Lance table already exists under index_dir."""
410
+ """True if graph dir or any Lance table already exists under index_dir."""
391
411
  paths: list[str] = []
392
- ku = index_dir / "code_graph.kuzu"
412
+ ku = index_dir / "code_graph.lbug"
393
413
  if ku.exists():
394
414
  paths.append(str(ku.resolve()))
395
415
  if index_dir.is_dir():
@@ -325,6 +325,66 @@ def select_hosts(*, non_interactive: bool, cli_agents: list[str] | None) -> list
325
325
  return [HOSTS[name] for name in selected]
326
326
 
327
327
 
328
def select_microservices(
    java_dirs: list[Path],
    *,
    non_interactive: bool,
    preselected: list[str] | None = None,
) -> list[str] | None:
    """Show an interactive checklist of detected microservices.

    Returns None when every detected module is selected (the caller then omits
    ``microservice_roots`` and everything is indexed), otherwise the selected
    subset in detection order.

    Args:
        java_dirs: Detected module roots (relative Path names) from
            detect_java_directories. Caller is expected to pass len >= 2.
        non_interactive: If True, return None (all) without prompting.
        preselected: On re-run, the prior microservice_roots subset to
            pre-check. Names that are no longer detected are ignored; if none
            of them is still detected, every module is pre-checked instead of
            presenting an all-unchecked list.

    Raises:
        SystemExit: With code 2 when nothing was selected and the user
            declines to re-select.
    """
    # Defensive guard: the caller gates on len >= 2, but stay safe if called
    # directly. Non-interactive runs never prompt and always mean "all".
    if len(java_dirs) < 2 or non_interactive:
        return None

    dir_names = [str(d) for d in java_dirs]
    detected = set(dir_names)

    # Only prior selections that still exist can be pre-checked. A fully stale
    # prior subset would otherwise leave every box unchecked, so a plain Enter
    # would immediately hit the "at least one module" error below.
    still_present = detected.intersection(preselected or ())
    checked = still_present or detected

    # Loop (rather than recurse) until the user picks something or gives up.
    while True:
        choices = [
            {"name": name, "value": name, "checked": name in checked}
            for name in dir_names
        ]
        print("Note: Select which modules to index. Toggle with Space, confirm with Enter.")
        selected = prompt(
            "checkbox",
            "Select microservices to index:",
            choices=choices,
            default=dir_names,  # non-TTY fallback returns all -> caller omits key
        )
        if selected:
            break
        if not prompt("confirm", "At least one module is required. Re-select?"):
            raise SystemExit(2)

    selected_set = set(selected)
    if selected_set == detected:
        return None
    # Preserve detection order for deterministic YAML output.
    return [name for name in dir_names if name in selected_set]
386
+
387
+
328
388
  def select_scope(*, non_interactive: bool, cli_scope: str | None) -> Scope:
329
389
  """Select 'project' or 'user' scope.
330
390
 
@@ -791,7 +851,7 @@ def run_init_if_needed(
791
851
  # Run AST graph build
792
852
  g = run_build_ast_graph(
793
853
  source_root=cfg.source_root,
794
- kuzu_path=cfg.kuzu_path,
854
+ ladybug_path=cfg.ladybug_path,
795
855
  verbose=not quiet,
796
856
  quiet=quiet,
797
857
  env=env,
@@ -1182,7 +1242,7 @@ def run_update(
1182
1242
  index_dir_has_existing_artifacts,
1183
1243
  resolve_operator_config,
1184
1244
  )
1185
- from java_codebase_rag.pipeline import run_cocoindex_update
1245
+ from java_codebase_rag.pipeline import run_cocoindex_update, run_incremental_graph
1186
1246
 
1187
1247
  project_root = discover_project_root(cwd)
1188
1248
  if project_root is None:
@@ -1207,22 +1267,37 @@ def run_update(
1207
1267
  print("Run `java-codebase-rag install` to create one.")
1208
1268
  return EXIT_PARTIAL if has_artifact_failures else EXIT_SUCCESS
1209
1269
 
1210
- # Run increment (LanceDB catch-up)
1270
+ # Run increment: LanceDB catch-up + incremental graph rebuild.
1271
+ # Mirrors `java-codebase-rag increment` so both index layers stay current.
1272
+ # The "graph not implemented" warning belongs only on the vectors-only path
1273
+ # (increment --vectors-only), where the graph step is deliberately skipped.
1211
1274
  if not dry_run:
1212
- print("\nUpdating index (incremental LanceDB update)...")
1275
+ print("\nUpdating index (Lance + graph)...")
1213
1276
  cfg.apply_to_os_environ()
1214
1277
  env = cfg.subprocess_env()
1215
1278
 
1216
1279
  coco = run_cocoindex_update(env, full_reprocess=False, quiet=True)
1217
1280
  if coco.returncode != 0:
1218
- print(f"Error: Index update failed with code {coco.returncode}")
1281
+ print(f"Error: Lance index update failed with code {coco.returncode}")
1219
1282
  return 1
1220
1283
 
1221
- # Print graph staleness warning
1222
- from java_codebase_rag.cli import _INCREMENT_WARNING_LINES
1223
- print("\n" + "\n".join(_INCREMENT_WARNING_LINES))
1284
+ g = run_incremental_graph(
1285
+ source_root=cfg.source_root,
1286
+ ladybug_path=cfg.ladybug_path,
1287
+ verbose=False,
1288
+ quiet=True,
1289
+ env=env,
1290
+ )
1291
+ if g.returncode != 0:
1292
+ # Artifacts above already refreshed; the graph catch-up is best-effort
1293
+ # here. Surface a truthful, actionable message instead of leaving the
1294
+ # graph silently stale or claiming the feature is unimplemented.
1295
+ print(
1296
+ f"\nWarning: incremental graph update failed (exit {g.returncode}). "
1297
+ "Run `java-codebase-rag reprocess` for a full rebuild."
1298
+ )
1224
1299
  else:
1225
- print("\nWould run incremental index update.")
1300
+ print("\nWould run incremental index update (Lance + graph).")
1226
1301
 
1227
1302
  # Print summary
1228
1303
  print("\nUpdate complete.")
@@ -1270,6 +1345,20 @@ def run_install(
1270
1345
  except SystemExit as e:
1271
1346
  return e.code
1272
1347
 
1348
+ # Stage 1 (Case B): interactive microservice selection (only when 2+ detected)
1349
+ try:
1350
+ selected_roots = (
1351
+ select_microservices(
1352
+ java_dirs,
1353
+ non_interactive=non_interactive,
1354
+ preselected=existing_config.get("microservice_roots") if existing_config else None,
1355
+ )
1356
+ if len(java_dirs) >= 2
1357
+ else None
1358
+ )
1359
+ except SystemExit as e:
1360
+ return e.code
1361
+
1273
1362
  # Stage 2: Embedding model
1274
1363
  resolved_model = resolve_model(model, non_interactive=non_interactive)
1275
1364
 
@@ -1312,7 +1401,7 @@ def run_install(
1312
1401
  yaml_content = generate_yaml_config(
1313
1402
  source_root,
1314
1403
  resolved_model,
1315
- microservice_roots=[str(d) for d in java_dirs] if len(java_dirs) > 1 else None,
1404
+ microservice_roots=selected_roots,
1316
1405
  existing_yaml=existing_config,
1317
1406
  )
1318
1407
 
@@ -0,0 +1,148 @@
1
+ """Serialized post-flow LanceDB optimize with commit-conflict retry.
2
+
3
+ cocoindex 1.0.7 schedules ``table.optimize()`` (a LanceDB **Rewrite**/compaction
4
+ transaction) as a *background* ``asyncio`` task that races concurrent
5
+ ``table.delete()`` (**Delete**) transactions emitted by later mutation batches.
6
+ LanceDB does not allow a Rewrite to commit concurrently with a Delete
7
+ (upstream lancedb#1504 — "We do not support concurrent deletes right now"),
8
+ which surfaces as a flood of::
9
+
10
+ RuntimeError: lance error: Retryable commit conflict for version N: \
11
+ This Rewrite transaction was preempted by concurrent transaction Delete ...
12
+
13
+ To eliminate the race, the flow (``java_index_flow_lancedb.py``) disables the
14
+ in-flight background optimize entirely by raising
15
+ ``num_transactions_before_optimize`` to a value that is effectively never
16
+ reached. This module then performs a *single*, serialized optimize after the
17
+ flow returns (exit 0 → no concurrent writers), retrying the rare residual
18
+ commit conflict that two internal compaction passes can still produce.
19
+ """
20
+ from __future__ import annotations
21
+
22
+ import asyncio
23
+ import sys
24
+ from pathlib import Path
25
+
26
+ # Single source of truth for the three Lance table names created by the flow.
27
+ # Keep in sync with ``search_lancedb.TABLES`` (the values there mirror these).
28
+ LANCE_TABLE_NAMES: tuple[str, ...] = (
29
+ "javacodeindex_java_code",
30
+ "sqlschemaindex_sql_schema",
31
+ "yamlconfigindex_yaml_config",
32
+ )
33
+
34
+ # Commit conflicts are transient; a handful of exponential-backoff retries is
35
+ # enough because, post-flow, there are no concurrent writers — only successive
36
+ # optimize/compaction passes within this single serialized call can still
37
+ # transiently preempt one another.
38
+ _MAX_ATTEMPTS = 6
39
+ _BASE_BACKOFF_S = 0.1
40
+
41
+ # Substrings identifying the retryable Lance commit-conflict error. LanceDB
42
+ # wraps the underlying lance error text into the raised ``RuntimeError`` str,
43
+ # so a substring match is the robust detector (no dedicated exception type).
44
+ _RETRYABLE_MARKERS = (
45
+ "Retryable commit conflict",
46
+ "preempted by concurrent transaction",
47
+ )
48
+
49
+
50
+ def _is_retryable(exc: BaseException) -> bool:
51
+ text = str(exc)
52
+ return any(marker in text for marker in _RETRYABLE_MARKERS)
53
+
54
+
55
+ async def _list_table_names(db: object) -> set[str]:
56
+ """Existing table names across LanceDB API variants (``list_tables`` ≥ ``table_names``)."""
57
+ if hasattr(db, "list_tables"):
58
+ response = await db.list_tables()
59
+ return set(getattr(response, "tables", response))
60
+ return set(await db.table_names())
61
+
62
+
63
async def _optimize_with_retry(table: object) -> Exception | None:
    """Run ``table.optimize()``, retrying retryable commit conflicts.

    Makes at most ``_MAX_ATTEMPTS`` attempts, sleeping
    ``_BASE_BACKOFF_S * 2**attempt`` seconds between retryable failures.

    Returns:
        ``None`` on success, otherwise the exception that ended the attempts
        (a non-retryable error, or the last conflict once the budget is spent).
    """
    failure: Exception | None = None
    for attempt in range(_MAX_ATTEMPTS):
        try:
            await table.optimize()
        except Exception as exc:
            failure = exc
            if not (_is_retryable(exc) and attempt < _MAX_ATTEMPTS - 1):
                # Non-retryable, or retries exhausted: hand the error back to
                # the caller instead of swallowing it.
                break
            await asyncio.sleep(_BASE_BACKOFF_S * (2**attempt))
        else:
            return None
    return failure


async def optimize_lance_tables(index_dir: Path, *, quiet: bool = False) -> dict[str, str]:
    """Optimize every known Lance table under *index_dir*, one at a time.

    Each name in :data:`LANCE_TABLE_NAMES` that exists in the DB gets a
    ``table.optimize()`` call. Retryable commit conflicts are retried with
    exponential backoff. A failure to list tables, to open a table, or to
    optimize it (non-retryable error or exhausted retries) is recorded in the
    returned dict and logged to **stderr** — never stdout, since this is
    callable from stdio-MCP / JSON-stdout contexts.

    Args:
        index_dir: directory holding the Lance tables (the flow's LanceDB URI).
        quiet: when True, suppress the per-table success/skip info lines on
            stderr (errors are always logged).

    Returns:
        Mapping of table name → status. Values are ``"ok"``, ``"skipped"``
        (table absent — e.g. a repo with no SQL/YAML), or ``"error: <text>"``.
    """
    # Lazy import: the flow imports this module for LANCE_TABLE_NAMES and must
    # not pay the lancedb import cost at flow-definition time.
    import lancedb

    statuses: dict[str, str] = {}
    connection = await lancedb.connect_async(str(index_dir))
    try:
        try:
            present = await _list_table_names(connection)
        except Exception as exc:
            print(
                f"java-codebase-rag: optimize: failed to list tables in "
                f"{index_dir}: {exc}",
                file=sys.stderr,
            )
            # Without a listing nothing can be optimized: report every table.
            return {name: f"error: list failed: {exc}" for name in LANCE_TABLE_NAMES}

        for table_name in LANCE_TABLE_NAMES:
            if table_name not in present:
                statuses[table_name] = "skipped"
                if not quiet:
                    print(
                        f"java-codebase-rag: optimize: {table_name} absent, skipped",
                        file=sys.stderr,
                    )
                continue

            try:
                table = await connection.open_table(table_name)
            except Exception as exc:
                statuses[table_name] = f"error: open failed: {exc}"
                print(
                    f"java-codebase-rag: optimize: {table_name} open failed: {exc}",
                    file=sys.stderr,
                )
                continue

            failure = await _optimize_with_retry(table)
            if failure is not None:
                statuses[table_name] = f"error: {failure}"
                print(
                    f"java-codebase-rag: optimize: {table_name} failed: {failure}",
                    file=sys.stderr,
                )
                continue

            statuses[table_name] = "ok"
            if not quiet:
                print(
                    f"java-codebase-rag: optimize: {table_name} ok",
                    file=sys.stderr,
                )
    finally:
        # ``AsyncConnection.close`` is a *sync* method in lancedb 0.30.x.
        connection.close()
    return statuses