java-codebase-rag 0.5.3__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +1 -1
- build_ast_graph.py +142 -90
- graph_enrich.py +3 -3
- java_codebase_rag/_fdlimit.py +48 -0
- java_codebase_rag/cli.py +31 -28
- java_codebase_rag/config.py +28 -8
- java_codebase_rag/installer.py +99 -10
- java_codebase_rag/lance_optimize.py +148 -0
- java_codebase_rag/pipeline.py +63 -9
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.0.dist-info}/METADATA +5 -5
- java_codebase_rag-0.6.0.dist-info/RECORD +33 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.0.dist-info}/top_level.txt +1 -1
- java_index_flow_lancedb.py +22 -4
- java_ontology.py +1 -1
- kuzu_queries.py → ladybug_queries.py +62 -56
- mcp_v2.py +16 -16
- pr_analysis.py +1 -1
- search_lancedb.py +8 -8
- server.py +47 -17
- java_codebase_rag-0.5.3.dist-info/RECORD +0 -31
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.0.dist-info}/WHEEL +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.0.dist-info}/entry_points.txt +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.0.dist-info}/licenses/LICENSE +0 -0
java_codebase_rag/cli.py
CHANGED
|
@@ -21,13 +21,14 @@ from java_codebase_rag.config import (
|
|
|
21
21
|
index_dir_has_existing_artifacts,
|
|
22
22
|
resolve_operator_config,
|
|
23
23
|
)
|
|
24
|
+
from java_codebase_rag._fdlimit import raise_fd_limit
|
|
24
25
|
from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update, run_incremental_graph
|
|
25
26
|
from java_ontology import VALID_UNRESOLVED_CALL_REASONS
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"
|
|
28
29
|
|
|
29
30
|
_INCREMENT_WARNING_LINES = (
|
|
30
|
-
"WARNING: AST graph (
|
|
31
|
+
"WARNING: AST graph (LadybugDB) incremental rebuild is not yet implemented.",
|
|
31
32
|
"The graph reflects the index state from the last `init` or `reprocess`,",
|
|
32
33
|
"which means `find`, `neighbors`, and `describe` may return stale results",
|
|
33
34
|
"for files changed since then.",
|
|
@@ -37,8 +38,8 @@ _INCREMENT_WARNING_LINES = (
|
|
|
37
38
|
"For an up-to-date graph, run:",
|
|
38
39
|
" java-codebase-rag reprocess",
|
|
39
40
|
"",
|
|
40
|
-
"Track progress on
|
|
41
|
-
f" {
|
|
41
|
+
"Track progress on LadybugDB incremental rebuild:",
|
|
42
|
+
f" {LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL}",
|
|
42
43
|
)
|
|
43
44
|
|
|
44
45
|
_REFRESH_DEPRECATION = (
|
|
@@ -47,7 +48,7 @@ _REFRESH_DEPRECATION = (
|
|
|
47
48
|
)
|
|
48
49
|
|
|
49
50
|
_REPROCESS_DRIFT_VECTORS_ONLY = (
|
|
50
|
-
"java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.
|
|
51
|
+
"java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.lbug) was NOT rebuilt "
|
|
51
52
|
"and may now reflect a stale source snapshot."
|
|
52
53
|
)
|
|
53
54
|
|
|
@@ -178,7 +179,7 @@ def _emit(value: Any) -> None:
|
|
|
178
179
|
print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None))
|
|
179
180
|
|
|
180
181
|
|
|
181
|
-
def
|
|
182
|
+
def _emit_increment_ladybug_warning() -> None:
|
|
182
183
|
for line in _INCREMENT_WARNING_LINES:
|
|
183
184
|
print(line, file=sys.stderr)
|
|
184
185
|
|
|
@@ -289,7 +290,7 @@ def _cmd_init(args: argparse.Namespace) -> int:
|
|
|
289
290
|
print(file=sys.stderr, flush=True)
|
|
290
291
|
g = run_build_ast_graph(
|
|
291
292
|
source_root=cfg.source_root,
|
|
292
|
-
|
|
293
|
+
ladybug_path=cfg.ladybug_path,
|
|
293
294
|
verbose=verbose,
|
|
294
295
|
quiet=bool(args.quiet),
|
|
295
296
|
env=env,
|
|
@@ -319,7 +320,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
|
|
|
319
320
|
# Check for --vectors-only flag
|
|
320
321
|
vectors_only = bool(getattr(args, "vectors_only", False))
|
|
321
322
|
if vectors_only:
|
|
322
|
-
|
|
323
|
+
_emit_increment_ladybug_warning()
|
|
323
324
|
|
|
324
325
|
def work() -> int:
|
|
325
326
|
env = cfg.subprocess_env()
|
|
@@ -350,7 +351,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
|
|
|
350
351
|
# Run incremental graph update
|
|
351
352
|
g = run_incremental_graph(
|
|
352
353
|
source_root=cfg.source_root,
|
|
353
|
-
|
|
354
|
+
ladybug_path=cfg.ladybug_path,
|
|
354
355
|
verbose=bool(args.verbose),
|
|
355
356
|
quiet=bool(args.quiet),
|
|
356
357
|
env=env,
|
|
@@ -437,7 +438,7 @@ def _cmd_reprocess(args: argparse.Namespace) -> int:
|
|
|
437
438
|
if graph_only:
|
|
438
439
|
g = run_build_ast_graph(
|
|
439
440
|
source_root=cfg.source_root,
|
|
440
|
-
|
|
441
|
+
ladybug_path=cfg.ladybug_path,
|
|
441
442
|
verbose=verbose,
|
|
442
443
|
quiet=bool(args.quiet),
|
|
443
444
|
env=env,
|
|
@@ -509,7 +510,7 @@ def _cmd_erase(args: argparse.Namespace) -> int:
|
|
|
509
510
|
cfg = _resolved_from_ns(args)
|
|
510
511
|
_startup_hints(cfg)
|
|
511
512
|
cfg.apply_to_os_environ()
|
|
512
|
-
to_describe: list[Path] = [cfg.
|
|
513
|
+
to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db]
|
|
513
514
|
if cfg.index_dir.is_dir():
|
|
514
515
|
try:
|
|
515
516
|
import lancedb
|
|
@@ -546,8 +547,8 @@ def _cmd_erase(args: argparse.Namespace) -> int:
|
|
|
546
547
|
)
|
|
547
548
|
elif drop.returncode != 0:
|
|
548
549
|
print(clip(drop.stderr, 4000), file=sys.stderr)
|
|
549
|
-
if cfg.
|
|
550
|
-
shutil.rmtree(cfg.
|
|
550
|
+
if cfg.ladybug_path.exists():
|
|
551
|
+
shutil.rmtree(cfg.ladybug_path, ignore_errors=True)
|
|
551
552
|
if cfg.cocoindex_db.exists():
|
|
552
553
|
try:
|
|
553
554
|
cfg.cocoindex_db.unlink()
|
|
@@ -577,17 +578,17 @@ def _cmd_meta(args: argparse.Namespace) -> int:
|
|
|
577
578
|
cfg = _resolved_from_ns(args)
|
|
578
579
|
_startup_hints(cfg)
|
|
579
580
|
cfg.apply_to_os_environ()
|
|
580
|
-
from
|
|
581
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
581
582
|
|
|
582
|
-
|
|
583
|
-
|
|
583
|
+
LadybugGraph._instance = None
|
|
584
|
+
LadybugGraph._instance_path = None
|
|
584
585
|
payload = server._graph_meta_output().model_dump()
|
|
585
586
|
payload["embedding_model"] = cfg.embedding_model
|
|
586
587
|
payload["embedding_device"] = cfg.embedding_device
|
|
587
588
|
payload["embedding_model_source"] = cfg.embedding_model_source
|
|
588
589
|
payload["embedding_device_source"] = cfg.embedding_device_source
|
|
589
590
|
payload["index_dir"] = str(cfg.index_dir.resolve())
|
|
590
|
-
payload["
|
|
591
|
+
payload["ladybug_path"] = str(cfg.ladybug_path.resolve())
|
|
591
592
|
payload["index_dir_source"] = cfg.index_dir_source
|
|
592
593
|
payload["hints_enabled"] = cfg.hints_enabled
|
|
593
594
|
payload["hints_enabled_source"] = cfg.hints_enabled_source
|
|
@@ -637,12 +638,12 @@ def _cmd_unresolved_calls_list(args: argparse.Namespace) -> int:
|
|
|
637
638
|
cfg = _resolved_from_ns(args)
|
|
638
639
|
_startup_hints(cfg)
|
|
639
640
|
cfg.apply_to_os_environ()
|
|
640
|
-
from
|
|
641
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
641
642
|
|
|
642
|
-
if not
|
|
643
|
+
if not LadybugGraph.exists():
|
|
643
644
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
644
645
|
return 1
|
|
645
|
-
graph =
|
|
646
|
+
graph = LadybugGraph.get()
|
|
646
647
|
rows = graph.list_unresolved_call_sites(
|
|
647
648
|
method_id=args.method_id,
|
|
648
649
|
reason=args.reason,
|
|
@@ -658,12 +659,12 @@ def _cmd_unresolved_calls_stats(args: argparse.Namespace) -> int:
|
|
|
658
659
|
cfg = _resolved_from_ns(args)
|
|
659
660
|
_startup_hints(cfg)
|
|
660
661
|
cfg.apply_to_os_environ()
|
|
661
|
-
from
|
|
662
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
662
663
|
|
|
663
|
-
if not
|
|
664
|
+
if not LadybugGraph.exists():
|
|
664
665
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
665
666
|
return 1
|
|
666
|
-
graph =
|
|
667
|
+
graph = LadybugGraph.get()
|
|
667
668
|
buckets = graph.stats_unresolved_call_sites(by=args.by)
|
|
668
669
|
total = sum(int(r.get("n") or 0) for r in buckets)
|
|
669
670
|
_emit({"success": True, "total": total, "by": args.by, "buckets": buckets})
|
|
@@ -683,12 +684,12 @@ def _cmd_analyze_pr(args: argparse.Namespace) -> int:
|
|
|
683
684
|
_emit({"success": False, "message": "Diff is empty"})
|
|
684
685
|
return 1
|
|
685
686
|
import pr_analysis # lazy
|
|
686
|
-
from
|
|
687
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
687
688
|
|
|
688
|
-
if not
|
|
689
|
+
if not LadybugGraph.exists():
|
|
689
690
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
690
691
|
return 1
|
|
691
|
-
graph =
|
|
692
|
+
graph = LadybugGraph.get()
|
|
692
693
|
report = pr_analysis.analyze_pr_pipeline(graph, diff_text)
|
|
693
694
|
_emit(pr_analysis.pr_report_to_dict(report))
|
|
694
695
|
return 0
|
|
@@ -774,8 +775,9 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
774
775
|
help="Refresh shipped artifacts (skill, agent, MCP entry) after pip upgrade.",
|
|
775
776
|
description=(
|
|
776
777
|
"Post-upgrade refresh: overwrites skill and agent files with the latest "
|
|
777
|
-
"shipped versions and updates the MCP command path.
|
|
778
|
-
"
|
|
778
|
+
"shipped versions and updates the MCP command path. If an index exists, "
|
|
779
|
+
"also runs an incremental Lance + graph catch-up (same as `increment`). "
|
|
780
|
+
"Use --dry-run to preview changes without writing. Requires a prior `install` run."
|
|
779
781
|
),
|
|
780
782
|
)
|
|
781
783
|
update.add_argument(
|
|
@@ -902,6 +904,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
902
904
|
|
|
903
905
|
|
|
904
906
|
def main(argv: list[str] | None = None) -> int:
|
|
907
|
+
raise_fd_limit()
|
|
905
908
|
raw = list(argv if argv is not None else sys.argv[1:])
|
|
906
909
|
if raw and raw[0] == "refresh":
|
|
907
910
|
print(_REFRESH_DEPRECATION, file=sys.stderr)
|
java_codebase_rag/config.py
CHANGED
|
@@ -25,6 +25,27 @@ ENV_SOURCE_ROOT = "JAVA_CODEBASE_RAG_SOURCE_ROOT"
|
|
|
25
25
|
ENV_DEBUG_CONTEXT = "JAVA_CODEBASE_RAG_DEBUG_CONTEXT"
|
|
26
26
|
ENV_RUN_HEAVY = "JAVA_CODEBASE_RAG_RUN_HEAVY"
|
|
27
27
|
|
|
28
|
+
# CocoIndex inflight-component throttle. CocoIndex's default is 1024 inflight
|
|
29
|
+
# components (cocoindex/_internal/app.py: ``_ENV_MAX_INFLIGHT_COMPONENTS``),
|
|
30
|
+
# which spawns enough concurrent LanceDB merge-inserts to exhaust OS file
|
|
31
|
+
# descriptors under default ulimits -> "Too many open files (os error 24)".
|
|
32
|
+
# NOTE: this is the REAL env var. An earlier fix (#293) set the non-existent
|
|
33
|
+
# ``COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS`` — CocoIndex never reads it, so it was a
|
|
34
|
+
# no-op and the EMFILE error recurred (#306).
|
|
35
|
+
COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV = "COCOINDEX_MAX_INFLIGHT_COMPONENTS"
|
|
36
|
+
COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS = "256"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def cocoindex_subprocess_env_defaults() -> dict[str, str]:
    """Build the default environment entries for CocoIndex subprocesses.

    The mapping holds a single entry: the inflight-component cap, keyed by
    ``COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV`` with the value
    ``COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS``.

    Callers should merge these with ``env.setdefault(...)`` so that a value
    already supplied by the operator is never overridden. See :issue:`306`.

    Returns:
        A fresh dict of environment variable name -> default value.
    """
    defaults: dict[str, str] = {}
    defaults[COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV] = COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS
    return defaults
|
|
48
|
+
|
|
28
49
|
_DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
29
50
|
|
|
30
51
|
# Matches either $VAR or ${VAR} (POSIX shell variable syntax).
|
|
@@ -67,10 +88,9 @@ def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
|
|
|
67
88
|
# Legacy env keys: never honored; detection-only hints name the replacement (if any).
|
|
68
89
|
_LEGACY_ENV_HINTS: tuple[tuple[str, str], ...] = (
|
|
69
90
|
("LANCEDB_URI", "JAVA_CODEBASE_RAG_INDEX_DIR"),
|
|
70
|
-
("KUZU_DB_PATH", "JAVA_CODEBASE_RAG_INDEX_DIR (Kuzu lives at <index_dir>/code_graph.kuzu)"),
|
|
71
91
|
("LANCEDB_MCP_PROJECT_ROOT", "cwd or --source-root (no env replacement)"),
|
|
72
92
|
("LANCEDB_MCP_ALLOW_REFRESH", "(removed; use init / increment / reprocess / erase)"),
|
|
73
|
-
("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.
|
|
93
|
+
("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.lbug exists)"),
|
|
74
94
|
("LANCEDB_MCP_MICROSERVICE_ROOTS", "microservice_roots: in .java-codebase-rag.yml"),
|
|
75
95
|
("LANCEDB_MCP_DEBUG_CONTEXT", ENV_DEBUG_CONTEXT),
|
|
76
96
|
("LANCEDB_MCP_RUN_HEAVY", ENV_RUN_HEAVY),
|
|
@@ -182,7 +202,7 @@ def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
|
|
|
182
202
|
class ResolvedOperatorConfig:
|
|
183
203
|
source_root: Path
|
|
184
204
|
index_dir: Path
|
|
185
|
-
|
|
205
|
+
ladybug_path: Path
|
|
186
206
|
cocoindex_db: Path
|
|
187
207
|
embedding_model: str
|
|
188
208
|
embedding_device: str | None
|
|
@@ -193,7 +213,7 @@ class ResolvedOperatorConfig:
|
|
|
193
213
|
hints_enabled_source: SettingSource
|
|
194
214
|
|
|
195
215
|
def apply_to_os_environ(self) -> None:
|
|
196
|
-
"""Make downstream modules (server,
|
|
216
|
+
"""Make downstream modules (server, ladybug_queries, flows) see a consistent environment.
|
|
197
217
|
|
|
198
218
|
When ``embedding_device`` is unset, ``SBERT_DEVICE`` is not removed from ``os.environ`` so
|
|
199
219
|
a long-lived host process is not mutated for unrelated callers; subprocesses still use
|
|
@@ -369,12 +389,12 @@ def resolve_operator_config(
|
|
|
369
389
|
yaml_path=("hints", "enabled"),
|
|
370
390
|
default=True,
|
|
371
391
|
)
|
|
372
|
-
ku = index_dir / "code_graph.
|
|
392
|
+
ku = index_dir / "code_graph.lbug"
|
|
373
393
|
coco = index_dir / "cocoindex.db"
|
|
374
394
|
return ResolvedOperatorConfig(
|
|
375
395
|
source_root=root,
|
|
376
396
|
index_dir=index_dir,
|
|
377
|
-
|
|
397
|
+
ladybug_path=ku,
|
|
378
398
|
cocoindex_db=coco,
|
|
379
399
|
embedding_model=model,
|
|
380
400
|
embedding_device=device,
|
|
@@ -387,9 +407,9 @@ def resolve_operator_config(
|
|
|
387
407
|
|
|
388
408
|
|
|
389
409
|
def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
|
|
390
|
-
"""True if
|
|
410
|
+
"""True if graph dir or any Lance table already exists under index_dir."""
|
|
391
411
|
paths: list[str] = []
|
|
392
|
-
ku = index_dir / "code_graph.
|
|
412
|
+
ku = index_dir / "code_graph.lbug"
|
|
393
413
|
if ku.exists():
|
|
394
414
|
paths.append(str(ku.resolve()))
|
|
395
415
|
if index_dir.is_dir():
|
java_codebase_rag/installer.py
CHANGED
|
@@ -325,6 +325,66 @@ def select_hosts(*, non_interactive: bool, cli_agents: list[str] | None) -> list
|
|
|
325
325
|
return [HOSTS[name] for name in selected]
|
|
326
326
|
|
|
327
327
|
|
|
328
|
+
def select_microservices(
    java_dirs: list[Path],
    *,
    non_interactive: bool,
    preselected: list[str] | None = None,
) -> list[str] | None:
    """Ask the operator which detected microservice modules should be indexed.

    A checkbox prompt lists every detected module. All boxes start checked
    unless ``preselected`` is given, in which case only the names it contains
    start checked. An empty selection triggers a confirm prompt offering to
    choose again; declining it aborts.

    Args:
        java_dirs: Detected module roots (relative Path names) from
            detect_java_directories. Callers are expected to pass at least two.
        non_interactive: If True, return None (all modules) without prompting.
        preselected: On re-run, the prior microservice_roots subset to pre-check.

    Returns:
        None when every module is selected (the caller then omits
        microservice_roots and everything is indexed), otherwise the chosen
        names in detection order. Never returns an empty list.

    Raises:
        SystemExit: With code 2 when nothing was selected and the operator
            declined to re-select.
    """
    # Nothing to choose: a single/absent module (defensive, callers gate on
    # len >= 2) or a run that must not prompt.
    if len(java_dirs) < 2 or non_interactive:
        return None

    prior = set(preselected) if preselected else None

    while True:
        module_names = [str(path) for path in java_dirs]
        checklist = []
        for module in module_names:
            if prior is None:
                ticked = True
            else:
                ticked = module in prior
            checklist.append({"name": module, "value": module, "checked": ticked})

        print("Note: Select which modules to index. Toggle with Space, confirm with Enter.")
        picked = prompt(
            "checkbox",
            "Select microservices to index:",
            choices=checklist,
            default=module_names,  # non-TTY fallback returns all -> caller omits key
        )
        if picked:
            break

        wants_retry = prompt(
            "confirm",
            "At least one module is required. Re-select?",
        )
        if not wants_retry:
            raise SystemExit(2)
        # Otherwise loop and show the same checklist again.

    picked_set = set(picked)
    if picked_set == set(module_names):
        return None
    # Keep detection order so the generated YAML is deterministic.
    return [module for module in module_names if module in picked_set]
|
|
386
|
+
|
|
387
|
+
|
|
328
388
|
def select_scope(*, non_interactive: bool, cli_scope: str | None) -> Scope:
|
|
329
389
|
"""Select 'project' or 'user' scope.
|
|
330
390
|
|
|
@@ -791,7 +851,7 @@ def run_init_if_needed(
|
|
|
791
851
|
# Run AST graph build
|
|
792
852
|
g = run_build_ast_graph(
|
|
793
853
|
source_root=cfg.source_root,
|
|
794
|
-
|
|
854
|
+
ladybug_path=cfg.ladybug_path,
|
|
795
855
|
verbose=not quiet,
|
|
796
856
|
quiet=quiet,
|
|
797
857
|
env=env,
|
|
@@ -1182,7 +1242,7 @@ def run_update(
|
|
|
1182
1242
|
index_dir_has_existing_artifacts,
|
|
1183
1243
|
resolve_operator_config,
|
|
1184
1244
|
)
|
|
1185
|
-
from java_codebase_rag.pipeline import run_cocoindex_update
|
|
1245
|
+
from java_codebase_rag.pipeline import run_cocoindex_update, run_incremental_graph
|
|
1186
1246
|
|
|
1187
1247
|
project_root = discover_project_root(cwd)
|
|
1188
1248
|
if project_root is None:
|
|
@@ -1207,22 +1267,37 @@ def run_update(
|
|
|
1207
1267
|
print("Run `java-codebase-rag install` to create one.")
|
|
1208
1268
|
return EXIT_PARTIAL if has_artifact_failures else EXIT_SUCCESS
|
|
1209
1269
|
|
|
1210
|
-
# Run increment
|
|
1270
|
+
# Run increment: LanceDB catch-up + incremental graph rebuild.
|
|
1271
|
+
# Mirrors `java-codebase-rag increment` so both index layers stay current.
|
|
1272
|
+
# The "graph not implemented" warning belongs only on the vectors-only path
|
|
1273
|
+
# (increment --vectors-only), where the graph step is deliberately skipped.
|
|
1211
1274
|
if not dry_run:
|
|
1212
|
-
print("\nUpdating index (
|
|
1275
|
+
print("\nUpdating index (Lance + graph)...")
|
|
1213
1276
|
cfg.apply_to_os_environ()
|
|
1214
1277
|
env = cfg.subprocess_env()
|
|
1215
1278
|
|
|
1216
1279
|
coco = run_cocoindex_update(env, full_reprocess=False, quiet=True)
|
|
1217
1280
|
if coco.returncode != 0:
|
|
1218
|
-
print(f"Error:
|
|
1281
|
+
print(f"Error: Lance index update failed with code {coco.returncode}")
|
|
1219
1282
|
return 1
|
|
1220
1283
|
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1284
|
+
g = run_incremental_graph(
|
|
1285
|
+
source_root=cfg.source_root,
|
|
1286
|
+
ladybug_path=cfg.ladybug_path,
|
|
1287
|
+
verbose=False,
|
|
1288
|
+
quiet=True,
|
|
1289
|
+
env=env,
|
|
1290
|
+
)
|
|
1291
|
+
if g.returncode != 0:
|
|
1292
|
+
# Artifacts above already refreshed; the graph catch-up is best-effort
|
|
1293
|
+
# here. Surface a truthful, actionable message instead of leaving the
|
|
1294
|
+
# graph silently stale or claiming the feature is unimplemented.
|
|
1295
|
+
print(
|
|
1296
|
+
f"\nWarning: incremental graph update failed (exit {g.returncode}). "
|
|
1297
|
+
"Run `java-codebase-rag reprocess` for a full rebuild."
|
|
1298
|
+
)
|
|
1224
1299
|
else:
|
|
1225
|
-
print("\nWould run incremental index update.")
|
|
1300
|
+
print("\nWould run incremental index update (Lance + graph).")
|
|
1226
1301
|
|
|
1227
1302
|
# Print summary
|
|
1228
1303
|
print("\nUpdate complete.")
|
|
@@ -1270,6 +1345,20 @@ def run_install(
|
|
|
1270
1345
|
except SystemExit as e:
|
|
1271
1346
|
return e.code
|
|
1272
1347
|
|
|
1348
|
+
# Stage 1 (Case B): interactive microservice selection (only when 2+ detected)
|
|
1349
|
+
try:
|
|
1350
|
+
selected_roots = (
|
|
1351
|
+
select_microservices(
|
|
1352
|
+
java_dirs,
|
|
1353
|
+
non_interactive=non_interactive,
|
|
1354
|
+
preselected=existing_config.get("microservice_roots") if existing_config else None,
|
|
1355
|
+
)
|
|
1356
|
+
if len(java_dirs) >= 2
|
|
1357
|
+
else None
|
|
1358
|
+
)
|
|
1359
|
+
except SystemExit as e:
|
|
1360
|
+
return e.code
|
|
1361
|
+
|
|
1273
1362
|
# Stage 2: Embedding model
|
|
1274
1363
|
resolved_model = resolve_model(model, non_interactive=non_interactive)
|
|
1275
1364
|
|
|
@@ -1312,7 +1401,7 @@ def run_install(
|
|
|
1312
1401
|
yaml_content = generate_yaml_config(
|
|
1313
1402
|
source_root,
|
|
1314
1403
|
resolved_model,
|
|
1315
|
-
microservice_roots=
|
|
1404
|
+
microservice_roots=selected_roots,
|
|
1316
1405
|
existing_yaml=existing_config,
|
|
1317
1406
|
)
|
|
1318
1407
|
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Serialized post-flow LanceDB optimize with commit-conflict retry.
|
|
2
|
+
|
|
3
|
+
cocoindex 1.0.7 schedules ``table.optimize()`` (a LanceDB **Rewrite**/compaction
|
|
4
|
+
transaction) as a *background* ``asyncio`` task that races concurrent
|
|
5
|
+
``table.delete()`` (**Delete**) transactions emitted by later mutation batches.
|
|
6
|
+
LanceDB does not allow a Rewrite to commit concurrently with a Delete
|
|
7
|
+
(upstream lancedb#1504 — "We do not support concurrent deletes right now"),
|
|
8
|
+
which surfaces as a flood of::
|
|
9
|
+
|
|
10
|
+
RuntimeError: lance error: Retryable commit conflict for version N: \
|
|
11
|
+
This Rewrite transaction was preempted by concurrent transaction Delete ...
|
|
12
|
+
|
|
13
|
+
To eliminate the race, the flow (``java_index_flow_lancedb.py``) disables the
|
|
14
|
+
in-flight background optimize entirely by raising
|
|
15
|
+
``num_transactions_before_optimize`` to a value that is effectively never
|
|
16
|
+
reached. This module then performs a *single*, serialized optimize after the
|
|
17
|
+
flow returns (exit 0 → no concurrent writers), retrying the rare residual
|
|
18
|
+
commit conflict that two internal compaction passes can still produce.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
|
|
23
|
+
import sys
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
# Single source of truth for the three Lance table names created by the flow.
|
|
27
|
+
# Keep in sync with ``search_lancedb.TABLES`` (the values there mirror these).
|
|
28
|
+
LANCE_TABLE_NAMES: tuple[str, ...] = (
|
|
29
|
+
"javacodeindex_java_code",
|
|
30
|
+
"sqlschemaindex_sql_schema",
|
|
31
|
+
"yamlconfigindex_yaml_config",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Commit conflicts are transient; a handful of exponential-backoff retries is
|
|
35
|
+
# enough because, post-flow, there are no concurrent writers — only successive
|
|
36
|
+
# optimize/compaction passes within this single serialized call can still
|
|
37
|
+
# transiently preempt one another.
|
|
38
|
+
_MAX_ATTEMPTS = 6
|
|
39
|
+
_BASE_BACKOFF_S = 0.1
|
|
40
|
+
|
|
41
|
+
# Substrings identifying the retryable Lance commit-conflict error. LanceDB
|
|
42
|
+
# wraps the underlying lance error text into the raised ``RuntimeError`` str,
|
|
43
|
+
# so a substring match is the robust detector (no dedicated exception type).
|
|
44
|
+
_RETRYABLE_MARKERS = (
|
|
45
|
+
"Retryable commit conflict",
|
|
46
|
+
"preempted by concurrent transaction",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _is_retryable(exc: BaseException) -> bool:
    """Return True when ``str(exc)`` contains any entry of ``_RETRYABLE_MARKERS``."""
    message = str(exc)
    for marker in _RETRYABLE_MARKERS:
        if marker in message:
            return True
    return False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def _list_table_names(db: object) -> set[str]:
|
|
56
|
+
"""Existing table names across LanceDB API variants (``list_tables`` ≥ ``table_names``)."""
|
|
57
|
+
if hasattr(db, "list_tables"):
|
|
58
|
+
response = await db.list_tables()
|
|
59
|
+
return set(getattr(response, "tables", response))
|
|
60
|
+
return set(await db.table_names())
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def optimize_lance_tables(index_dir: Path, *, quiet: bool = False) -> dict[str, str]:
    """Optimize each known Lance table under *index_dir*, one at a time.

    For every name in :data:`LANCE_TABLE_NAMES` that exists in the database,
    ``table.optimize()`` is awaited. A failure whose text matches a retryable
    commit-conflict marker is retried with exponential backoff, up to
    ``_MAX_ATTEMPTS`` attempts in total. Any other failure, or running out of
    attempts, is recorded for that table and logged. All diagnostics go to
    **stderr**, never stdout, because this may be called from stdio-MCP /
    JSON-stdout contexts.

    Args:
        index_dir: directory holding the Lance tables (the flow's LanceDB URI).
        quiet: when True, suppress the per-table success/skip info lines on
            stderr (errors are always logged).

    Returns:
        Mapping of table name -> status: ``"ok"``, ``"skipped"`` (table
        absent, e.g. a repo with no SQL/YAML), or ``"error: <text>"``. If the
        table listing itself fails, every known table maps to
        ``"error: list failed: <text>"``.
    """
    # Deferred import: the flow imports this module only for LANCE_TABLE_NAMES
    # and must not pay the lancedb import cost at flow-definition time.
    import lancedb

    statuses: dict[str, str] = {}
    connection = await lancedb.connect_async(str(index_dir))
    try:
        try:
            present = await _list_table_names(connection)
        except Exception as exc:
            print(
                f"java-codebase-rag: optimize: failed to list tables in "
                f"{index_dir}: {exc}",
                file=sys.stderr,
            )
            return {name: f"error: list failed: {exc}" for name in LANCE_TABLE_NAMES}

        for name in LANCE_TABLE_NAMES:
            if name not in present:
                statuses[name] = "skipped"
                if not quiet:
                    print(
                        f"java-codebase-rag: optimize: {name} absent, skipped",
                        file=sys.stderr,
                    )
                continue

            try:
                handle = await connection.open_table(name)
            except Exception as exc:
                statuses[name] = f"error: open failed: {exc}"
                print(
                    f"java-codebase-rag: optimize: {name} open failed: {exc}",
                    file=sys.stderr,
                )
                continue

            # ``failure`` stays None only if some attempt succeeds.
            failure: BaseException | None = None
            for attempt in range(_MAX_ATTEMPTS):
                try:
                    await handle.optimize()
                except Exception as exc:
                    failure = exc
                    may_retry = _is_retryable(exc) and attempt < _MAX_ATTEMPTS - 1
                    if not may_retry:
                        # Non-retryable, or retries exhausted: report below
                        # rather than swallowing silently.
                        break
                    await asyncio.sleep(_BASE_BACKOFF_S * (2**attempt))
                else:
                    failure = None
                    break

            if failure is not None:
                statuses[name] = f"error: {failure}"
                print(
                    f"java-codebase-rag: optimize: {name} failed: {failure}",
                    file=sys.stderr,
                )
                continue

            statuses[name] = "ok"
            if not quiet:
                print(
                    f"java-codebase-rag: optimize: {name} ok",
                    file=sys.stderr,
                )
    finally:
        # ``AsyncConnection.close`` is a *sync* method in lancedb 0.30.x.
        connection.close()
    return statuses
|