java-codebase-rag 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +24 -7
- build_ast_graph.py +153 -94
- graph_enrich.py +3 -3
- java_codebase_rag/_fdlimit.py +48 -0
- java_codebase_rag/cli.py +31 -28
- java_codebase_rag/config.py +40 -10
- java_codebase_rag/installer.py +99 -10
- java_codebase_rag/lance_optimize.py +148 -0
- java_codebase_rag/pipeline.py +63 -9
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/METADATA +6 -5
- java_codebase_rag-0.6.1.dist-info/RECORD +36 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/top_level.txt +1 -1
- java_index_flow_lancedb.py +22 -4
- java_ontology.py +5 -2
- ladybug_queries.py +1995 -0
- mcp_v2.py +51 -26
- pr_analysis.py +1 -1
- search_lancedb.py +8 -8
- server.py +116 -68
- user_rag/__init__.py +1 -0
- user_rag/cli.py +175 -0
- java_codebase_rag-0.5.3.dist-info/RECORD +0 -31
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/WHEEL +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/entry_points.txt +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/licenses/LICENSE +0 -0
graph_enrich.py
CHANGED
|
@@ -334,7 +334,7 @@ def collect_annotation_meta_chain(
|
|
|
334
334
|
) -> dict[str, frozenset[str]]:
|
|
335
335
|
"""Map annotation simple name → built-in simple names reachable via meta-annotations.
|
|
336
336
|
|
|
337
|
-
Single source of truth for Layer A: both the
|
|
337
|
+
Single source of truth for Layer A: both the LadybugDB writer and Lance chunk
|
|
338
338
|
enrichment must use this; they must not derive `meta_chain` from separate
|
|
339
339
|
filesystem walks. See ``PLAN-BROWNFIELD-ROLE-OVERRIDES`` §
|
|
340
340
|
*Single source of truth (REQUIRED — read before implementation)*.
|
|
@@ -350,7 +350,7 @@ def annotation_meta_decls_from_graph_tables(
|
|
|
350
350
|
"""From `build_ast_graph.GraphTables.types`, map @interface simple name -> meta anns.
|
|
351
351
|
|
|
352
352
|
Used for diagnostics; Layer A in production uses `collect_annotation_meta_chain`
|
|
353
|
-
(disk) so
|
|
353
|
+
(disk) so LadybugDB and Lance share one index.
|
|
354
354
|
"""
|
|
355
355
|
decls: dict[str, tuple[str, ...]] = {}
|
|
356
356
|
first_fqn: dict[str, str] = {}
|
|
@@ -1702,7 +1702,7 @@ def enrich_chunk(
|
|
|
1702
1702
|
|
|
1703
1703
|
|
|
1704
1704
|
def symbol_id(kind: str, fqn: str, file_path: str = "", start_byte: int = 0) -> str:
    """Return the deterministic SHA1 hex id used for LadybugDB Symbol nodes.

    The id is the SHA1 hex digest of the UTF-8 encoding of the four fields
    joined with ``|`` in the order ``kind|fqn|file_path|start_byte``, so the
    same inputs always produce the same 40-character id.
    """
    hasher = hashlib.sha1()
    # Same pipe-separated layout as before; only the formatting call differs.
    hasher.update("{}|{}|{}|{}".format(kind, fqn, file_path, start_byte).encode("utf-8"))
    return hasher.hexdigest()
|
|
1708
1708
|
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Raise the process soft file-descriptor limit to avoid LanceDB EMFILE.
|
|
2
|
+
|
|
3
|
+
LanceDB's merge-insert path opens many file handles concurrently; under the
|
|
4
|
+
default OS soft ``RLIMIT_NOFILE`` (256 on macOS processes launched by GUI /
|
|
5
|
+
launchd / IDE hosts, *not* the shell's raised limit) this exhausts file
|
|
6
|
+
descriptors and surfaces as::
|
|
7
|
+
|
|
8
|
+
RuntimeError: lance error: LanceError(IO): ... Too many open files (os error 24)
|
|
9
|
+
lance-io-4.0.0/src/local.rs:133:24
|
|
10
|
+
|
|
11
|
+
``raise_fd_limit`` raises the process's *own* soft limit toward its hard limit.
|
|
12
|
+
``RLIMIT_NOFILE`` is inherited across ``fork``+``exec``, so every CocoIndex /
|
|
13
|
+
``cocoindex-code`` child spawned afterwards inherits the headroom. This fixes the
|
|
14
|
+
failure regardless of launch context (shell vs IDE vs MCP host) and regardless of
|
|
15
|
+
Lance's internal IO concurrency.
|
|
16
|
+
|
|
17
|
+
Never raise to ``RLIM_INFINITY`` — that breaks ``select()``/kqueue and Python
|
|
18
|
+
selectors on macOS; ``cap`` bounds the target to a safe value.
|
|
19
|
+
|
|
20
|
+
See https://github.com/HumanBean17/java-codebase-rag/issues/306
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
try:
    import resource
except ImportError:  # pragma: no cover - non-Unix platforms (e.g. Windows)
    # ``resource`` is Unix-only. Importing this module must still succeed so
    # that ``raise_fd_limit`` can honour its "no-op on Windows" contract
    # instead of crashing every caller at import time.
    resource = None  # type: ignore[assignment]

# Safe ceiling well above LanceDB's appetite, comfortably below macOS libc
# quirks. The hard limit caps it further if lower (locked-down servers).
_DEFAULT_CAP = 65536


def raise_fd_limit(cap: int = _DEFAULT_CAP) -> None:
    """Raise this process's soft ``RLIMIT_NOFILE`` toward its hard limit.

    Best-effort and silent: never raises, and never lowers the current soft
    limit. No-op where ``RLIMIT_NOFILE`` is unsupported (Windows) or where the
    soft limit already meets the target.

    Args:
        cap: Upper bound for the new soft limit. The target is ``cap`` when
            the hard limit is unlimited, otherwise ``min(hard, cap)``.
    """
    if resource is None or not hasattr(resource, "RLIMIT_NOFILE"):
        return
    try:
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    except (ValueError, OSError):
        # Keep the "never raises" promise even if the query itself fails.
        return

    # ``RLIM_INFINITY`` is not guaranteed to be a large positive number; it
    # may be represented as -1 (as on Linux builds of CPython). Compare
    # against it explicitly rather than relying on ``min``/``>=`` ordering.
    unlimited = resource.RLIM_INFINITY
    if soft == unlimited:
        return  # Already unlimited: nothing to raise, and we must not lower.
    target = cap if hard == unlimited else min(hard, cap)

    # Some kernels reject a soft limit above a per-process maximum even when
    # it is below the hard limit (documented for macOS ``setrlimit``). Rather
    # than giving up on the first rejection, retry with progressively smaller
    # targets so we still gain whatever headroom the system allows.
    while target > soft:
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))
        except (ValueError, OSError):
            # Best-effort: a locked-down environment shouldn't fail the run.
            target //= 2
        else:
            return
|
java_codebase_rag/cli.py
CHANGED
|
@@ -21,13 +21,14 @@ from java_codebase_rag.config import (
|
|
|
21
21
|
index_dir_has_existing_artifacts,
|
|
22
22
|
resolve_operator_config,
|
|
23
23
|
)
|
|
24
|
+
from java_codebase_rag._fdlimit import raise_fd_limit
|
|
24
25
|
from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update, run_incremental_graph
|
|
25
26
|
from java_ontology import VALID_UNRESOLVED_CALL_REASONS
|
|
26
27
|
|
|
27
|
-
|
|
28
|
+
LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"
|
|
28
29
|
|
|
29
30
|
_INCREMENT_WARNING_LINES = (
|
|
30
|
-
"WARNING: AST graph (
|
|
31
|
+
"WARNING: AST graph (LadybugDB) incremental rebuild is not yet implemented.",
|
|
31
32
|
"The graph reflects the index state from the last `init` or `reprocess`,",
|
|
32
33
|
"which means `find`, `neighbors`, and `describe` may return stale results",
|
|
33
34
|
"for files changed since then.",
|
|
@@ -37,8 +38,8 @@ _INCREMENT_WARNING_LINES = (
|
|
|
37
38
|
"For an up-to-date graph, run:",
|
|
38
39
|
" java-codebase-rag reprocess",
|
|
39
40
|
"",
|
|
40
|
-
"Track progress on
|
|
41
|
-
f" {
|
|
41
|
+
"Track progress on LadybugDB incremental rebuild:",
|
|
42
|
+
f" {LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL}",
|
|
42
43
|
)
|
|
43
44
|
|
|
44
45
|
_REFRESH_DEPRECATION = (
|
|
@@ -47,7 +48,7 @@ _REFRESH_DEPRECATION = (
|
|
|
47
48
|
)
|
|
48
49
|
|
|
49
50
|
_REPROCESS_DRIFT_VECTORS_ONLY = (
|
|
50
|
-
"java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.
|
|
51
|
+
"java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.lbug) was NOT rebuilt "
|
|
51
52
|
"and may now reflect a stale source snapshot."
|
|
52
53
|
)
|
|
53
54
|
|
|
@@ -178,7 +179,7 @@ def _emit(value: Any) -> None:
|
|
|
178
179
|
print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None))
|
|
179
180
|
|
|
180
181
|
|
|
181
|
-
def
|
|
182
|
+
def _emit_increment_ladybug_warning() -> None:
    """Print each entry of ``_INCREMENT_WARNING_LINES`` to stderr, one per line."""
    target_stream = sys.stderr
    for warning_line in _INCREMENT_WARNING_LINES:
        print(warning_line, file=target_stream)
|
|
184
185
|
|
|
@@ -289,7 +290,7 @@ def _cmd_init(args: argparse.Namespace) -> int:
|
|
|
289
290
|
print(file=sys.stderr, flush=True)
|
|
290
291
|
g = run_build_ast_graph(
|
|
291
292
|
source_root=cfg.source_root,
|
|
292
|
-
|
|
293
|
+
ladybug_path=cfg.ladybug_path,
|
|
293
294
|
verbose=verbose,
|
|
294
295
|
quiet=bool(args.quiet),
|
|
295
296
|
env=env,
|
|
@@ -319,7 +320,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
|
|
|
319
320
|
# Check for --vectors-only flag
|
|
320
321
|
vectors_only = bool(getattr(args, "vectors_only", False))
|
|
321
322
|
if vectors_only:
|
|
322
|
-
|
|
323
|
+
_emit_increment_ladybug_warning()
|
|
323
324
|
|
|
324
325
|
def work() -> int:
|
|
325
326
|
env = cfg.subprocess_env()
|
|
@@ -350,7 +351,7 @@ def _cmd_increment(args: argparse.Namespace) -> int:
|
|
|
350
351
|
# Run incremental graph update
|
|
351
352
|
g = run_incremental_graph(
|
|
352
353
|
source_root=cfg.source_root,
|
|
353
|
-
|
|
354
|
+
ladybug_path=cfg.ladybug_path,
|
|
354
355
|
verbose=bool(args.verbose),
|
|
355
356
|
quiet=bool(args.quiet),
|
|
356
357
|
env=env,
|
|
@@ -437,7 +438,7 @@ def _cmd_reprocess(args: argparse.Namespace) -> int:
|
|
|
437
438
|
if graph_only:
|
|
438
439
|
g = run_build_ast_graph(
|
|
439
440
|
source_root=cfg.source_root,
|
|
440
|
-
|
|
441
|
+
ladybug_path=cfg.ladybug_path,
|
|
441
442
|
verbose=verbose,
|
|
442
443
|
quiet=bool(args.quiet),
|
|
443
444
|
env=env,
|
|
@@ -509,7 +510,7 @@ def _cmd_erase(args: argparse.Namespace) -> int:
|
|
|
509
510
|
cfg = _resolved_from_ns(args)
|
|
510
511
|
_startup_hints(cfg)
|
|
511
512
|
cfg.apply_to_os_environ()
|
|
512
|
-
to_describe: list[Path] = [cfg.
|
|
513
|
+
to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db]
|
|
513
514
|
if cfg.index_dir.is_dir():
|
|
514
515
|
try:
|
|
515
516
|
import lancedb
|
|
@@ -546,8 +547,8 @@ def _cmd_erase(args: argparse.Namespace) -> int:
|
|
|
546
547
|
)
|
|
547
548
|
elif drop.returncode != 0:
|
|
548
549
|
print(clip(drop.stderr, 4000), file=sys.stderr)
|
|
549
|
-
if cfg.
|
|
550
|
-
shutil.rmtree(cfg.
|
|
550
|
+
if cfg.ladybug_path.exists():
|
|
551
|
+
shutil.rmtree(cfg.ladybug_path, ignore_errors=True)
|
|
551
552
|
if cfg.cocoindex_db.exists():
|
|
552
553
|
try:
|
|
553
554
|
cfg.cocoindex_db.unlink()
|
|
@@ -577,17 +578,17 @@ def _cmd_meta(args: argparse.Namespace) -> int:
|
|
|
577
578
|
cfg = _resolved_from_ns(args)
|
|
578
579
|
_startup_hints(cfg)
|
|
579
580
|
cfg.apply_to_os_environ()
|
|
580
|
-
from
|
|
581
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
581
582
|
|
|
582
|
-
|
|
583
|
-
|
|
583
|
+
LadybugGraph._instance = None
|
|
584
|
+
LadybugGraph._instance_path = None
|
|
584
585
|
payload = server._graph_meta_output().model_dump()
|
|
585
586
|
payload["embedding_model"] = cfg.embedding_model
|
|
586
587
|
payload["embedding_device"] = cfg.embedding_device
|
|
587
588
|
payload["embedding_model_source"] = cfg.embedding_model_source
|
|
588
589
|
payload["embedding_device_source"] = cfg.embedding_device_source
|
|
589
590
|
payload["index_dir"] = str(cfg.index_dir.resolve())
|
|
590
|
-
payload["
|
|
591
|
+
payload["ladybug_path"] = str(cfg.ladybug_path.resolve())
|
|
591
592
|
payload["index_dir_source"] = cfg.index_dir_source
|
|
592
593
|
payload["hints_enabled"] = cfg.hints_enabled
|
|
593
594
|
payload["hints_enabled_source"] = cfg.hints_enabled_source
|
|
@@ -637,12 +638,12 @@ def _cmd_unresolved_calls_list(args: argparse.Namespace) -> int:
|
|
|
637
638
|
cfg = _resolved_from_ns(args)
|
|
638
639
|
_startup_hints(cfg)
|
|
639
640
|
cfg.apply_to_os_environ()
|
|
640
|
-
from
|
|
641
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
641
642
|
|
|
642
|
-
if not
|
|
643
|
+
if not LadybugGraph.exists():
|
|
643
644
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
644
645
|
return 1
|
|
645
|
-
graph =
|
|
646
|
+
graph = LadybugGraph.get()
|
|
646
647
|
rows = graph.list_unresolved_call_sites(
|
|
647
648
|
method_id=args.method_id,
|
|
648
649
|
reason=args.reason,
|
|
@@ -658,12 +659,12 @@ def _cmd_unresolved_calls_stats(args: argparse.Namespace) -> int:
|
|
|
658
659
|
cfg = _resolved_from_ns(args)
|
|
659
660
|
_startup_hints(cfg)
|
|
660
661
|
cfg.apply_to_os_environ()
|
|
661
|
-
from
|
|
662
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
662
663
|
|
|
663
|
-
if not
|
|
664
|
+
if not LadybugGraph.exists():
|
|
664
665
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
665
666
|
return 1
|
|
666
|
-
graph =
|
|
667
|
+
graph = LadybugGraph.get()
|
|
667
668
|
buckets = graph.stats_unresolved_call_sites(by=args.by)
|
|
668
669
|
total = sum(int(r.get("n") or 0) for r in buckets)
|
|
669
670
|
_emit({"success": True, "total": total, "by": args.by, "buckets": buckets})
|
|
@@ -683,12 +684,12 @@ def _cmd_analyze_pr(args: argparse.Namespace) -> int:
|
|
|
683
684
|
_emit({"success": False, "message": "Diff is empty"})
|
|
684
685
|
return 1
|
|
685
686
|
import pr_analysis # lazy
|
|
686
|
-
from
|
|
687
|
+
from ladybug_queries import LadybugGraph # lazy
|
|
687
688
|
|
|
688
|
-
if not
|
|
689
|
+
if not LadybugGraph.exists():
|
|
689
690
|
_emit({"success": False, "message": "Kuzu graph not found"})
|
|
690
691
|
return 1
|
|
691
|
-
graph =
|
|
692
|
+
graph = LadybugGraph.get()
|
|
692
693
|
report = pr_analysis.analyze_pr_pipeline(graph, diff_text)
|
|
693
694
|
_emit(pr_analysis.pr_report_to_dict(report))
|
|
694
695
|
return 0
|
|
@@ -774,8 +775,9 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
774
775
|
help="Refresh shipped artifacts (skill, agent, MCP entry) after pip upgrade.",
|
|
775
776
|
description=(
|
|
776
777
|
"Post-upgrade refresh: overwrites skill and agent files with the latest "
|
|
777
|
-
"shipped versions and updates the MCP command path.
|
|
778
|
-
"
|
|
778
|
+
"shipped versions and updates the MCP command path. If an index exists, "
|
|
779
|
+
"also runs an incremental Lance + graph catch-up (same as `increment`). "
|
|
780
|
+
"Use --dry-run to preview changes without writing. Requires a prior `install` run."
|
|
779
781
|
),
|
|
780
782
|
)
|
|
781
783
|
update.add_argument(
|
|
@@ -902,6 +904,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
902
904
|
|
|
903
905
|
|
|
904
906
|
def main(argv: list[str] | None = None) -> int:
|
|
907
|
+
raise_fd_limit()
|
|
905
908
|
raw = list(argv if argv is not None else sys.argv[1:])
|
|
906
909
|
if raw and raw[0] == "refresh":
|
|
907
910
|
print(_REFRESH_DEPRECATION, file=sys.stderr)
|
java_codebase_rag/config.py
CHANGED
|
@@ -25,6 +25,27 @@ ENV_SOURCE_ROOT = "JAVA_CODEBASE_RAG_SOURCE_ROOT"
|
|
|
25
25
|
ENV_DEBUG_CONTEXT = "JAVA_CODEBASE_RAG_DEBUG_CONTEXT"
|
|
26
26
|
ENV_RUN_HEAVY = "JAVA_CODEBASE_RAG_RUN_HEAVY"
|
|
27
27
|
|
|
28
|
+
# CocoIndex inflight-component throttle. CocoIndex's default is 1024 inflight
|
|
29
|
+
# components (cocoindex/_internal/app.py: ``_ENV_MAX_INFLIGHT_COMPONENTS``),
|
|
30
|
+
# which spawns enough concurrent LanceDB merge-inserts to exhaust OS file
|
|
31
|
+
# descriptors under default ulimits -> "Too many open files (os error 24)".
|
|
32
|
+
# NOTE: this is the REAL env var. An earlier fix (#293) set the non-existent
|
|
33
|
+
# ``COCOINDEX_SOURCE_MAX_INFLIGHT_ROWS`` — CocoIndex never reads it, so it was a
|
|
34
|
+
# no-op and the EMFILE error recurred (#306).
|
|
35
|
+
COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV = "COCOINDEX_MAX_INFLIGHT_COMPONENTS"
COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS = "256"


def cocoindex_subprocess_env_defaults() -> dict[str, str]:
    """Return the concurrency-bounding env defaults for CocoIndex subprocesses.

    A fresh mapping is built on every call. Callers are expected to merge it
    with ``env.setdefault(...)`` so an operator-supplied value takes
    precedence over the default. See :issue:`306`.
    """
    defaults: dict[str, str] = {}
    defaults[COCOINDEX_MAX_INFLIGHT_COMPONENTS_ENV] = COCOINDEX_DEFAULT_MAX_INFLIGHT_COMPONENTS
    return defaults
|
|
48
|
+
|
|
28
49
|
_DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
29
50
|
|
|
30
51
|
# Matches either $VAR or ${VAR} (POSIX shell variable syntax).
|
|
@@ -67,10 +88,9 @@ def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
|
|
|
67
88
|
# Legacy env keys: never honored; detection-only hints name the replacement (if any).
|
|
68
89
|
_LEGACY_ENV_HINTS: tuple[tuple[str, str], ...] = (
|
|
69
90
|
("LANCEDB_URI", "JAVA_CODEBASE_RAG_INDEX_DIR"),
|
|
70
|
-
("KUZU_DB_PATH", "JAVA_CODEBASE_RAG_INDEX_DIR (Kuzu lives at <index_dir>/code_graph.kuzu)"),
|
|
71
91
|
("LANCEDB_MCP_PROJECT_ROOT", "cwd or --source-root (no env replacement)"),
|
|
72
92
|
("LANCEDB_MCP_ALLOW_REFRESH", "(removed; use init / increment / reprocess / erase)"),
|
|
73
|
-
("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.
|
|
93
|
+
("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.lbug exists)"),
|
|
74
94
|
("LANCEDB_MCP_MICROSERVICE_ROOTS", "microservice_roots: in .java-codebase-rag.yml"),
|
|
75
95
|
("LANCEDB_MCP_DEBUG_CONTEXT", ENV_DEBUG_CONTEXT),
|
|
76
96
|
("LANCEDB_MCP_RUN_HEAVY", ENV_RUN_HEAVY),
|
|
@@ -182,7 +202,7 @@ def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
|
|
|
182
202
|
class ResolvedOperatorConfig:
|
|
183
203
|
source_root: Path
|
|
184
204
|
index_dir: Path
|
|
185
|
-
|
|
205
|
+
ladybug_path: Path
|
|
186
206
|
cocoindex_db: Path
|
|
187
207
|
embedding_model: str
|
|
188
208
|
embedding_device: str | None
|
|
@@ -193,7 +213,7 @@ class ResolvedOperatorConfig:
|
|
|
193
213
|
hints_enabled_source: SettingSource
|
|
194
214
|
|
|
195
215
|
def apply_to_os_environ(self) -> None:
|
|
196
|
-
"""Make downstream modules (server,
|
|
216
|
+
"""Make downstream modules (server, ladybug_queries, flows) see a consistent environment.
|
|
197
217
|
|
|
198
218
|
When ``embedding_device`` is unset, ``SBERT_DEVICE`` is not removed from ``os.environ`` so
|
|
199
219
|
a long-lived host process is not mutated for unrelated callers; subprocesses still use
|
|
@@ -286,9 +306,19 @@ def _pick_bool(
|
|
|
286
306
|
def _resolve_index_dir_path(
|
|
287
307
|
*,
|
|
288
308
|
source_root: Path,
|
|
309
|
+
config_dir: Path,
|
|
289
310
|
cli_index_dir: str | None,
|
|
290
311
|
yaml_dict: dict[str, Any],
|
|
291
312
|
) -> tuple[Path, SettingSource]:
|
|
313
|
+
# Bases for relative paths:
|
|
314
|
+
# - YAML ``index_dir`` -> the config file's directory (``config_dir``),
|
|
315
|
+
# the SAME base used for YAML ``source_root``. Paths written in the
|
|
316
|
+
# config file are relative to the file, so both keys stay consistent.
|
|
317
|
+
# - CLI / env ``index_dir`` -> ``source_root`` (unchanged). These are not
|
|
318
|
+
# "in the config file"; preserving the existing base avoids a semantics
|
|
319
|
+
# change for operators who pass ``--index-dir`` on the command line.
|
|
320
|
+
# - Default ``./.java-codebase-rag`` -> ``source_root`` so the index sits
|
|
321
|
+
# beside the Java tree (the layout ``discover_project_root`` anchors on).
|
|
292
322
|
raw_cli = cli_index_dir.strip() if isinstance(cli_index_dir, str) else None
|
|
293
323
|
if raw_cli:
|
|
294
324
|
p = Path(raw_cli).expanduser()
|
|
@@ -304,7 +334,7 @@ def _resolve_index_dir_path(
|
|
|
304
334
|
idx = yaml_dict.get("index_dir")
|
|
305
335
|
if isinstance(idx, str) and idx.strip():
|
|
306
336
|
p = Path(idx.strip()).expanduser()
|
|
307
|
-
out = p.resolve() if p.is_absolute() else (
|
|
337
|
+
out = p.resolve() if p.is_absolute() else (config_dir / p).resolve()
|
|
308
338
|
return out, "yaml"
|
|
309
339
|
|
|
310
340
|
return (source_root / ".java-codebase-rag").resolve(), "default"
|
|
@@ -348,7 +378,7 @@ def resolve_operator_config(
|
|
|
348
378
|
root = config_dir
|
|
349
379
|
|
|
350
380
|
index_dir, index_src = _resolve_index_dir_path(
|
|
351
|
-
source_root=root, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict
|
|
381
|
+
source_root=root, config_dir=config_dir, cli_index_dir=cli_index_dir, yaml_dict=yaml_dict
|
|
352
382
|
)
|
|
353
383
|
model, model_src = _pick_str(
|
|
354
384
|
cli_val=cli_embedding_model,
|
|
@@ -369,12 +399,12 @@ def resolve_operator_config(
|
|
|
369
399
|
yaml_path=("hints", "enabled"),
|
|
370
400
|
default=True,
|
|
371
401
|
)
|
|
372
|
-
ku = index_dir / "code_graph.
|
|
402
|
+
ku = index_dir / "code_graph.lbug"
|
|
373
403
|
coco = index_dir / "cocoindex.db"
|
|
374
404
|
return ResolvedOperatorConfig(
|
|
375
405
|
source_root=root,
|
|
376
406
|
index_dir=index_dir,
|
|
377
|
-
|
|
407
|
+
ladybug_path=ku,
|
|
378
408
|
cocoindex_db=coco,
|
|
379
409
|
embedding_model=model,
|
|
380
410
|
embedding_device=device,
|
|
@@ -387,9 +417,9 @@ def resolve_operator_config(
|
|
|
387
417
|
|
|
388
418
|
|
|
389
419
|
def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
|
|
390
|
-
"""True if
|
|
420
|
+
"""True if graph dir or any Lance table already exists under index_dir."""
|
|
391
421
|
paths: list[str] = []
|
|
392
|
-
ku = index_dir / "code_graph.
|
|
422
|
+
ku = index_dir / "code_graph.lbug"
|
|
393
423
|
if ku.exists():
|
|
394
424
|
paths.append(str(ku.resolve()))
|
|
395
425
|
if index_dir.is_dir():
|
java_codebase_rag/installer.py
CHANGED
|
@@ -325,6 +325,66 @@ def select_hosts(*, non_interactive: bool, cli_agents: list[str] | None) -> list
|
|
|
325
325
|
return [HOSTS[name] for name in selected]
|
|
326
326
|
|
|
327
327
|
|
|
328
|
+
def select_microservices(
|
|
329
|
+
java_dirs: list[Path],
|
|
330
|
+
*,
|
|
331
|
+
non_interactive: bool,
|
|
332
|
+
preselected: list[str] | None = None,
|
|
333
|
+
) -> list[str] | None:
|
|
334
|
+
"""Show an interactive checklist of detected microservices, all pre-checked.
|
|
335
|
+
|
|
336
|
+
Returns None when all are selected (-> microservice_roots omitted, index
|
|
337
|
+
everything) or a non-empty subset list. Never returns [].
|
|
338
|
+
|
|
339
|
+
Args:
|
|
340
|
+
java_dirs: Detected module roots (relative Path names) from
|
|
341
|
+
detect_java_directories. Caller must pass len >= 2.
|
|
342
|
+
non_interactive: If True, return None (all) without prompting.
|
|
343
|
+
preselected: On re-run, the prior microservice_roots subset to pre-check.
|
|
344
|
+
"""
|
|
345
|
+
# Defensive guard: caller gates on len >= 2, but stay safe if called directly.
|
|
346
|
+
if len(java_dirs) < 2:
|
|
347
|
+
return None
|
|
348
|
+
|
|
349
|
+
dir_names = [str(d) for d in java_dirs]
|
|
350
|
+
|
|
351
|
+
if non_interactive:
|
|
352
|
+
return None
|
|
353
|
+
|
|
354
|
+
preselected_set = set(preselected) if preselected else None
|
|
355
|
+
choices = [
|
|
356
|
+
{
|
|
357
|
+
"name": name,
|
|
358
|
+
"value": name,
|
|
359
|
+
"checked": (name in preselected_set) if preselected_set is not None else True,
|
|
360
|
+
}
|
|
361
|
+
for name in dir_names
|
|
362
|
+
]
|
|
363
|
+
|
|
364
|
+
print("Note: Select which modules to index. Toggle with Space, confirm with Enter.")
|
|
365
|
+
selected = prompt(
|
|
366
|
+
"checkbox",
|
|
367
|
+
"Select microservices to index:",
|
|
368
|
+
choices=choices,
|
|
369
|
+
default=dir_names, # non-TTY fallback returns all -> caller omits key
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
if not selected:
|
|
373
|
+
retry = prompt(
|
|
374
|
+
"confirm",
|
|
375
|
+
"At least one module is required. Re-select?",
|
|
376
|
+
)
|
|
377
|
+
if retry:
|
|
378
|
+
return select_microservices(java_dirs, non_interactive=False, preselected=preselected)
|
|
379
|
+
raise SystemExit(2)
|
|
380
|
+
|
|
381
|
+
selected_set = set(selected)
|
|
382
|
+
if selected_set == set(dir_names):
|
|
383
|
+
return None
|
|
384
|
+
# Preserve detection order for deterministic YAML output.
|
|
385
|
+
return [name for name in dir_names if name in selected_set]
|
|
386
|
+
|
|
387
|
+
|
|
328
388
|
def select_scope(*, non_interactive: bool, cli_scope: str | None) -> Scope:
|
|
329
389
|
"""Select 'project' or 'user' scope.
|
|
330
390
|
|
|
@@ -791,7 +851,7 @@ def run_init_if_needed(
|
|
|
791
851
|
# Run AST graph build
|
|
792
852
|
g = run_build_ast_graph(
|
|
793
853
|
source_root=cfg.source_root,
|
|
794
|
-
|
|
854
|
+
ladybug_path=cfg.ladybug_path,
|
|
795
855
|
verbose=not quiet,
|
|
796
856
|
quiet=quiet,
|
|
797
857
|
env=env,
|
|
@@ -1182,7 +1242,7 @@ def run_update(
|
|
|
1182
1242
|
index_dir_has_existing_artifacts,
|
|
1183
1243
|
resolve_operator_config,
|
|
1184
1244
|
)
|
|
1185
|
-
from java_codebase_rag.pipeline import run_cocoindex_update
|
|
1245
|
+
from java_codebase_rag.pipeline import run_cocoindex_update, run_incremental_graph
|
|
1186
1246
|
|
|
1187
1247
|
project_root = discover_project_root(cwd)
|
|
1188
1248
|
if project_root is None:
|
|
@@ -1207,22 +1267,37 @@ def run_update(
|
|
|
1207
1267
|
print("Run `java-codebase-rag install` to create one.")
|
|
1208
1268
|
return EXIT_PARTIAL if has_artifact_failures else EXIT_SUCCESS
|
|
1209
1269
|
|
|
1210
|
-
# Run increment
|
|
1270
|
+
# Run increment: LanceDB catch-up + incremental graph rebuild.
|
|
1271
|
+
# Mirrors `java-codebase-rag increment` so both index layers stay current.
|
|
1272
|
+
# The "graph not implemented" warning belongs only on the vectors-only path
|
|
1273
|
+
# (increment --vectors-only), where the graph step is deliberately skipped.
|
|
1211
1274
|
if not dry_run:
|
|
1212
|
-
print("\nUpdating index (
|
|
1275
|
+
print("\nUpdating index (Lance + graph)...")
|
|
1213
1276
|
cfg.apply_to_os_environ()
|
|
1214
1277
|
env = cfg.subprocess_env()
|
|
1215
1278
|
|
|
1216
1279
|
coco = run_cocoindex_update(env, full_reprocess=False, quiet=True)
|
|
1217
1280
|
if coco.returncode != 0:
|
|
1218
|
-
print(f"Error:
|
|
1281
|
+
print(f"Error: Lance index update failed with code {coco.returncode}")
|
|
1219
1282
|
return 1
|
|
1220
1283
|
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1284
|
+
g = run_incremental_graph(
|
|
1285
|
+
source_root=cfg.source_root,
|
|
1286
|
+
ladybug_path=cfg.ladybug_path,
|
|
1287
|
+
verbose=False,
|
|
1288
|
+
quiet=True,
|
|
1289
|
+
env=env,
|
|
1290
|
+
)
|
|
1291
|
+
if g.returncode != 0:
|
|
1292
|
+
# Artifacts above already refreshed; the graph catch-up is best-effort
|
|
1293
|
+
# here. Surface a truthful, actionable message instead of leaving the
|
|
1294
|
+
# graph silently stale or claiming the feature is unimplemented.
|
|
1295
|
+
print(
|
|
1296
|
+
f"\nWarning: incremental graph update failed (exit {g.returncode}). "
|
|
1297
|
+
"Run `java-codebase-rag reprocess` for a full rebuild."
|
|
1298
|
+
)
|
|
1224
1299
|
else:
|
|
1225
|
-
print("\nWould run incremental index update.")
|
|
1300
|
+
print("\nWould run incremental index update (Lance + graph).")
|
|
1226
1301
|
|
|
1227
1302
|
# Print summary
|
|
1228
1303
|
print("\nUpdate complete.")
|
|
@@ -1270,6 +1345,20 @@ def run_install(
|
|
|
1270
1345
|
except SystemExit as e:
|
|
1271
1346
|
return e.code
|
|
1272
1347
|
|
|
1348
|
+
# Stage 1 (Case B): interactive microservice selection (only when 2+ detected)
|
|
1349
|
+
try:
|
|
1350
|
+
selected_roots = (
|
|
1351
|
+
select_microservices(
|
|
1352
|
+
java_dirs,
|
|
1353
|
+
non_interactive=non_interactive,
|
|
1354
|
+
preselected=existing_config.get("microservice_roots") if existing_config else None,
|
|
1355
|
+
)
|
|
1356
|
+
if len(java_dirs) >= 2
|
|
1357
|
+
else None
|
|
1358
|
+
)
|
|
1359
|
+
except SystemExit as e:
|
|
1360
|
+
return e.code
|
|
1361
|
+
|
|
1273
1362
|
# Stage 2: Embedding model
|
|
1274
1363
|
resolved_model = resolve_model(model, non_interactive=non_interactive)
|
|
1275
1364
|
|
|
@@ -1312,7 +1401,7 @@ def run_install(
|
|
|
1312
1401
|
yaml_content = generate_yaml_config(
|
|
1313
1402
|
source_root,
|
|
1314
1403
|
resolved_model,
|
|
1315
|
-
microservice_roots=
|
|
1404
|
+
microservice_roots=selected_roots,
|
|
1316
1405
|
existing_yaml=existing_config,
|
|
1317
1406
|
)
|
|
1318
1407
|
|