java-codebase-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,761 @@
1
+ from __future__ import annotations
2
+
3
+ # Heavy imports (`server`, `pr_analysis`, `path_filtering.LayeredIgnore`) stay lazy
4
+ # inside handlers so `java-codebase-rag --help` stays fast.
5
+
6
+ import argparse
7
+ import asyncio
8
+ import json
9
+ import pprint
10
+ import shutil
11
+ import sys
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Any, Callable
15
+
16
+ from java_codebase_rag.config import (
17
+ ResolvedOperatorConfig,
18
+ describe_path_sizes,
19
+ emit_legacy_env_hints_if_present,
20
+ emit_legacy_yaml_hint_if_needed,
21
+ index_dir_has_existing_artifacts,
22
+ resolve_operator_config,
23
+ )
24
+ from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update
25
+ from java_ontology import VALID_UNRESOLVED_CALL_REASONS
26
+
27
# Issue URL embedded in the incremental-rebuild warning below.
KUZU_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"

# Warning text written to stderr, one entry per line, by
# `_emit_increment_kuzu_warning` (called from the `increment` handler).
_INCREMENT_WARNING_LINES = (
    "WARNING: AST graph (Kuzu) incremental rebuild is not yet implemented.",
    "The graph reflects the index state from the last `init` or `reprocess`,",
    "which means `find`, `neighbors`, and `describe` may return stale results",
    "for files changed since then.",
    "",
    "Lance vector index has been updated incrementally and is current.",
    "",
    "For an up-to-date graph, run:",
    " java-codebase-rag reprocess",
    "",
    "Track progress on Kuzu incremental rebuild:",
    f" {KUZU_INCREMENTAL_TRACKING_ISSUE_URL}",
)

# Printed to stderr by `main` when the first CLI argument is the `refresh` alias.
_REFRESH_DEPRECATION = (
    "WARN: 'refresh' is deprecated; use 'reprocess'. "
    "This alias will be removed in the next release."
)

# Printed to stderr after a successful `reprocess --vectors-only` run.
_REPROCESS_DRIFT_VECTORS_ONLY = (
    "java-codebase-rag reprocess: rebuilt vectors only; graph (code_graph.kuzu) was NOT rebuilt "
    "and may now reflect a stale source snapshot."
)
53
+
54
+
55
+ def _reprocess_drift_graph_only_line(index_dir: Path) -> str:
56
+ return (
57
+ "java-codebase-rag reprocess: rebuilt graph only; vectors (Lance tables under "
58
+ f"{index_dir}) were NOT rebuilt and may now reflect a stale source snapshot."
59
+ )
60
+
61
+
62
+ def _reprocess_exit_code(payload: dict[str, Any]) -> int:
63
+ if payload.get("success"):
64
+ return 0
65
+ phases_run = payload.get("phases_run") or []
66
+ if not phases_run:
67
+ return 2
68
+ return 1
69
+
70
+
71
+ # Preflight detection must stay aligned with stub CompletedProcess shapes in
72
+ # java_codebase_rag/pipeline.py (missing cocoindex / flow / build_ast_graph.py).
73
+ def _is_cocoindex_preflight_blocker(coco: Any) -> bool:
74
+ """True when ``run_cocoindex_update`` returned without spawning cocoindex."""
75
+ return bool(coco.returncode in (126, 127) and len(getattr(coco, "args", ()) or ()) <= 1)
76
+
77
+
78
+ def _is_graph_preflight_blocker(g: Any) -> bool:
79
+ """True when ``run_build_ast_graph`` returned without spawning the builder."""
80
+ return bool(g.returncode in (126, 127) and len(getattr(g, "args", ()) or ()) <= 1)
81
+
82
+
83
+ def _emit_reprocess_selective_tty(*, mode: str) -> None:
84
+ if mode == "vectors":
85
+ print("Rebuilt: vectors")
86
+ print("Skipped: graph (use `java-codebase-rag reprocess --graph-only` or `reprocess` to refresh)")
87
+ else:
88
+ print("Rebuilt: graph")
89
+ print("Skipped: vectors (use `java-codebase-rag reprocess --vectors-only` or `reprocess` to refresh)")
90
+
91
+
92
def _emit_reprocess_outcome(payload: dict[str, Any], *, selective_tty_mode: str | None = None) -> None:
    """Emit the result of a ``reprocess`` run on stdout.

    The short human summary is printed only when the payload reports success,
    a ``selective_tty_mode`` was supplied, and stdout is a TTY. In every other
    case the full payload goes through ``_emit``.
    """
    use_human_summary = (
        bool(payload.get("success")) and bool(selective_tty_mode) and sys.stdout.isatty()
    )
    if not use_human_summary:
        _emit(payload)
        return
    _emit_reprocess_selective_tty(mode=selective_tty_mode)
97
+
98
+
99
+ _PIPELINE_SEP = "\u00b7"
100
+
101
+
102
+ def _pipeline_header(subcommand: str, cfg: ResolvedOperatorConfig) -> None:
103
+ root = cfg.source_root.resolve()
104
+ idx = cfg.index_dir.resolve()
105
+ print(
106
+ f"java-codebase-rag {subcommand} {_PIPELINE_SEP} source={root} {_PIPELINE_SEP} index={idx}",
107
+ file=sys.stderr,
108
+ flush=True,
109
+ )
110
+
111
+
112
def _pipeline_footer(subcommand: str, started: float, exit_code: int) -> None:
    """Write the pipeline completion line to stderr and flush it.

    Args:
        subcommand: Subcommand name shown in the line.
        started: ``time.perf_counter()`` reading taken when the run began.
        exit_code: Exit code reported in the line.
    """
    seconds = time.perf_counter() - started
    summary = f"java-codebase-rag {subcommand} {_PIPELINE_SEP} finished in {seconds:.2f}s (exit={exit_code})"
    print(summary, file=sys.stderr, flush=True)
119
+
120
+
121
def _run_with_pipeline_progress(
    subcommand: str,
    cfg: ResolvedOperatorConfig,
    *,
    quiet: bool,
    work: Callable[[], int],
) -> int:
    """Run ``work`` framed by a stderr header and footer unless ``quiet`` is set.

    Args:
        subcommand: Name shown in the header and footer lines.
        cfg: Resolved configuration passed to the header.
        quiet: When true, ``work`` runs with no header or footer.
        work: Callable performing the command; its result is coerced to ``int``.

    Returns:
        The integer result of ``work``.

    Raises:
        Whatever ``work`` raises; the exception is re-raised after the footer
        has been printed.
    """
    if quiet:
        return int(work())
    _pipeline_header(subcommand, cfg)
    t0 = time.perf_counter()
    # Exit code reported by the footer; stays 0 until work() returns or raises.
    code = 0
    try:
        code = int(work())
        return code
    except BaseException as exc:
        # Keep footer aligned with process outcome (main maps unhandled Exception -> exit 2).
        if isinstance(exc, SystemExit):
            # Integer codes pass through; None/False count as success, any other payload as 1.
            c = exc.code
            if isinstance(c, int):
                code = c
            elif c in (None, False):
                code = 0
            else:
                code = 1
        elif code == 0:
            # Any other exception is reported as exit 2 in the footer.
            code = 2
        raise
    finally:
        # Runs on both the return path and the raise path.
        _pipeline_footer(subcommand, t0, code)
151
+
152
+
153
+ def _jsonable(value: Any) -> Any:
154
+ if hasattr(value, "model_dump"):
155
+ return value.model_dump()
156
+ if isinstance(value, Path):
157
+ return str(value)
158
+ raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
159
+
160
+
161
+ def _to_payload(value: Any) -> Any:
162
+ if hasattr(value, "model_dump"):
163
+ return value.model_dump()
164
+ return value
165
+
166
+
167
def _emit(value: Any) -> None:
    """Print ``value`` on stdout as the command's payload.

    When stdout is a TTY the payload is pretty-printed with sorted dict keys.
    Otherwise it is written as a single line of JSON with sorted keys, using
    ``_jsonable`` for values the encoder cannot serialize.
    """
    payload = _to_payload(value)
    if not sys.stdout.isatty():
        print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None))
        return
    print(pprint.pformat(payload, sort_dicts=True))
173
+
174
+
175
def _emit_increment_kuzu_warning() -> None:
    """Write each entry of ``_INCREMENT_WARNING_LINES`` to stderr on its own line."""
    for warning_line in _INCREMENT_WARNING_LINES:
        print(warning_line, file=sys.stderr)
178
+
179
+
180
+ def _parse_source_root(ns: argparse.Namespace) -> Path | None:
181
+ if ns.source_root:
182
+ return Path(ns.source_root).expanduser().resolve()
183
+ return None
184
+
185
+
186
def _resolved_from_ns(ns: argparse.Namespace) -> ResolvedOperatorConfig:
    """Resolve the operator configuration from parsed CLI arguments.

    ``embedding_model`` and ``embedding_device`` are read with ``getattr`` and
    default to ``None`` when the namespace does not carry them.
    """
    cli_overrides = {
        "source_root": _parse_source_root(ns),
        "cli_index_dir": ns.index_dir,
        "cli_embedding_model": getattr(ns, "embedding_model", None),
        "cli_embedding_device": getattr(ns, "embedding_device", None),
    }
    return resolve_operator_config(**cli_overrides)
194
+
195
+
196
def _startup_hints(cfg: ResolvedOperatorConfig) -> None:
    """Emit the legacy environment hint, then the legacy YAML hint for ``cfg.source_root``."""
    emit_legacy_env_hints_if_present()
    source_root = cfg.source_root
    emit_legacy_yaml_hint_if_needed(source_root)
199
+
200
+
201
+ def _add_index_embedding_flags(p: argparse.ArgumentParser) -> None:
202
+ p.add_argument("--source-root", type=str, default=None, help="Java repository root (default: cwd)")
203
+ p.add_argument("--index-dir", type=str, default=None, help="Index directory (Lance + Kuzu + cocoindex state)")
204
+ p.add_argument("--embedding-model", type=str, default=None, help="Override SBERT_MODEL / YAML embedding.model")
205
+ p.add_argument("--embedding-device", type=str, default=None, help="Override SBERT_DEVICE / YAML embedding.device")
206
+
207
+
208
def _cmd_init(args: argparse.Namespace) -> int:
    """Handle ``init``: create a fresh index (vectors, then graph).

    Returns:
        2 when the index directory already holds artifacts (nothing is run),
        1 when the cocoindex update or the graph builder exits non-zero,
        0 when both phases succeed.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    occupied, paths = index_dir_has_existing_artifacts(cfg.index_dir)
    if occupied:
        refusal = {
            "success": False,
            "message": (
                "init refused: index paths already exist. "
                "Use `java-codebase-rag reprocess` to rebuild in place, "
                "or `java-codebase-rag erase --yes` then `init` for a clean slate."
            ),
            "non_empty_paths": paths,
        }
        _emit(refusal)
        return 2
    cfg.index_dir.mkdir(parents=True, exist_ok=True)

    def work() -> int:
        env = cfg.subprocess_env()
        quiet = bool(args.quiet)

        # Phase 1: vectors.
        coco = run_cocoindex_update(
            env,
            full_reprocess=False,
            quiet=quiet,
            lance_project_root=None if quiet else cfg.source_root,
        )
        if coco.returncode != 0:
            vector_failure = {
                "success": False,
                "exit_code": coco.returncode,
                "stdout": clip(coco.stdout, 8000),
                "stderr": clip(coco.stderr, 8000),
                "message": f"cocoindex exit {coco.returncode}",
            }
            _emit(vector_failure)
            return 1

        # Phase 2: graph.
        g = run_build_ast_graph(
            source_root=cfg.source_root,
            kuzu_path=cfg.kuzu_path,
            verbose=not quiet,
            env=env,
        )
        if g.returncode != 0:
            graph_failure = {
                "success": False,
                "exit_code": g.returncode,
                "stdout": clip(g.stdout, 4000),
                "stderr": clip(g.stderr, 4000),
                "message": f"graph builder exit {g.returncode}",
            }
            _emit(graph_failure)
            return 1

        _emit({"success": True, "message": "init completed"})
        return 0

    return _run_with_pipeline_progress("init", cfg, quiet=bool(args.quiet), work=work)
268
+
269
+
270
def _cmd_increment(args: argparse.Namespace) -> int:
    """Handle ``increment``: run the cocoindex update without a full reprocess.

    The Kuzu staleness warning is written to stderr before any work starts;
    the graph builder is not invoked.

    Returns:
        1 when the cocoindex update exits non-zero, otherwise 0.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()
    _emit_increment_kuzu_warning()

    def work() -> int:
        env = cfg.subprocess_env()
        quiet = bool(args.quiet)
        coco = run_cocoindex_update(
            env,
            full_reprocess=False,
            quiet=quiet,
            lance_project_root=None if quiet else cfg.source_root,
        )
        if coco.returncode == 0:
            _emit({"success": True, "message": "increment completed (Lance only; graph may be stale — see stderr)"})
            return 0
        failure = {
            "success": False,
            "exit_code": coco.returncode,
            "stdout": clip(coco.stdout, 8000),
            "stderr": clip(coco.stderr, 8000),
            "message": f"cocoindex exit {coco.returncode}",
        }
        _emit(failure)
        return 1

    return _run_with_pipeline_progress("increment", cfg, quiet=bool(args.quiet), work=work)
299
+
300
+
301
def _reprocess_vectors_only(env: Any, *, quiet: bool) -> int:
    """Run only the cocoindex full-reprocess phase, emit its payload, return the exit code."""
    coco = run_cocoindex_update(env, full_reprocess=True, quiet=quiet)
    if _is_cocoindex_preflight_blocker(coco):
        # cocoindex never started: report no phases so the exit code becomes 2.
        payload: dict[str, Any] = {
            "success": False,
            "exit_code": None,
            "stdout": clip(coco.stdout, 8000),
            "stderr": clip(coco.stderr, 8000),
            # `stderr` may be None when it was not captured; fall back to the generic text.
            "message": (coco.stderr or "").strip() or f"cocoindex setup exit {coco.returncode}",
            "graph_exit_code": None,
            "graph_stdout": "",
            "graph_stderr": "",
            "phases_run": [],
        }
        _emit_reprocess_outcome(payload)
        return _reprocess_exit_code(payload)

    ok = coco.returncode == 0
    payload = {
        "success": ok,
        "exit_code": coco.returncode,
        "stdout": clip(coco.stdout, 8000),
        "stderr": clip(coco.stderr, 8000),
        "message": None if ok else f"cocoindex exit {coco.returncode}",
        "graph_exit_code": None,
        "graph_stdout": "",
        "graph_stderr": "",
        "phases_run": ["vectors"],
    }
    if ok:
        # The graph was not touched; tell the operator it may now be stale.
        print(_REPROCESS_DRIFT_VECTORS_ONLY, file=sys.stderr)
    _emit_reprocess_outcome(payload, selective_tty_mode="vectors" if ok else None)
    return _reprocess_exit_code(payload)


def _reprocess_graph_only(cfg: ResolvedOperatorConfig, env: Any, *, quiet: bool) -> int:
    """Run only the graph builder, emit its payload, return the exit code."""
    g = run_build_ast_graph(
        source_root=cfg.source_root,
        kuzu_path=cfg.kuzu_path,
        verbose=not quiet,
        env=env,
    )
    if _is_graph_preflight_blocker(g):
        # The builder never started: report no phases so the exit code becomes 2.
        payload: dict[str, Any] = {
            "success": False,
            "exit_code": None,
            "stdout": "",
            "stderr": "",
            # `stderr` may be None when it was not captured; fall back to the generic text.
            "message": (g.stderr or "").strip() or f"graph builder setup exit {g.returncode}",
            "graph_exit_code": None,
            "graph_stdout": clip(g.stdout, 4000),
            "graph_stderr": clip(g.stderr, 4000),
            "phases_run": [],
        }
        _emit_reprocess_outcome(payload)
        return _reprocess_exit_code(payload)

    ok = g.returncode == 0
    payload = {
        "success": ok,
        "exit_code": None,
        "stdout": "",
        "stderr": "",
        "message": None if ok else f"graph builder exit {g.returncode}",
        "graph_exit_code": g.returncode,
        "graph_stdout": clip(g.stdout, 4000),
        "graph_stderr": clip(g.stderr, 4000),
        "phases_run": ["graph"],
    }
    if ok:
        # The vectors were not touched; tell the operator they may now be stale.
        print(_reprocess_drift_graph_only_line(cfg.index_dir), file=sys.stderr)
    _emit_reprocess_outcome(payload, selective_tty_mode="graph" if ok else None)
    return _reprocess_exit_code(payload)


def _reprocess_full(*, quiet: bool) -> int:
    """Run the full refresh pipeline exposed by ``server``, emit its payload, return the exit code."""
    import server  # lazy: pulls sentence_transformers/torch/lancedb/kuzu

    result = asyncio.run(server.run_refresh_pipeline(quiet=quiet))
    payload = result.model_dump()
    _emit_reprocess_outcome(payload)
    return _reprocess_exit_code(payload)


def _cmd_reprocess(args: argparse.Namespace) -> int:
    """Handle ``reprocess``: rebuild vectors, the graph, or both.

    ``--vectors-only`` and ``--graph-only`` each run a single phase; with
    neither flag the full refresh pipeline runs. When both attributes are set
    on ``args``, ``vectors_only`` takes precedence.

    Returns:
        The value of ``_reprocess_exit_code`` for the emitted payload
        (0 on success, 2 when no phase ran, 1 otherwise).
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    def work() -> int:
        env = cfg.subprocess_env()
        quiet = bool(args.quiet)
        if bool(getattr(args, "vectors_only", False)):
            return _reprocess_vectors_only(env, quiet=quiet)
        if bool(getattr(args, "graph_only", False)):
            return _reprocess_graph_only(cfg, env, quiet=quiet)
        return _reprocess_full(quiet=quiet)

    return _run_with_pipeline_progress("reprocess", cfg, quiet=bool(args.quiet), work=work)
390
+
391
+
392
def _remove_index_path(path: Path) -> None:
    """Best-effort removal of ``path``, warning on stderr if anything is left behind.

    Real directories are removed recursively; files and symlinks are unlinked.
    """
    if not (path.exists() or path.is_symlink()):
        return
    if path.is_dir() and not path.is_symlink():
        # Keep deleting past individual failures, then report leftovers below.
        shutil.rmtree(path, ignore_errors=True)
    else:
        try:
            path.unlink()
        except OSError as exc:
            print(f"warning: failed to remove {path}: {exc}", file=sys.stderr)
            return
    if path.exists():
        print(f"warning: {path} could not be fully removed", file=sys.stderr)


def _cmd_erase(args: argparse.Namespace) -> int:
    """Handle ``erase``: delete the Kuzu graph, the cocoindex state and the Lance tables.

    The paths to be deleted are listed on stderr first. Without ``--yes`` the
    command asks for confirmation on a TTY and refuses on non-interactive
    stdin. Deletion itself is best-effort: individual failures are reported
    as warnings on stderr and do not change the emitted payload.

    Returns:
        2 when the deletion is not confirmed, otherwise 0.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    to_describe: list[Path] = [cfg.kuzu_path, cfg.cocoindex_db]
    if cfg.index_dir.is_dir():
        try:
            import lancedb

            db = lancedb.connect(str(cfg.index_dir.resolve()))
            to_describe.extend(cfg.index_dir / name for name in db.table_names())
        except Exception:
            # Preview only: if the tables cannot be listed they are simply omitted here.
            pass
    rows = describe_path_sizes(to_describe)
    summary_lines = [f" {p}: {sz} bytes" for p, sz in rows] or [" (nothing to delete under resolved index dir)"]
    print("Will delete:", file=sys.stderr)
    print("\n".join(summary_lines), file=sys.stderr)

    if not args.yes:
        if not sys.stdin.isatty():
            print(
                "java-codebase-rag erase: non-interactive stdin; pass --yes to confirm.",
                file=sys.stderr,
            )
            return 2
        try:
            ans = input("Delete these paths? [y/N]: ").strip().lower()
        except EOFError:
            # End of input at the prompt is treated as "no".
            ans = ""
        if ans not in ("y", "yes"):
            print("Aborted.", file=sys.stderr)
            return 2

    def work() -> int:
        env = cfg.subprocess_env()
        drop = run_cocoindex_drop(env, quiet=bool(args.quiet))
        if drop.returncode == 127:
            print(
                "java-codebase-rag erase: cocoindex CLI not found next to this Python; "
                "skipped `cocoindex drop` — cocoindex.db (if any) was not removed by CocoIndex.",
                file=sys.stderr,
            )
        elif drop.returncode != 0:
            print(clip(drop.stderr, 4000), file=sys.stderr)

        # NOTE(review): depending on the installed Kuzu version the graph may be a
        # directory or a single file — confirm; the helper handles both.
        _remove_index_path(cfg.kuzu_path)
        _remove_index_path(cfg.cocoindex_db)

        if cfg.index_dir.is_dir():
            try:
                import lancedb

                db = lancedb.connect(str(cfg.index_dir.resolve()))
                for name in list(db.table_names()):
                    try:
                        db.drop_table(name)
                    except Exception as exc:
                        print(f"warning: failed to drop Lance table {name!r}: {exc}", file=sys.stderr)
            except Exception as exc:
                # Still best-effort, but say so instead of skipping silently.
                print(f"warning: Lance tables were not dropped: {exc}", file=sys.stderr)
        _emit({"success": True, "message": "erase completed"})
        return 0

    return _run_with_pipeline_progress("erase", cfg, quiet=bool(args.quiet), work=work)
456
+
457
+
458
def _cmd_meta(args: argparse.Namespace) -> int:
    """Handle ``meta``: emit graph metadata plus the resolved embedding/index settings.

    ``KuzuGraph``'s cached ``_instance`` / ``_instance_path`` are cleared
    before the metadata is read (presumably so the graph is reopened under
    the configuration applied above — confirm).

    Returns:
        0 when the emitted payload reports success, otherwise 2.
    """
    import server  # lazy

    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()
    from kuzu_queries import KuzuGraph  # lazy

    KuzuGraph._instance = None
    KuzuGraph._instance_path = None

    payload = server._graph_meta_output().model_dump()
    payload.update(
        {
            "embedding_model": cfg.embedding_model,
            "embedding_device": cfg.embedding_device,
            "embedding_model_source": cfg.embedding_model_source,
            "embedding_device_source": cfg.embedding_device_source,
            "index_dir": str(cfg.index_dir.resolve()),
            "kuzu_path": str(cfg.kuzu_path.resolve()),
            "index_dir_source": cfg.index_dir_source,
        }
    )
    _emit(payload)
    if payload.get("success"):
        return 0
    return 2
478
+
479
+
480
def _cmd_tables(args: argparse.Namespace) -> int:
    """Handle ``tables``: emit the table listing payload from ``server``; always returns 0."""
    import server  # lazy

    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    listing = server.list_code_index_tables_payload()
    _emit(listing.model_dump())
    return 0
489
+
490
+
491
def _cmd_diagnose_ignore(args: argparse.Namespace) -> int:
    """Handle ``diagnose-ignore``: emit the ignore diagnosis for ``args.path``.

    Relative paths are interpreted against the project root reported by
    ``server``; absolute paths are used as given.

    Returns:
        1 when resolving the path raises ``OSError``, otherwise 0.
    """
    import server  # lazy
    from path_filtering import LayeredIgnore  # lazy

    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    root = server._project_root()
    candidate = Path(args.path)
    try:
        if candidate.is_absolute():
            abs_path = candidate.resolve()
        else:
            abs_path = (root / candidate).resolve()
    except OSError as exc:
        _emit({"success": False, "message": f"Invalid path: {exc}"})
        return 1

    diagnosis = LayeredIgnore(root).diagnose_dict(abs_path)
    _emit(diagnosis)
    return 0
508
+
509
+
510
+ def _read_diff_text(args: argparse.Namespace) -> str:
511
+ if args.diff_file:
512
+ return Path(args.diff_file).read_text(encoding="utf-8")
513
+ if args.diff_stdin:
514
+ return sys.stdin.read()
515
+ raise ValueError("Provide exactly one of --diff-file or --diff-stdin")
516
+
517
+
518
def _cmd_unresolved_calls_list(args: argparse.Namespace) -> int:
    """Handle ``unresolved-calls list``: emit matching unresolved call sites.

    Returns:
        1 when the Kuzu graph does not exist, otherwise 0.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()
    from kuzu_queries import KuzuGraph  # lazy

    if not KuzuGraph.exists():
        _emit({"success": False, "message": "Kuzu graph not found"})
        return 1

    filters = {
        "method_id": args.method_id,
        "reason": args.reason,
        "microservice": args.microservice,
        "callee_simple": args.callee_simple,
        "limit": int(args.limit),
    }
    sites = KuzuGraph.get().list_unresolved_call_sites(**filters)
    _emit({"success": True, "count": len(sites), "sites": sites})
    return 0
537
+
538
+
539
def _cmd_unresolved_calls_stats(args: argparse.Namespace) -> int:
    """Handle ``unresolved-calls stats``: emit bucketed counts and their total.

    The total sums each bucket's ``"n"`` entry, counting a missing or falsy
    value as 0.

    Returns:
        1 when the Kuzu graph does not exist, otherwise 0.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()
    from kuzu_queries import KuzuGraph  # lazy

    if not KuzuGraph.exists():
        _emit({"success": False, "message": "Kuzu graph not found"})
        return 1

    grouping = args.by
    buckets = KuzuGraph.get().stats_unresolved_call_sites(by=grouping)
    total = 0
    for bucket in buckets:
        total += int(bucket.get("n") or 0)
    _emit({"success": True, "total": total, "by": args.by, "buckets": buckets})
    return 0
553
+
554
+
555
def _cmd_analyze_pr(args: argparse.Namespace) -> int:
    """Handle ``analyze-pr``: run the PR analysis pipeline on a unified diff.

    Returns:
        1 when the diff cannot be read, is blank, or the Kuzu graph does not
        exist; otherwise 0 after the report has been emitted.
    """
    cfg = _resolved_from_ns(args)
    _startup_hints(cfg)
    cfg.apply_to_os_environ()

    try:
        diff_text = _read_diff_text(args)
    except Exception as exc:
        _emit({"success": False, "message": str(exc)})
        return 1
    if diff_text.strip() == "":
        _emit({"success": False, "message": "Diff is empty"})
        return 1

    import pr_analysis  # lazy
    from kuzu_queries import KuzuGraph  # lazy

    if not KuzuGraph.exists():
        _emit({"success": False, "message": "Kuzu graph not found"})
        return 1

    report = pr_analysis.analyze_pr_pipeline(KuzuGraph.get(), diff_text)
    report_payload = pr_analysis.pr_report_to_dict(report)
    _emit(report_payload)
    return 0
577
+
578
+
579
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``java-codebase-rag`` argument parser.

    Each subcommand parser stores its handler function under the ``handler``
    default, which ``main`` looks up after parsing.

    The shared source/index/embedding flags are accepted both on
    ``unresolved-calls`` and on its nested ``list`` / ``stats`` commands. The
    nested copies default to ``argparse.SUPPRESS`` so that a flag given before
    the nested command name is not overwritten by the nested parser's default
    when argparse merges the nested namespace into the parent's.

    Returns:
        The fully configured top-level parser.
    """
    description = (
        "java-codebase-rag — graph-native code intelligence for Java microservices.\n\n"
        "Lifecycle commands stream subprocess progress to stderr (including relayed child stdout); "
        "--quiet suppresses that stream; stdout remains the machine-readable payload.\n\n"
        "Lifecycle (manage the index):\n"
        " init Create a fresh index from a Java repository.\n"
        " increment Pick up changes since the last index update (Lance only).\n"
        " reprocess Full vector + graph rebuild (default); optional --vectors-only / --graph-only.\n"
        " erase Delete the index from disk.\n\n"
        "Introspection (inspect the index):\n"
        " meta Print ontology version, edge counts, and table summary.\n"
        " tables List Lance tables and row counts.\n"
        " diagnose-ignore Show which ignore-pattern layer decided a path's fate.\n"
        " unresolved-calls List or aggregate receiver-failure call sites (not in CALLS).\n\n"
        "Analysis (work with code changes):\n"
        " analyze-pr Compute blast-radius + risk score for a unified diff.\n\n"
        "Run `java-codebase-rag <command> --help` for command-specific options."
    )
    parser = argparse.ArgumentParser(
        prog="java-codebase-rag",
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        exit_on_error=False,
    )
    subparsers = parser.add_subparsers(dest="subcommand")

    def add_quiet_flag(p: argparse.ArgumentParser) -> None:
        """Register the ``--quiet`` switch shared by the lifecycle commands."""
        p.add_argument(
            "--quiet",
            action="store_true",
            help="Suppress stderr progress relay; stdout payload unchanged.",
        )

    def add_inherited_index_embedding_flags(p: argparse.ArgumentParser) -> None:
        """Register the shared flags on a nested parser without clobbering the parent's values."""
        # argparse has no public accessor for registered actions, so the new ones
        # are picked out by position in the parser's action list.
        already_registered = len(p._actions)
        _add_index_embedding_flags(p)
        for action in p._actions[already_registered:]:
            action.default = argparse.SUPPRESS

    init = subparsers.add_parser(
        "init",
        help="Create a fresh index from a Java repository.",
        description=(
            "First-time index creation. Refuses if the resolved index directory "
            "already contains a Kuzu graph or Lance tables. Exit 2 on refusal."
        ),
    )
    _add_index_embedding_flags(init)
    add_quiet_flag(init)
    init.set_defaults(handler=_cmd_init)

    increment = subparsers.add_parser(
        "increment",
        help="Pick up changes since the last index update.",
        description="Runs cocoindex catch-up (no full reprocess). Does not rebuild Kuzu; see stderr warning.",
    )
    _add_index_embedding_flags(increment)
    add_quiet_flag(increment)
    increment.set_defaults(handler=_cmd_increment)

    reprocess = subparsers.add_parser(
        "reprocess",
        help="Rebuild vectors and/or Kuzu (default: both full phases).",
        description=(
            "Default: full Lance reprocess (cocoindex --full-reprocess) then full Kuzu graph rebuild. "
            "Use --vectors-only or --graph-only to run a single phase (mutually exclusive)."
        ),
    )
    _add_index_embedding_flags(reprocess)
    add_quiet_flag(reprocess)
    _rex = reprocess.add_mutually_exclusive_group()
    _rex.add_argument(
        "--vectors-only",
        action="store_true",
        help="Run only the Lance/cocoindex full reprocess phase (no graph builder).",
    )
    _rex.add_argument(
        "--graph-only",
        action="store_true",
        help="Run only build_ast_graph.py (no cocoindex / Lance reprocess).",
    )
    reprocess.set_defaults(handler=_cmd_reprocess)

    erase = subparsers.add_parser(
        "erase",
        help="Delete the index from disk.",
        description="Runs cocoindex drop, removes Kuzu, and drops Lance tables. Requires --yes or TTY confirmation.",
    )
    _add_index_embedding_flags(erase)
    erase.add_argument("--yes", action="store_true", help="Confirm destructive deletion (required in CI)")
    add_quiet_flag(erase)
    erase.set_defaults(handler=_cmd_erase)

    meta = subparsers.add_parser("meta", help="Print graph meta and embedding resolution.")
    _add_index_embedding_flags(meta)
    meta.set_defaults(handler=_cmd_meta)

    tables = subparsers.add_parser("tables", help="List Lance tables and row counts.")
    _add_index_embedding_flags(tables)
    tables.set_defaults(handler=_cmd_tables)

    diagnose = subparsers.add_parser(
        "diagnose-ignore",
        help="Show which ignore-pattern layer decided the fate of a path.",
    )
    _add_index_embedding_flags(diagnose)
    diagnose.add_argument("path", type=str)
    diagnose.set_defaults(handler=_cmd_diagnose_ignore)

    analyze = subparsers.add_parser("analyze-pr", help="Blast-radius + risk score for a unified diff.")
    _add_index_embedding_flags(analyze)
    group = analyze.add_mutually_exclusive_group(required=True)
    group.add_argument("--diff-file", type=str)
    group.add_argument("--diff-stdin", action="store_true")
    analyze.set_defaults(handler=_cmd_analyze_pr)

    unresolved = subparsers.add_parser(
        "unresolved-calls",
        help="List or aggregate UnresolvedCallSite rows (receiver-failure call sites).",
    )
    # The parent keeps the real `None` defaults, so the attributes always exist
    # on the final namespace even when no flag is given at either level.
    _add_index_embedding_flags(unresolved)
    unresolved_sub = unresolved.add_subparsers(dest="unresolved_command", required=True)

    uc_list = unresolved_sub.add_parser("list", help="List unresolved call sites.")
    add_inherited_index_embedding_flags(uc_list)
    uc_list.add_argument("--method-id", type=str, default=None, help="Caller Symbol id")
    uc_list.add_argument(
        "--reason",
        type=str,
        default=None,
        choices=sorted(VALID_UNRESOLVED_CALL_REASONS),
        help="Filter by UnresolvedCallSite.reason",
    )
    uc_list.add_argument("--microservice", type=str, default=None)
    uc_list.add_argument("--callee-simple", type=str, default=None, dest="callee_simple")
    uc_list.add_argument("--limit", type=int, default=100)
    uc_list.set_defaults(handler=_cmd_unresolved_calls_list)

    uc_stats = unresolved_sub.add_parser("stats", help="Aggregate unresolved call site counts.")
    add_inherited_index_embedding_flags(uc_stats)
    uc_stats.add_argument(
        "--by",
        type=str,
        choices=("reason", "microservice", "caller_role"),
        default="reason",
    )
    uc_stats.set_defaults(handler=_cmd_unresolved_calls_stats)

    return parser
732
+
733
+
734
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` and dispatch to the selected handler.

    A leading ``refresh`` argument is rewritten to ``reprocess`` after a
    deprecation notice on stderr.

    Args:
        argv: Argument list without the program name; ``sys.argv[1:]`` when ``None``.

    Returns:
        The handler's exit code. Argument errors, a missing subcommand and an
        unhandled handler exception all yield 2; a clean parser exit
        (for example ``--help``) yields 0.
    """
    raw = list(sys.argv[1:] if argv is None else argv)
    if raw[:1] == ["refresh"]:
        print(_REFRESH_DEPRECATION, file=sys.stderr)
        raw[0] = "reprocess"

    parser = build_parser()
    try:
        args = parser.parse_args(raw)
    except SystemExit as exit_request:
        status = exit_request.code
        if status in (0, None):
            return 0
        if isinstance(status, int):
            return int(status)
        return 2
    except argparse.ArgumentError as exc:
        print(f"java-codebase-rag: {exc}", file=sys.stderr)
        return 2

    handler = getattr(args, "handler", None)
    if handler is None:
        # No subcommand given: show the help and signal a usage error.
        parser.print_help()
        return 2

    try:
        return int(handler(args))
    except Exception as exc:  # pragma: no cover - defensive top-level guard
        _emit({"success": False, "exit_code": 2, "message": f"internal error: {exc}"})
        return 2


if __name__ == "__main__":
    raise SystemExit(main())