java-codebase-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
1
+ """CLI-owned stderr progress lines (shared by server reprocess path and pipeline helpers)."""
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import sys
6
+ from pathlib import Path
7
+
8
+
9
def emit_lance_cocoindex_start(project_root: Path) -> None:
    """Announce on stderr that a cocoindex update is starting.

    *project_root* is user-expanded and resolved before being shown, so the
    line always carries an absolute path. The line is flushed immediately.
    """
    root = project_root.expanduser().resolve()
    message = f"[lance] running cocoindex update (project_root={root})"
    print(message, file=sys.stderr, flush=True)
16
+
17
+
18
def emit_lance_cocoindex_finish(*, elapsed_s: float, exit_code: int) -> None:
    """Report on stderr how long the cocoindex update took and how it exited.

    *elapsed_s* is rendered with two decimals; the line is flushed immediately.
    """
    summary = f"[lance] cocoindex update finished in {elapsed_s:.2f}s (exit={exit_code})"
    print(summary, file=sys.stderr, flush=True)
24
+
25
+
26
async def accumulate_and_relay_subprocess_streams(
    proc: asyncio.subprocess.Process,
    *,
    relay: bool,
) -> tuple[bytes, bytes]:
    """Collect a subprocess's stdout and stderr concurrently until both reach EOF.

    When *relay* is true every chunk read from either stream is also written,
    unmodified, to this process's stderr and flushed right away.

    Returns the accumulated ``(stdout, stderr)`` bytes after the process has
    been awaited. Raises ``RuntimeError`` if either pipe is missing.
    """
    if proc.stdout is None or proc.stderr is None:
        raise RuntimeError("subprocess must be created with stdout=PIPE and stderr=PIPE")

    captured_out = bytearray()
    captured_err = bytearray()

    async def pump(stream: asyncio.StreamReader, sink: bytearray) -> None:
        # An empty read means the stream reached EOF.
        while data := await stream.read(65536):
            sink.extend(data)
            if relay:
                sys.stderr.buffer.write(data)
                sys.stderr.buffer.flush()

    # Drain both pipes together so neither can fill up and stall the child.
    await asyncio.gather(
        pump(proc.stdout, captured_out),
        pump(proc.stderr, captured_err),
    )
    await proc.wait()
    return bytes(captured_out), bytes(captured_err)
@@ -0,0 +1,327 @@
1
+ """Unified operator config: index paths, embedding knobs, YAML (PR-CLI-2).
2
+
3
+ Precedence for shared knobs: CLI > env > YAML > built-in default.
4
+ Legacy env names and legacy YAML filenames are never read for behaviour;
5
+ optional one-line stderr hints may fire when deprecated names are detected.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import re
11
+ import sys
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import Any, Literal
15
+
16
# Label recording which layer supplied a resolved setting.
SettingSource = Literal["cli", "env", "yaml", "default"]

# Config filenames probed (in this order) directly under the source root.
YAML_CONFIG_FILENAMES = (".java-codebase-rag.yml", ".java-codebase-rag.yaml")
# Old config filenames: only checked for existence so a rename hint can be printed.
LEGACY_YAML_FILENAMES = (".lancedb-mcp.yml", ".lancedb-mcp.yaml")

# Environment variable naming the index directory.
ENV_INDEX_DIR = "JAVA_CODEBASE_RAG_INDEX_DIR"
# Public operator contract is five names: INDEX_DIR, DEBUG_CONTEXT, RUN_HEAVY, SBERT_MODEL, SBERT_DEVICE.
# SOURCE_ROOT is still required for MCP / subprocess Java tree resolution (see mcp.json.example); it is not folded into the headline "5".
ENV_SOURCE_ROOT = "JAVA_CODEBASE_RAG_SOURCE_ROOT"
ENV_DEBUG_CONTEXT = "JAVA_CODEBASE_RAG_DEBUG_CONTEXT"
ENV_RUN_HEAVY = "JAVA_CODEBASE_RAG_RUN_HEAVY"

# Embedding model used when neither CLI, env, nor YAML provides one.
_DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Matches either $VAR or ${VAR} (POSIX shell variable syntax).
# Used after expansion to detect variables that were left unresolved.
_UNRESOLVED_VAR_RE = re.compile(r"\$(\w+|\{[^}]+\})")
32
+
33
+
34
def maybe_expand_embedding_model_path(value: str) -> str:
    """Expand ``~`` and ``$VAR`` in *value*, but only when it looks like a path.

    A value is treated as path-shaped when it starts with ``/``, ``./``,
    ``../`` or ``~``, or when it contains ``$`` anywhere. Anything else
    (for example a plain ``org/name`` hub id) is returned untouched.

    If a ``$VAR`` / ``${VAR}`` reference survives expansion, a one-line
    warning is printed to stderr and the partially expanded string is still
    returned.
    """
    path_prefixes = ("/", "./", "../", "~")
    looks_like_path = "$" in value or value.startswith(path_prefixes)
    if not looks_like_path:
        return value

    # User expansion runs first so "~" is resolved before variables are substituted.
    expanded = os.path.expandvars(os.path.expanduser(value))
    if _UNRESOLVED_VAR_RE.search(expanded) is not None:
        print(
            f"java-codebase-rag: path-shaped model string contains unresolved variable: {expanded}",
            file=sys.stderr,
        )
    return expanded
53
+
54
+
55
def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
    """Return the effective ``SBERT_MODEL`` for this process, path-expanded.

    The ``SBERT_MODEL`` environment variable wins when it is set and
    non-blank (surrounding whitespace is stripped); otherwise
    *import_time_default* is used. Either way the chosen string goes through
    :func:`maybe_expand_embedding_model_path`.
    """
    env_value = (os.environ.get("SBERT_MODEL") or "").strip()
    chosen = env_value if env_value else import_time_default
    return maybe_expand_embedding_model_path(chosen)
65
+
66
+
67
# Legacy env keys: never honored; detection-only hints name the replacement (if any).
# Each entry is (legacy variable name, replacement text shown in the hint).
_LEGACY_ENV_HINTS: tuple[tuple[str, str], ...] = (
    ("LANCEDB_URI", "JAVA_CODEBASE_RAG_INDEX_DIR"),
    ("KUZU_DB_PATH", "JAVA_CODEBASE_RAG_INDEX_DIR (Kuzu lives at <index_dir>/code_graph.kuzu)"),
    ("LANCEDB_MCP_PROJECT_ROOT", "cwd or --source-root (no env replacement)"),
    ("LANCEDB_MCP_ALLOW_REFRESH", "(removed; use init / increment / reprocess / erase)"),
    ("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.kuzu exists)"),
    ("LANCEDB_MCP_MICROSERVICE_ROOTS", "microservice_roots: in .java-codebase-rag.yml"),
    ("LANCEDB_MCP_DEBUG_CONTEXT", ENV_DEBUG_CONTEXT),
    ("LANCEDB_MCP_RUN_HEAVY", ENV_RUN_HEAVY),
    ("COCOINDEX_DB", "defaults to <JAVA_CODEBASE_RAG_INDEX_DIR>/cocoindex.db"),
)

# Process-wide memory of hints already printed, so each appears at most once.
# Keys are "env:<NAME>" for env hints; resolved root paths for YAML hints.
_legacy_hint_seen: set[str] = set()
_legacy_yaml_hint_roots: set[str] = set()
82
+
83
+
84
def emit_legacy_env_hints_if_present() -> None:
    """Print a one-line stderr hint for each deprecated env var that is set.

    Only the presence of the variable is checked; its value is never read.
    Each variable is reported at most once per process.
    """
    for old, replacement in _LEGACY_ENV_HINTS:
        seen_key = f"env:{old}"
        if old in os.environ and seen_key not in _legacy_hint_seen:
            _legacy_hint_seen.add(seen_key)
            print(
                f"java-codebase-rag: {old} is set but no longer read; use {replacement}.",
                file=sys.stderr,
            )
97
+
98
+
99
def emit_legacy_yaml_hint_if_needed(source_root: Path) -> None:
    """Hint (once per root) that a legacy YAML config exists but is ignored.

    Nothing is printed when a current-name config file is present, when no
    legacy file exists, or when this root has already been reported.
    """
    root_key = str(source_root.resolve())
    if root_key in _legacy_yaml_hint_roots:
        return

    # A current config file takes over; the legacy file is then irrelevant.
    for current in YAML_CONFIG_FILENAMES:
        if (source_root / current).is_file():
            return

    name = next(
        (legacy for legacy in LEGACY_YAML_FILENAMES if (source_root / legacy).is_file()),
        None,
    )
    if name is None:
        return

    _legacy_yaml_hint_roots.add(root_key)
    print(
        "java-codebase-rag: found legacy "
        f"{name}; rename to .java-codebase-rag.yml to re-enable config.",
        file=sys.stderr,
    )
116
+
117
+
118
def find_yaml_config_file(source_root: Path) -> Path | None:
    """Return the first existing config file under *source_root*, or ``None``.

    Candidates are tried in the order given by ``YAML_CONFIG_FILENAMES``.
    """
    candidates = (source_root / filename for filename in YAML_CONFIG_FILENAMES)
    return next((candidate for candidate in candidates if candidate.is_file()), None)
124
+
125
+
126
# (path, reason) notices already printed, so a broken config is reported once.
_yaml_load_warned: set[str] = set()


def _warn_yaml_ignored_once(path: Path, reason: str) -> None:
    """Print a one-line stderr notice that *path* is ignored, once per path and reason."""
    key = f"{path}:{reason}"
    if key in _yaml_load_warned:
        return
    _yaml_load_warned.add(key)
    print(f"java-codebase-rag: ignoring {path}: {reason}", file=sys.stderr)


def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
    """Load the operator YAML config under *source_root* as a mapping.

    Loading is best-effort: an empty dict is returned when no config file
    exists, when PyYAML cannot be imported, when the file cannot be read or
    parsed, or when its top-level value is not a mapping. Except for the
    "no file" and "empty file" cases, a one-line notice is printed to stderr
    so a broken config is not silently replaced by defaults.
    """
    path = find_yaml_config_file(source_root)
    if path is None:
        return {}
    try:
        import yaml
    except ImportError:
        _warn_yaml_ignored_once(path, "PyYAML is not installed")
        return {}
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except Exception as exc:
        # Deliberately broad: config loading must never take the caller down.
        _warn_yaml_ignored_once(path, f"could not be read or parsed ({type(exc).__name__})")
        return {}
    if data is None:
        # An empty document is valid YAML and simply carries no settings.
        return {}
    if not isinstance(data, dict):
        _warn_yaml_ignored_once(path, "top-level value is not a mapping")
        return {}
    return data
139
+
140
+
141
@dataclass(frozen=True)
class ResolvedOperatorConfig:
    """Immutable result of resolving operator settings (paths and embedding knobs).

    The ``*_source`` fields record which layer supplied the matching value.
    """

    # Root of the source tree the index is built from.
    source_root: Path
    # Directory holding the index artifacts.
    index_dir: Path
    # Location of the Kuzu graph database.
    kuzu_path: Path
    # Location of the cocoindex database file.
    cocoindex_db: Path
    # Embedding model string exported as SBERT_MODEL.
    embedding_model: str
    # Embedding device exported as SBERT_DEVICE; None means "not configured".
    embedding_device: str | None
    index_dir_source: SettingSource
    embedding_model_source: SettingSource
    embedding_device_source: SettingSource

    def apply_to_os_environ(self) -> None:
        """Make downstream modules (server, kuzu_queries, flows) see a consistent environment.

        When ``embedding_device`` is unset, ``SBERT_DEVICE`` is not removed from ``os.environ`` so
        a long-lived host process is not mutated for unrelated callers; subprocesses still use
        :meth:`subprocess_env`, which omits ``SBERT_DEVICE`` unless explicitly resolved.
        """
        os.environ[ENV_INDEX_DIR] = str(self.index_dir.resolve())
        os.environ[ENV_SOURCE_ROOT] = str(self.source_root.resolve())
        os.environ["SBERT_MODEL"] = self.embedding_model
        if self.embedding_device is not None:
            os.environ["SBERT_DEVICE"] = self.embedding_device

    def subprocess_env(self, base: dict[str, str] | None = None) -> dict[str, str]:
        """Return a new environment mapping for child processes.

        The mapping is a copy of *base* (or of ``os.environ`` when *base* is
        ``None``) with the index dir, source root and ``SBERT_MODEL`` set.
        ``SBERT_DEVICE`` is set when a device was resolved and removed
        otherwise. Neither *base* nor ``os.environ`` is modified.
        """
        # Test for None explicitly: an intentionally empty *base* must stay
        # empty rather than being swapped for the full process environment.
        out = dict(os.environ if base is None else base)
        out[ENV_INDEX_DIR] = str(self.index_dir.resolve())
        out[ENV_SOURCE_ROOT] = str(self.source_root.resolve())
        out["SBERT_MODEL"] = self.embedding_model
        if self.embedding_device is not None:
            out["SBERT_DEVICE"] = self.embedding_device
        else:
            out.pop("SBERT_DEVICE", None)
        return out
176
+
177
+
178
+ def _pick_str(
179
+ *,
180
+ cli_val: str | None,
181
+ env_key: str,
182
+ yaml_dict: dict[str, Any],
183
+ yaml_path: tuple[str, ...],
184
+ default: str,
185
+ ) -> tuple[str, SettingSource]:
186
+ if cli_val is not None and str(cli_val).strip() != "":
187
+ return str(cli_val).strip(), "cli"
188
+ env_raw = os.environ.get(env_key, "").strip()
189
+ if env_raw:
190
+ return env_raw, "env"
191
+ cur: Any = yaml_dict
192
+ for part in yaml_path:
193
+ if not isinstance(cur, dict) or part not in cur:
194
+ cur = None
195
+ break
196
+ cur = cur.get(part)
197
+ if isinstance(cur, str) and cur.strip():
198
+ return cur.strip(), "yaml"
199
+ return default, "default"
200
+
201
+
202
+ def _pick_optional_device(
203
+ *,
204
+ cli_val: str | None,
205
+ env_key: str,
206
+ yaml_dict: dict[str, Any],
207
+ ) -> tuple[str | None, SettingSource]:
208
+ if cli_val is not None and str(cli_val).strip() != "":
209
+ return str(cli_val).strip(), "cli"
210
+ env_raw = os.environ.get(env_key, "").strip()
211
+ if env_raw:
212
+ return env_raw, "env"
213
+ emb = yaml_dict.get("embedding")
214
+ if isinstance(emb, dict):
215
+ d = emb.get("device")
216
+ if isinstance(d, str) and d.strip():
217
+ return d.strip(), "yaml"
218
+ return None, "default"
219
+
220
+
221
+ def _resolve_index_dir_path(
222
+ *,
223
+ source_root: Path,
224
+ cli_index_dir: str | None,
225
+ yaml_dict: dict[str, Any],
226
+ ) -> tuple[Path, SettingSource]:
227
+ raw_cli = cli_index_dir.strip() if isinstance(cli_index_dir, str) else None
228
+ if raw_cli:
229
+ p = Path(raw_cli).expanduser()
230
+ out = p.resolve() if p.is_absolute() else (source_root / p).resolve()
231
+ return out, "cli"
232
+
233
+ env_raw = os.environ.get(ENV_INDEX_DIR, "").strip()
234
+ if env_raw:
235
+ p = Path(env_raw).expanduser()
236
+ out = p.resolve() if p.is_absolute() else (source_root / p).resolve()
237
+ return out, "env"
238
+
239
+ idx = yaml_dict.get("index_dir")
240
+ if isinstance(idx, str) and idx.strip():
241
+ p = Path(idx.strip()).expanduser()
242
+ out = p.resolve() if p.is_absolute() else (source_root / p).resolve()
243
+ return out, "yaml"
244
+
245
+ return (source_root / ".java-codebase-rag").resolve(), "default"
246
+
247
+
248
def resolve_operator_config(
    *,
    source_root: Path | None,
    cli_index_dir: str | None = None,
    cli_embedding_model: str | None = None,
    cli_embedding_device: str | None = None,
) -> ResolvedOperatorConfig:
    """Resolve every operator setting and bundle the result.

    *source_root* falls back to the current working directory. The YAML
    config is loaded from that root, then the index directory, embedding
    model and embedding device are each resolved by their own precedence
    rules. The model string is path-expanded after it has been chosen. The
    Kuzu graph and cocoindex database paths are derived from the index dir.
    """
    root = (source_root or Path.cwd()).expanduser().resolve()
    yaml_settings = load_yaml_mapping(root)

    index_dir, index_dir_source = _resolve_index_dir_path(
        source_root=root,
        cli_index_dir=cli_index_dir,
        yaml_dict=yaml_settings,
    )

    raw_model, model_source = _pick_str(
        cli_val=cli_embedding_model,
        env_key="SBERT_MODEL",
        yaml_dict=yaml_settings,
        yaml_path=("embedding", "model"),
        default=_DEFAULT_EMBEDDING_MODEL,
    )
    model = maybe_expand_embedding_model_path(raw_model)

    device, device_source = _pick_optional_device(
        cli_val=cli_embedding_device,
        env_key="SBERT_DEVICE",
        yaml_dict=yaml_settings,
    )

    return ResolvedOperatorConfig(
        source_root=root,
        index_dir=index_dir,
        kuzu_path=index_dir / "code_graph.kuzu",
        cocoindex_db=index_dir / "cocoindex.db",
        embedding_model=model,
        embedding_device=device,
        index_dir_source=index_dir_source,
        embedding_model_source=model_source,
        embedding_device_source=device_source,
    )
286
+
287
+
288
def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
    """Report whether *index_dir* already holds a Kuzu graph or any Lance table.

    Returns ``(found, descriptions)``: one entry for ``code_graph.kuzu`` when
    it exists, plus one entry per Lance table (suffixed ``" (Lance table)"``).
    Table discovery is best-effort; any failure while importing or querying
    lancedb is ignored and the entries gathered so far are kept.
    """
    found: list[str] = []

    graph_path = index_dir / "code_graph.kuzu"
    if graph_path.exists():
        found.append(str(graph_path.resolve()))

    if not index_dir.is_dir():
        return bool(found), found

    try:
        import lancedb

        connection = lancedb.connect(str(index_dir.resolve()))
        for table in connection.table_names():
            found.append(str((index_dir / table).resolve()) + " (Lance table)")
    except Exception:
        pass
    return bool(found), found
304
+
305
+
306
def describe_path_sizes(paths: list[Path]) -> list[tuple[Path, int]]:
    """Return ``(path, size_in_bytes)`` for every entry of *paths* that exists.

    A file reports its own size. A directory reports the sum of all files
    beneath it, skipping files whose size cannot be read. Anything else
    reports 0. Input order is preserved.
    """

    def measure(target: Path) -> int:
        if target.is_file():
            return target.stat().st_size
        if not target.is_dir():
            return 0
        size = 0
        for entry in target.rglob("*"):
            if not entry.is_file():
                continue
            try:
                size += entry.stat().st_size
            except OSError:
                # The file vanished or is unreadable; leave it out of the total.
                continue
        return size

    return [(p, measure(p)) for p in paths if p.exists()]
@@ -0,0 +1,189 @@
1
+ """Subprocess helpers for cocoindex + graph builder (no heavy ML imports at import time)."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ import threading
8
+ import time
9
+ from pathlib import Path
10
+
11
+ from java_codebase_rag.cli_progress import emit_lance_cocoindex_finish, emit_lance_cocoindex_start
12
+
13
# Target argument passed to `cocoindex update` / `cocoindex drop`: "<flow file>:<name>".
COCOINDEX_TARGET = "java_index_flow_lancedb.py:JavaCodeIndexLance"
14
+
15
+
16
def bundle_dir() -> Path:
    """Return the directory two levels above this module's resolved file path."""
    this_file = Path(__file__).resolve()
    package_dir = this_file.parent
    return package_dir.parent
18
+
19
+
20
def cocoindex_bin() -> Path:
    """Return the expected path of the ``cocoindex`` executable beside the running Python."""
    interpreter_dir = Path(sys.executable).parent
    return interpreter_dir / "cocoindex"
22
+
23
+
24
+ def _popen_stream_to_stderr(
25
+ proc: subprocess.Popen[bytes],
26
+ ) -> tuple[str, str, int]:
27
+ out_buf = bytearray()
28
+ err_buf = bytearray()
29
+
30
+ def drain_out() -> None:
31
+ assert proc.stdout is not None
32
+ while True:
33
+ chunk = proc.stdout.read(65536)
34
+ if not chunk:
35
+ break
36
+ out_buf.extend(chunk)
37
+ sys.stderr.buffer.write(chunk)
38
+ sys.stderr.buffer.flush()
39
+
40
+ def drain_err() -> None:
41
+ assert proc.stderr is not None
42
+ while True:
43
+ chunk = proc.stderr.read(65536)
44
+ if not chunk:
45
+ break
46
+ err_buf.extend(chunk)
47
+ sys.stderr.buffer.write(chunk)
48
+ sys.stderr.buffer.flush()
49
+
50
+ t_out = threading.Thread(target=drain_out, name="stream-stdout", daemon=True)
51
+ t_err = threading.Thread(target=drain_err, name="stream-stderr", daemon=True)
52
+ t_out.start()
53
+ t_err.start()
54
+ t_out.join()
55
+ t_err.join()
56
+ code = proc.wait()
57
+ return out_buf.decode(errors="replace"), err_buf.decode(errors="replace"), code
58
+
59
+
60
def run_cocoindex_update(
    env: dict[str, str],
    *,
    full_reprocess: bool,
    quiet: bool,
    lance_project_root: Path | None = None,
) -> subprocess.CompletedProcess[str]:
    """Run ``cocoindex update`` for the Lance flow and return its result.

    Missing prerequisites are reported as a synthetic result instead of an
    exception: exit code 127 when the ``cocoindex`` executable is absent and
    126 when the flow file is absent.

    With *quiet* the command runs with ``-q`` and its output is captured.
    Otherwise output is streamed to stderr while being captured, and start /
    finish progress lines are emitted when *lance_project_root* is given.
    """
    exe = cocoindex_bin()
    if not exe.is_file():
        return subprocess.CompletedProcess(
            args=[str(exe)],
            returncode=127,
            stdout="",
            stderr=f"cocoindex not found next to Python: {exe}",
        )

    bd = bundle_dir()
    if not (bd / "java_index_flow_lancedb.py").is_file():
        return subprocess.CompletedProcess(
            args=[],
            returncode=126,
            stdout="",
            stderr=f"java_index_flow_lancedb.py not found under {bd}",
        )

    # "-f" is always passed; "--full-reprocess" precedes it when requested.
    command: list[str] = [str(exe), "update", COCOINDEX_TARGET]
    if full_reprocess:
        command.append("--full-reprocess")
    command.append("-f")

    workdir = str(bd)
    if quiet:
        command.append("-q")
        return subprocess.run(
            command,
            cwd=workdir,
            env=env,
            capture_output=True,
            text=True,
        )

    announce = lance_project_root is not None
    if announce:
        emit_lance_cocoindex_start(lance_project_root)
    started = time.perf_counter()
    # Stays -1 if the child cannot be started or streaming fails.
    exit_code = -1
    captured_out = ""
    captured_err = ""
    try:
        child = subprocess.Popen(
            command,
            cwd=workdir,
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        captured_out, captured_err, exit_code = _popen_stream_to_stderr(child)
    finally:
        # Always close the progress bracket, even when an exception propagates.
        if announce:
            emit_lance_cocoindex_finish(
                elapsed_s=time.perf_counter() - started,
                exit_code=exit_code,
            )
    return subprocess.CompletedProcess(
        args=command,
        returncode=exit_code,
        stdout=captured_out,
        stderr=captured_err,
    )
119
+
120
+
121
def run_cocoindex_drop(env: dict[str, str], *, quiet: bool) -> subprocess.CompletedProcess[str]:
    """Run ``cocoindex drop -f`` for the Lance flow, capturing its output as text.

    When the ``cocoindex`` executable is missing, a synthetic result with
    exit code 127 is returned instead of raising. *quiet* appends ``-q``.
    """
    exe = cocoindex_bin()
    if not exe.is_file():
        return subprocess.CompletedProcess(
            args=[str(exe)],
            returncode=127,
            stdout="",
            stderr=f"cocoindex not found next to Python: {exe}",
        )

    workdir = bundle_dir()
    command = [str(exe), "drop", COCOINDEX_TARGET, "-f"]
    if quiet:
        command.append("-q")
    return subprocess.run(
        command,
        cwd=str(workdir),
        env=env,
        capture_output=True,
        text=True,
    )
141
+
142
+
143
def run_build_ast_graph(
    *,
    source_root: Path,
    kuzu_path: Path,
    verbose: bool,
    env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
    """Run ``build_ast_graph.py`` with the current interpreter and return its result.

    A missing builder script yields a synthetic result with exit code 126.
    The child runs with *source_root* as its working directory and with
    *env*, falling back to a copy of ``os.environ`` when *env* is empty or
    ``None``. With *verbose* the child gets ``--verbose`` and its output is
    streamed to stderr while captured; otherwise output is only captured.
    """
    builder = bundle_dir() / "build_ast_graph.py"
    if not builder.is_file():
        return subprocess.CompletedProcess(
            args=[],
            returncode=126,
            stdout="",
            stderr=f"build_ast_graph.py not found under {builder.parent}",
        )

    command: list[str] = [
        sys.executable,
        str(builder),
        "--source-root",
        str(source_root),
        "--kuzu-path",
        str(kuzu_path),
    ]
    workdir = str(source_root)
    child_env = env or os.environ.copy()

    if verbose:
        command.append("--verbose")
        child = subprocess.Popen(
            command,
            cwd=workdir,
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        captured_out, captured_err, exit_code = _popen_stream_to_stderr(child)
        return subprocess.CompletedProcess(
            args=command,
            returncode=exit_code,
            stdout=captured_out,
            stderr=captured_err,
        )

    return subprocess.run(
        command,
        cwd=workdir,
        env=child_env,
        capture_output=True,
        text=True,
    )
186
+
187
+
188
def clip(s: str, n: int) -> str:
    """Return at most the last *n* characters of *s*.

    Strings no longer than *n* come back unchanged. A non-positive *n*
    yields an empty string.
    """
    # Guard n <= 0 explicitly: s[-0:] is s[0:], which would return the whole
    # string, and a negative n would drop a prefix rather than keep a tail.
    if n <= 0:
        return ""
    return s[-n:]