java-codebase-rag 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +2813 -0
- brownfield_events.py +58 -0
- build_ast_graph.py +3081 -0
- chunk_heuristics.py +62 -0
- graph_enrich.py +1681 -0
- index_common.py +10 -0
- java_codebase_rag/__init__.py +1 -0
- java_codebase_rag/cli.py +761 -0
- java_codebase_rag/cli_progress.py +52 -0
- java_codebase_rag/config.py +327 -0
- java_codebase_rag/pipeline.py +189 -0
- java_codebase_rag-0.1.0.dist-info/METADATA +818 -0
- java_codebase_rag-0.1.0.dist-info/RECORD +27 -0
- java_codebase_rag-0.1.0.dist-info/WHEEL +5 -0
- java_codebase_rag-0.1.0.dist-info/entry_points.txt +3 -0
- java_codebase_rag-0.1.0.dist-info/licenses/LICENSE +21 -0
- java_codebase_rag-0.1.0.dist-info/top_level.txt +17 -0
- java_index_flow_lancedb.py +398 -0
- java_index_v1_common.py +33 -0
- java_ontology.py +446 -0
- kuzu_queries.py +1989 -0
- mcp_hints.py +748 -0
- mcp_v2.py +1957 -0
- path_filtering.py +472 -0
- pr_analysis.py +534 -0
- search_lancedb.py +1075 -0
- server.py +578 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""CLI-owned stderr progress lines (shared by server reprocess path and pipeline helpers)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def emit_lance_cocoindex_start(project_root: Path) -> None:
    """Announce on stderr that a cocoindex update is about to run.

    *project_root* is user-expanded and resolved before being embedded in the
    message; the line is flushed immediately so it appears ahead of any
    subprocess output.
    """
    resolved_root = project_root.expanduser().resolve()
    message = f"[lance] running cocoindex update (project_root={resolved_root})"
    print(message, file=sys.stderr, flush=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def emit_lance_cocoindex_finish(*, elapsed_s: float, exit_code: int) -> None:
    """Report on stderr how long the cocoindex update took and its exit code.

    The elapsed time is rendered with two decimal places; the line is flushed
    immediately.
    """
    summary = f"[lance] cocoindex update finished in {elapsed_s:.2f}s (exit={exit_code})"
    print(summary, file=sys.stderr, flush=True)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def accumulate_and_relay_subprocess_streams(
    proc: asyncio.subprocess.Process,
    *,
    relay: bool,
) -> tuple[bytes, bytes]:
    """Collect a subprocess's stdout and stderr until both hit EOF.

    Both pipes are drained concurrently. When *relay* is true, every chunk from
    either pipe is also written unchanged to this process's stderr and flushed.

    Returns ``(stdout_bytes, stderr_bytes)`` after the process has been waited on.

    Raises:
        RuntimeError: if *proc* has no stdout or stderr pipe.
    """
    out_reader, err_reader = proc.stdout, proc.stderr
    if out_reader is None or err_reader is None:
        raise RuntimeError("subprocess must be created with stdout=PIPE and stderr=PIPE")

    async def pump(reader: asyncio.StreamReader) -> bytes:
        collected = bytearray()
        # read() returns b"" at EOF, which ends the loop.
        while chunk := await reader.read(65536):
            collected += chunk
            if relay:
                sink = sys.stderr.buffer
                sink.write(chunk)
                sink.flush()
        return bytes(collected)

    captured_out, captured_err = await asyncio.gather(pump(out_reader), pump(err_reader))
    await proc.wait()
    return captured_out, captured_err
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""Unified operator config: index paths, embedding knobs, YAML (PR-CLI-2).
|
|
2
|
+
|
|
3
|
+
Precedence for shared knobs: CLI > env > YAML > built-in default.
|
|
4
|
+
Legacy env names and legacy YAML filenames are never read for behaviour;
|
|
5
|
+
optional one-line stderr hints may fire when deprecated names are detected.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import sys
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Any, Literal
|
|
15
|
+
|
|
16
|
+
# Origin of a resolved setting; the module docstring gives the precedence
# as CLI > env > YAML > built-in default.
SettingSource = Literal["cli", "env", "yaml", "default"]

# Config filenames looked up directly under the source root, in this order.
YAML_CONFIG_FILENAMES = (".java-codebase-rag.yml", ".java-codebase-rag.yaml")
# Legacy filenames: only detected (to print a rename hint), never loaded.
LEGACY_YAML_FILENAMES = (".lancedb-mcp.yml", ".lancedb-mcp.yaml")

ENV_INDEX_DIR = "JAVA_CODEBASE_RAG_INDEX_DIR"
# Public operator contract is five names: INDEX_DIR, DEBUG_CONTEXT, RUN_HEAVY, SBERT_MODEL, SBERT_DEVICE.
# SOURCE_ROOT is still required for MCP / subprocess Java tree resolution (see mcp.json.example); it is not folded into the headline "5".
ENV_SOURCE_ROOT = "JAVA_CODEBASE_RAG_SOURCE_ROOT"
ENV_DEBUG_CONTEXT = "JAVA_CODEBASE_RAG_DEBUG_CONTEXT"
ENV_RUN_HEAVY = "JAVA_CODEBASE_RAG_RUN_HEAVY"

# Built-in default used when no CLI, env or YAML value supplies an embedding model.
_DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
|
29
|
+
|
|
30
|
+
# Matches either $VAR or ${VAR} (POSIX shell variable syntax).
_UNRESOLVED_VAR_RE = re.compile(r"\$(\w+|\{[^}]+\})")


def maybe_expand_embedding_model_path(value: str) -> str:
    """Expand ``~`` and ``$VAR`` in *value*, but only when it looks like a path.

    A value is path-shaped when it starts with ``/``, ``./``, ``../`` or ``~``,
    or contains ``$`` anywhere. Anything else (for example a plain ``org/name``
    hub id) is returned untouched.

    If a ``$VAR`` / ``${VAR}`` reference is still present after expansion, a
    warning line is printed to stderr; the expanded string is returned either way.
    """
    looks_like_path = "$" in value or value.startswith(("/", "./", "../", "~"))
    if looks_like_path:
        # Home directory first, then environment variables.
        result = os.path.expandvars(os.path.expanduser(value))
        if _UNRESOLVED_VAR_RE.search(result) is not None:
            print(
                f"java-codebase-rag: path-shaped model string contains unresolved variable: {result}",
                file=sys.stderr,
            )
        return result
    return value
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def resolved_sbert_model_for_process_env(import_time_default: str) -> str:
    """Return the embedding model named by ``SBERT_MODEL``, or a fallback.

    When ``SBERT_MODEL`` is set and non-blank, its stripped value is used;
    otherwise *import_time_default* is used. Whichever value is picked is
    passed through :func:`maybe_expand_embedding_model_path`, so path-shaped
    strings are expanded the same way as in YAML/CLI resolution.
    """
    env_value = os.environ.get("SBERT_MODEL")
    if env_value is not None:
        stripped = str(env_value).strip()
        if stripped:
            return maybe_expand_embedding_model_path(stripped)
    return maybe_expand_embedding_model_path(import_time_default)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Legacy env keys: never honored; detection-only hints name the replacement (if any).
# Each entry is (legacy env var name, replacement text inserted into the hint).
_LEGACY_ENV_HINTS: tuple[tuple[str, str], ...] = (
    ("LANCEDB_URI", "JAVA_CODEBASE_RAG_INDEX_DIR"),
    ("KUZU_DB_PATH", "JAVA_CODEBASE_RAG_INDEX_DIR (Kuzu lives at <index_dir>/code_graph.kuzu)"),
    ("LANCEDB_MCP_PROJECT_ROOT", "cwd or --source-root (no env replacement)"),
    ("LANCEDB_MCP_ALLOW_REFRESH", "(removed; use init / increment / reprocess / erase)"),
    ("LANCEDB_MCP_GRAPH_ENABLED", "(removed; graph is used when code_graph.kuzu exists)"),
    ("LANCEDB_MCP_MICROSERVICE_ROOTS", "microservice_roots: in .java-codebase-rag.yml"),
    ("LANCEDB_MCP_DEBUG_CONTEXT", ENV_DEBUG_CONTEXT),
    ("LANCEDB_MCP_RUN_HEAVY", ENV_RUN_HEAVY),
    ("COCOINDEX_DB", "defaults to <JAVA_CODEBASE_RAG_INDEX_DIR>/cocoindex.db"),
)

# "env:<NAME>" keys whose hint has already been printed in this process.
_legacy_hint_seen: set[str] = set()
# Resolved source roots for which the legacy-YAML hint has already been printed.
_legacy_yaml_hint_roots: set[str] = set()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def emit_legacy_env_hints_if_present() -> None:
    """Print a one-line stderr hint for each deprecated env var that is set.

    Only the presence of the variable is checked; its value is never read.
    Each variable is reported at most once per process.
    """
    for legacy_name, successor in _LEGACY_ENV_HINTS:
        marker = f"env:{legacy_name}"
        if legacy_name in os.environ and marker not in _legacy_hint_seen:
            # Record before printing so the hint is never repeated.
            _legacy_hint_seen.add(marker)
            print(
                f"java-codebase-rag: {legacy_name} is set but no longer read; use {successor}.",
                file=sys.stderr,
            )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def emit_legacy_yaml_hint_if_needed(source_root: Path) -> None:
    """Hint on stderr when only a legacy YAML config exists under *source_root*.

    Nothing is printed when a current-name config file is present, when no
    legacy file is found, or when the hint was already printed for this
    (resolved) root. Only the first legacy filename found is reported.
    """
    root_key = str(source_root.resolve())
    if root_key in _legacy_yaml_hint_roots:
        return
    for current_name in YAML_CONFIG_FILENAMES:
        if (source_root / current_name).is_file():
            return
    legacy_name = next(
        (candidate for candidate in LEGACY_YAML_FILENAMES if (source_root / candidate).is_file()),
        None,
    )
    if legacy_name is None:
        return
    _legacy_yaml_hint_roots.add(root_key)
    print(
        "java-codebase-rag: found legacy "
        f"{legacy_name}; rename to .java-codebase-rag.yml to re-enable config.",
        file=sys.stderr,
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def find_yaml_config_file(source_root: Path) -> Path | None:
    """Return the first existing config file directly under *source_root*.

    Candidates are tried in ``YAML_CONFIG_FILENAMES`` order; ``None`` is
    returned when none of them is a regular file.
    """
    candidates = (source_root / filename for filename in YAML_CONFIG_FILENAMES)
    return next((candidate for candidate in candidates if candidate.is_file()), None)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
    """Load the operator YAML config under *source_root* as a dict.

    Returns ``{}`` when no config file exists, when PyYAML is not importable,
    when the file cannot be read or parsed, or when its top level is not a
    mapping. Loading stays best-effort (it never raises), but whenever a config
    file that is present ends up ignored, a one-line notice is printed to
    stderr so the operator can tell their settings are not in effect.
    """
    path = find_yaml_config_file(source_root)
    if path is None:
        return {}
    try:
        import yaml
    except ImportError:
        print(
            f"java-codebase-rag: {path} found but PyYAML is not installed; config ignored.",
            file=sys.stderr,
        )
        return {}
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8"))
    except Exception as exc:
        # Deliberately broad: a broken config must not crash the caller.
        # Collapse whitespace so multi-line YAML errors stay on one line.
        reason = " ".join(str(exc).split()) or type(exc).__name__
        print(
            f"java-codebase-rag: could not load {path}; config ignored ({reason}).",
            file=sys.stderr,
        )
        return {}
    if isinstance(data, dict):
        return data
    if data is not None:
        # An empty file parses to None and is silently treated as "no settings".
        print(
            f"java-codebase-rag: {path} top level is not a mapping; config ignored.",
            file=sys.stderr,
        )
    return {}
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass(frozen=True)
class ResolvedOperatorConfig:
    """Immutable snapshot of resolved operator settings.

    Holds the resolved paths and embedding knobs, plus the source
    (``cli`` / ``env`` / ``yaml`` / ``default``) each shared knob was taken from.
    """

    # Root of the source tree the config was resolved against.
    source_root: Path
    # Directory holding the index artifacts.
    index_dir: Path
    # Path of the Kuzu graph store.
    kuzu_path: Path
    # Path of the cocoindex database.
    cocoindex_db: Path
    # Embedding model string, exported as SBERT_MODEL.
    embedding_model: str
    # Embedding device, exported as SBERT_DEVICE; None when nothing supplied one.
    embedding_device: str | None
    index_dir_source: SettingSource
    embedding_model_source: SettingSource
    embedding_device_source: SettingSource

    def apply_to_os_environ(self) -> None:
        """Make downstream modules (server, kuzu_queries, flows) see a consistent environment.

        When ``embedding_device`` is unset, ``SBERT_DEVICE`` is not removed from ``os.environ`` so
        a long-lived host process is not mutated for unrelated callers; subprocesses still use
        :meth:`subprocess_env`, which omits ``SBERT_DEVICE`` unless explicitly resolved.
        """
        os.environ[ENV_INDEX_DIR] = str(self.index_dir.resolve())
        os.environ[ENV_SOURCE_ROOT] = str(self.source_root.resolve())
        os.environ["SBERT_MODEL"] = self.embedding_model
        if self.embedding_device is not None:
            os.environ["SBERT_DEVICE"] = self.embedding_device

    def subprocess_env(self, base: dict[str, str] | None = None) -> dict[str, str]:
        """Return a new environment mapping for child processes.

        Starts from a copy of *base*, or of ``os.environ`` when *base* is
        ``None``, then overlays the index dir, source root and ``SBERT_MODEL``.
        ``SBERT_DEVICE`` is set when a device was resolved and removed
        otherwise. Neither *base* nor ``os.environ`` is mutated.
        """
        # Test for None, not truthiness: an explicit empty base must stay
        # empty instead of silently inheriting the whole process environment.
        out = dict(os.environ if base is None else base)
        out[ENV_INDEX_DIR] = str(self.index_dir.resolve())
        out[ENV_SOURCE_ROOT] = str(self.source_root.resolve())
        out["SBERT_MODEL"] = self.embedding_model
        if self.embedding_device is not None:
            out["SBERT_DEVICE"] = self.embedding_device
        else:
            out.pop("SBERT_DEVICE", None)
        return out
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _pick_str(
    *,
    cli_val: str | None,
    env_key: str,
    yaml_dict: dict[str, Any],
    yaml_path: tuple[str, ...],
    default: str,
) -> tuple[str, SettingSource]:
    """Resolve one string setting by precedence: CLI, then env, then YAML, then default.

    Blank (empty or whitespace-only) candidates are skipped, and chosen values
    are stripped. *yaml_path* is a sequence of nested keys into *yaml_dict*;
    only a string found at the end of that path counts. Returns the value
    together with the name of the source it came from.
    """
    if cli_val is not None:
        from_cli = str(cli_val).strip()
        if from_cli:
            return from_cli, "cli"

    from_env = os.environ.get(env_key, "").strip()
    if from_env:
        return from_env, "env"

    # Walk the nested mapping; any missing key or non-dict level ends the search.
    node: Any = yaml_dict
    for key in yaml_path:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            node = None
            break
    if isinstance(node, str):
        from_yaml = node.strip()
        if from_yaml:
            return from_yaml, "yaml"

    return default, "default"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _pick_optional_device(
    *,
    cli_val: str | None,
    env_key: str,
    yaml_dict: dict[str, Any],
) -> tuple[str | None, SettingSource]:
    """Resolve the optional embedding device: CLI, then env, then YAML ``embedding.device``.

    Blank candidates are skipped and chosen values are stripped. When nothing
    supplies a device the result is ``(None, "default")``.
    """
    if cli_val is not None:
        explicit = str(cli_val).strip()
        if explicit:
            return explicit, "cli"

    from_env = os.environ.get(env_key, "").strip()
    if from_env:
        return from_env, "env"

    section = yaml_dict.get("embedding")
    device = section.get("device") if isinstance(section, dict) else None
    if isinstance(device, str) and device.strip():
        return device.strip(), "yaml"

    return None, "default"
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _resolve_index_dir_path(
    *,
    source_root: Path,
    cli_index_dir: str | None,
    yaml_dict: dict[str, Any],
) -> tuple[Path, SettingSource]:
    """Resolve the index directory: CLI, then env, then YAML ``index_dir``, then default.

    Each candidate is stripped and ``~``-expanded; a relative path is anchored
    at *source_root*. With no usable candidate the default is
    ``<source_root>/.java-codebase-rag``. The returned path is always resolved.
    """

    def anchored(raw: str) -> Path:
        # Absolute paths stand alone; relative ones hang off the source root.
        candidate = Path(raw).expanduser()
        if candidate.is_absolute():
            return candidate.resolve()
        return (source_root / candidate).resolve()

    cli_raw = cli_index_dir.strip() if isinstance(cli_index_dir, str) else None
    if cli_raw:
        return anchored(cli_raw), "cli"

    env_raw = os.environ.get(ENV_INDEX_DIR, "").strip()
    if env_raw:
        return anchored(env_raw), "env"

    yaml_raw = yaml_dict.get("index_dir")
    if isinstance(yaml_raw, str) and yaml_raw.strip():
        return anchored(yaml_raw.strip()), "yaml"

    return (source_root / ".java-codebase-rag").resolve(), "default"
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def resolve_operator_config(
    *,
    source_root: Path | None,
    cli_index_dir: str | None = None,
    cli_embedding_model: str | None = None,
    cli_embedding_device: str | None = None,
) -> ResolvedOperatorConfig:
    """Resolve index paths and embedding settings into one frozen config.

    The source root falls back to the current working directory, is
    ``~``-expanded and resolved, and is where the YAML config is loaded from.
    Index dir, embedding model and embedding device are each resolved by the
    CLI > env > YAML > default precedence; the model string is additionally
    passed through :func:`maybe_expand_embedding_model_path`. The Kuzu and
    cocoindex paths are fixed names inside the index directory.
    """
    anchor = source_root or Path.cwd()
    resolved_root = anchor.expanduser().resolve()
    settings = load_yaml_mapping(resolved_root)

    resolved_index_dir, index_origin = _resolve_index_dir_path(
        source_root=resolved_root,
        cli_index_dir=cli_index_dir,
        yaml_dict=settings,
    )

    raw_model, model_origin = _pick_str(
        cli_val=cli_embedding_model,
        env_key="SBERT_MODEL",
        yaml_dict=settings,
        yaml_path=("embedding", "model"),
        default=_DEFAULT_EMBEDDING_MODEL,
    )
    expanded_model = maybe_expand_embedding_model_path(raw_model)

    resolved_device, device_origin = _pick_optional_device(
        cli_val=cli_embedding_device,
        env_key="SBERT_DEVICE",
        yaml_dict=settings,
    )

    return ResolvedOperatorConfig(
        source_root=resolved_root,
        index_dir=resolved_index_dir,
        kuzu_path=resolved_index_dir / "code_graph.kuzu",
        cocoindex_db=resolved_index_dir / "cocoindex.db",
        embedding_model=expanded_model,
        embedding_device=resolved_device,
        index_dir_source=index_origin,
        embedding_model_source=model_origin,
        embedding_device_source=device_origin,
    )
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
    """Report index artifacts already present under *index_dir*.

    Checks for ``code_graph.kuzu`` and, when *index_dir* is a directory, asks
    LanceDB for its table names. The LanceDB probe is best-effort: any failure
    (including the package not being importable) is ignored.

    Returns ``(found_any, descriptions)``, where each description is a
    resolved path string; Lance tables carry a ``" (Lance table)"`` suffix.
    """
    found: list[str] = []

    graph_path = index_dir / "code_graph.kuzu"
    if graph_path.exists():
        found.append(str(graph_path.resolve()))

    if not index_dir.is_dir():
        return bool(found), found

    try:
        import lancedb

        connection = lancedb.connect(str(index_dir.resolve()))
        for table in connection.table_names():
            table_path = (index_dir / table).resolve()
            found.append(str(table_path) + " (Lance table)")
    except Exception:
        # Deliberate best-effort probe: entries collected so far are kept.
        pass
    return bool(found), found
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def describe_path_sizes(paths: list[Path]) -> list[tuple[Path, int]]:
    """Return ``(path, size_in_bytes)`` for every entry of *paths* that exists.

    A file reports its own size. A directory reports the sum of the sizes of
    all files found recursively beneath it, skipping files whose ``stat``
    fails. Anything else that exists reports 0. Input order is preserved.
    """

    def measure(target: Path) -> int:
        if target.is_file():
            return target.stat().st_size
        if not target.is_dir():
            return 0
        size = 0
        for entry in target.rglob("*"):
            if not entry.is_file():
                continue
            try:
                size += entry.stat().st_size
            except OSError:
                # The file vanished or is unreadable; leave it out of the total.
                pass
        return size

    return [(item, measure(item)) for item in paths if item.exists()]
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Subprocess helpers for cocoindex + graph builder (no heavy ML imports at import time)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from java_codebase_rag.cli_progress import emit_lance_cocoindex_finish, emit_lance_cocoindex_start
|
|
12
|
+
|
|
13
|
+
# Target argument passed to `cocoindex update` / `cocoindex drop`.
# NOTE(review): presumably "<flow file>:<flow name>" relative to the bundle dir — confirm against the cocoindex CLI.
COCOINDEX_TARGET = "java_index_flow_lancedb.py:JavaCodeIndexLance"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def bundle_dir() -> Path:
    """Return the directory that contains this module's package directory.

    The helper scripts launched by this module (``java_index_flow_lancedb.py``,
    ``build_ast_graph.py``) are looked up there.
    """
    package_dir = Path(__file__).resolve().parent
    return package_dir.parent
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cocoindex_bin() -> Path:
    """Return the expected path of the ``cocoindex`` launcher next to the running Python.

    The path is not checked for existence; callers do that themselves.
    """
    # NOTE(review): on Windows the console script is normally "cocoindex.exe" — confirm whether that platform matters.
    interpreter_dir = Path(sys.executable).parent
    return interpreter_dir / "cocoindex"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _popen_stream_to_stderr(
    proc: subprocess.Popen[bytes],
) -> tuple[str, str, int]:
    """Relay a child's stdout and stderr to our stderr while capturing both.

    Each pipe is drained on its own thread so neither can fill up and stall the
    child. Every chunk is appended to a buffer and written unchanged to
    ``sys.stderr.buffer``. After both pipes reach EOF they are closed and the
    process is waited on.

    Returns ``(stdout_text, stderr_text, exit_code)``; the text is decoded with
    undecodable bytes replaced.

    Raises:
        RuntimeError: if *proc* was not created with both stdout and stderr piped.
    """
    out_pipe, err_pipe = proc.stdout, proc.stderr
    # Explicit check instead of assert: asserts are stripped under ``python -O``.
    if out_pipe is None or err_pipe is None:
        raise RuntimeError("subprocess must be created with stdout=PIPE and stderr=PIPE")

    out_buf = bytearray()
    err_buf = bytearray()

    def drain(pipe, sink: bytearray) -> None:
        # Read until EOF (an empty chunk), mirroring each chunk to our stderr.
        while True:
            chunk = pipe.read(65536)
            if not chunk:
                break
            sink.extend(chunk)
            sys.stderr.buffer.write(chunk)
            sys.stderr.buffer.flush()

    workers = [
        threading.Thread(target=drain, args=(out_pipe, out_buf), name="stream-stdout", daemon=True),
        threading.Thread(target=drain, args=(err_pipe, err_buf), name="stream-stderr", daemon=True),
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    # Both readers are finished, so the pipes can be released now rather than
    # whenever the Popen object happens to be garbage-collected.
    out_pipe.close()
    err_pipe.close()

    code = proc.wait()
    return out_buf.decode(errors="replace"), err_buf.decode(errors="replace"), code
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def run_cocoindex_update(
    env: dict[str, str],
    *,
    full_reprocess: bool,
    quiet: bool,
    lance_project_root: Path | None = None,
) -> subprocess.CompletedProcess[str]:
    """Run ``cocoindex update`` for the Lance flow from the bundle directory.

    Missing prerequisites are reported as a synthetic result rather than an
    exception: return code 127 when the ``cocoindex`` launcher is absent, 126
    when ``java_index_flow_lancedb.py`` is absent.

    With *quiet* the command gets ``-q`` and its output is only captured.
    Otherwise output is streamed to stderr while being captured, and, when
    *lance_project_root* is given, start/finish progress lines are emitted
    around the run (the finish line is emitted even if the run raises).
    """
    binary = cocoindex_bin()
    if not binary.is_file():
        return subprocess.CompletedProcess(
            args=[str(binary)],
            returncode=127,
            stdout="",
            stderr=f"cocoindex not found next to Python: {binary}",
        )

    workdir = bundle_dir()
    if not (workdir / "java_index_flow_lancedb.py").is_file():
        return subprocess.CompletedProcess(
            args=[],
            returncode=126,
            stdout="",
            stderr=f"java_index_flow_lancedb.py not found under {workdir}",
        )

    argv: list[str] = [str(binary), "update", COCOINDEX_TARGET]
    if full_reprocess:
        argv.append("--full-reprocess")
    argv.append("-f")

    # NOTE(review): the capture-only return is assumed to sit inside `if quiet:`;
    # confirm against the original indentation.
    if quiet:
        argv.append("-q")
        return subprocess.run(
            argv,
            cwd=str(workdir),
            env=env,
            capture_output=True,
            text=True,
        )

    announce = lance_project_root is not None
    if announce:
        emit_lance_cocoindex_start(lance_project_root)
    started = time.perf_counter()
    # -1 is what the finish line reports if the child never starts or streaming fails.
    exit_code = -1
    captured_out, captured_err = "", ""
    try:
        child = subprocess.Popen(
            argv,
            cwd=str(workdir),
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        captured_out, captured_err, exit_code = _popen_stream_to_stderr(child)
    finally:
        if announce:
            emit_lance_cocoindex_finish(
                elapsed_s=time.perf_counter() - started,
                exit_code=exit_code,
            )
    return subprocess.CompletedProcess(
        args=argv,
        returncode=exit_code,
        stdout=captured_out,
        stderr=captured_err,
    )
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def run_cocoindex_drop(env: dict[str, str], *, quiet: bool) -> subprocess.CompletedProcess[str]:
    """Run ``cocoindex drop -f`` for the Lance flow and capture its output.

    Returns a synthetic result with return code 127 when the ``cocoindex``
    launcher is absent. ``-q`` is appended when *quiet* is true. The command
    runs with the bundle directory as its working directory.
    """
    binary = cocoindex_bin()
    if binary.is_file():
        workdir = bundle_dir()
        argv = [str(binary), "drop", COCOINDEX_TARGET, "-f"]
        if quiet:
            argv += ["-q"]
        return subprocess.run(
            argv,
            cwd=str(workdir),
            env=env,
            capture_output=True,
            text=True,
        )
    return subprocess.CompletedProcess(
        args=[str(binary)],
        returncode=127,
        stdout="",
        stderr=f"cocoindex not found next to Python: {binary}",
    )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def run_build_ast_graph(
    *,
    source_root: Path,
    kuzu_path: Path,
    verbose: bool,
    env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
    """Run ``build_ast_graph.py`` with the current interpreter.

    The script is looked up in the bundle directory; when it is missing a
    synthetic result with return code 126 is returned. The child runs with
    *source_root* as its working directory.

    With *verbose* the script gets ``--verbose`` and its output is streamed to
    stderr while being captured; otherwise output is only captured.

    *env* is the child's environment; ``None`` means a copy of ``os.environ``.
    """
    builder = bundle_dir() / "build_ast_graph.py"
    if not builder.is_file():
        return subprocess.CompletedProcess(
            args=[],
            returncode=126,
            stdout="",
            stderr=f"build_ast_graph.py not found under {builder.parent}",
        )

    cmd: list[str] = [
        sys.executable,
        str(builder),
        "--source-root",
        str(source_root),
        "--kuzu-path",
        str(kuzu_path),
    ]
    # Test for None, not truthiness: an explicit empty env must be honoured
    # rather than replaced by the full process environment.
    child_env = os.environ.copy() if env is None else env

    if not verbose:
        return subprocess.run(
            cmd,
            cwd=str(source_root),
            env=child_env,
            capture_output=True,
            text=True,
        )

    cmd.append("--verbose")
    proc = subprocess.Popen(
        cmd,
        cwd=str(source_root),
        env=child_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        bufsize=0,
    )
    out_s, err_s, code = _popen_stream_to_stderr(proc)
    return subprocess.CompletedProcess(args=cmd, returncode=code, stdout=out_s, stderr=err_s)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def clip(s: str, n: int) -> str:
    """Return at most the last *n* characters of *s*.

    Strings no longer than *n* are returned unchanged. A zero or negative *n*
    yields the empty string.
    """
    if n <= 0:
        # s[-0:] is the whole string and a negative n would slice off a
        # prefix, so the "keep nothing" case has to be handled explicitly.
        return ""
    return s[-n:]
|