java-codebase-rag 0.5.2__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +1 -1
- build_ast_graph.py +142 -90
- graph_enrich.py +3 -3
- java_codebase_rag/_fdlimit.py +48 -0
- java_codebase_rag/cli.py +31 -28
- java_codebase_rag/config.py +28 -8
- java_codebase_rag/installer.py +99 -10
- java_codebase_rag/lance_optimize.py +148 -0
- java_codebase_rag/pipeline.py +64 -6
- {java_codebase_rag-0.5.2.dist-info → java_codebase_rag-0.6.0.dist-info}/METADATA +5 -5
- java_codebase_rag-0.6.0.dist-info/RECORD +33 -0
- {java_codebase_rag-0.5.2.dist-info → java_codebase_rag-0.6.0.dist-info}/top_level.txt +1 -1
- java_index_flow_lancedb.py +22 -4
- java_ontology.py +1 -1
- kuzu_queries.py → ladybug_queries.py +62 -56
- mcp_v2.py +16 -16
- pr_analysis.py +1 -1
- search_lancedb.py +8 -8
- server.py +47 -14
- java_codebase_rag-0.5.2.dist-info/RECORD +0 -31
- {java_codebase_rag-0.5.2.dist-info → java_codebase_rag-0.6.0.dist-info}/WHEEL +0 -0
- {java_codebase_rag-0.5.2.dist-info → java_codebase_rag-0.6.0.dist-info}/entry_points.txt +0 -0
- {java_codebase_rag-0.5.2.dist-info → java_codebase_rag-0.6.0.dist-info}/licenses/LICENSE +0 -0
java_codebase_rag/pipeline.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Subprocess helpers for cocoindex + graph builder (no heavy ML imports at import time)."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import asyncio
|
|
4
5
|
import os
|
|
5
6
|
import shutil
|
|
6
7
|
import subprocess
|
|
@@ -11,6 +12,7 @@ from pathlib import Path
|
|
|
11
12
|
|
|
12
13
|
from java_codebase_rag.cli_format import Spinner, is_noise_line, stderr_is_tty
|
|
13
14
|
from java_codebase_rag.cli_progress import emit_vectors_finish, emit_vectors_start
|
|
15
|
+
from java_codebase_rag.config import cocoindex_subprocess_env_defaults
|
|
14
16
|
|
|
15
17
|
COCOINDEX_TARGET = "java_index_flow_lancedb.py:JavaCodeIndexLance"
|
|
16
18
|
|
|
@@ -110,6 +112,57 @@ def run_cocoindex_update(
|
|
|
110
112
|
quiet: bool,
|
|
111
113
|
verbose: bool = True,
|
|
112
114
|
lance_project_root: Path | None = None,
|
|
115
|
+
) -> subprocess.CompletedProcess[str]:
|
|
116
|
+
result = _run_cocoindex_update_impl(
|
|
117
|
+
env,
|
|
118
|
+
full_reprocess=full_reprocess,
|
|
119
|
+
quiet=quiet,
|
|
120
|
+
verbose=verbose,
|
|
121
|
+
lance_project_root=lance_project_root,
|
|
122
|
+
)
|
|
123
|
+
# After cocoindex returns exit 0 there are no concurrent writers, so this
|
|
124
|
+
# is the safe window to compact the Lance tables. The flow disabled its
|
|
125
|
+
# in-flight background optimize (see java_index_flow_lancedb.py), making
|
|
126
|
+
# this serialized pass the sole optimizer. Optimize failure does not flip
|
|
127
|
+
# the cocoindex CompletedProcess (a successful index is still usable, just
|
|
128
|
+
# not compacted); the outcome is logged to stderr only.
|
|
129
|
+
if result.returncode == 0:
|
|
130
|
+
_maybe_run_serialized_optimize(env, quiet=quiet)
|
|
131
|
+
return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _maybe_run_serialized_optimize(env: dict[str, str], *, quiet: bool) -> None:
    """Run the serialized Lance optimize for the index dir named in *env*.

    The index directory is read from ``JAVA_CODEBASE_RAG_INDEX_DIR`` in *env*.
    The whole step is best-effort: a missing variable or any failure inside
    the optimize is reported on stderr and never raised, because a
    successfully built index is still searchable un-compacted.

    Args:
        env: Environment mapping that was used for the cocoindex subprocess.
        quiet: Forwarded unchanged to ``optimize_lance_tables``.
    """
    idx_raw = env.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
    if not idx_raw:
        print(
            "java-codebase-rag: optimize skipped — JAVA_CODEBASE_RAG_INDEX_DIR "
            "not set in subprocess env",
            file=sys.stderr,
        )
        return
    try:
        # Imported at call time (as in the original) so an import problem in
        # the optimize module is reported here instead of breaking the CLI.
        from java_codebase_rag.lance_optimize import optimize_lance_tables

        coro = optimize_lance_tables(Path(idx_raw), quiet=quiet)
        try:
            asyncio.run(coro)
        finally:
            # asyncio.run() raises before awaiting when an event loop is
            # already running in this thread; closing the coroutine avoids a
            # "coroutine was never awaited" RuntimeWarning. Closing an
            # already-finished coroutine is a no-op.
            coro.close()
    except Exception as exc:
        # Never crash the CLI on an optimize failure — surface on stderr only.
        # The exception type is included because str(exc) can be empty.
        print(
            f"java-codebase-rag: optimize failed: {type(exc).__name__}: {exc}",
            file=sys.stderr,
        )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _run_cocoindex_update_impl(
|
|
160
|
+
env: dict[str, str],
|
|
161
|
+
*,
|
|
162
|
+
full_reprocess: bool,
|
|
163
|
+
quiet: bool,
|
|
164
|
+
verbose: bool = True,
|
|
165
|
+
lance_project_root: Path | None = None,
|
|
113
166
|
) -> subprocess.CompletedProcess[str]:
|
|
114
167
|
exe = cocoindex_bin()
|
|
115
168
|
if not exe.is_file():
|
|
@@ -128,6 +181,11 @@ def run_cocoindex_update(
|
|
|
128
181
|
stdout="",
|
|
129
182
|
stderr=f"java_index_flow_lancedb.py not found under {bd}",
|
|
130
183
|
)
|
|
184
|
+
# Cap CocoIndex concurrency to avoid EMFILE ("too many open files") under
|
|
185
|
+
# default OS fd limits. See: https://github.com/HumanBean17/java-codebase-rag/issues/306
|
|
186
|
+
env = env.copy()
|
|
187
|
+
for _k, _v in cocoindex_subprocess_env_defaults().items():
|
|
188
|
+
env.setdefault(_k, _v)
|
|
131
189
|
cmd: list[str] = [str(exe), "update", COCOINDEX_TARGET]
|
|
132
190
|
if full_reprocess:
|
|
133
191
|
cmd.extend(["--full-reprocess", "-f"])
|
|
@@ -197,7 +255,7 @@ def run_cocoindex_drop(env: dict[str, str], *, quiet: bool) -> subprocess.Comple
|
|
|
197
255
|
def run_build_ast_graph(
|
|
198
256
|
*,
|
|
199
257
|
source_root: Path,
|
|
200
|
-
|
|
258
|
+
ladybug_path: Path,
|
|
201
259
|
verbose: bool,
|
|
202
260
|
quiet: bool = False,
|
|
203
261
|
env: dict[str, str] | None = None,
|
|
@@ -215,8 +273,8 @@ def run_build_ast_graph(
|
|
|
215
273
|
str(builder),
|
|
216
274
|
"--source-root",
|
|
217
275
|
str(source_root),
|
|
218
|
-
"--
|
|
219
|
-
str(
|
|
276
|
+
"--ladybug-path",
|
|
277
|
+
str(ladybug_path),
|
|
220
278
|
]
|
|
221
279
|
# Three-tier: --quiet (silent) / default (filtered progress) / --verbose (raw).
|
|
222
280
|
# Default passes --verbose so the builder emits per-pass progress lines,
|
|
@@ -250,7 +308,7 @@ def run_build_ast_graph(
|
|
|
250
308
|
def run_incremental_graph(
|
|
251
309
|
*,
|
|
252
310
|
source_root: Path,
|
|
253
|
-
|
|
311
|
+
ladybug_path: Path,
|
|
254
312
|
verbose: bool,
|
|
255
313
|
quiet: bool = False,
|
|
256
314
|
env: dict[str, str] | None = None,
|
|
@@ -269,8 +327,8 @@ def run_incremental_graph(
|
|
|
269
327
|
str(builder),
|
|
270
328
|
"--source-root",
|
|
271
329
|
str(source_root),
|
|
272
|
-
"--
|
|
273
|
-
str(
|
|
330
|
+
"--ladybug-path",
|
|
331
|
+
str(ladybug_path),
|
|
274
332
|
"--incremental",
|
|
275
333
|
]
|
|
276
334
|
# Three-tier: --quiet (silent) / default (filtered progress) / --verbose (raw).
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: java-codebase-rag
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: MCP server for semantic + structural search over Java codebases
|
|
5
5
|
Author: HumanBean17
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/HumanBean17/java-codebase-rag
|
|
8
8
|
Project-URL: Repository, https://github.com/HumanBean17/java-codebase-rag
|
|
9
9
|
Project-URL: Issues, https://github.com/HumanBean17/java-codebase-rag/issues
|
|
10
|
-
Keywords: mcp,java,rag,code-search,graph,lancedb,
|
|
10
|
+
Keywords: mcp,java,rag,code-search,graph,lancedb,ladybug
|
|
11
11
|
Classifier: Development Status :: 3 - Alpha
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -19,7 +19,7 @@ Requires-Python: >=3.11
|
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
21
|
Requires-Dist: cocoindex[lancedb]<2,>=1.0.0a43
|
|
22
|
-
Requires-Dist:
|
|
22
|
+
Requires-Dist: ladybug<0.18,>=0.17.1
|
|
23
23
|
Requires-Dist: lancedb<0.31,>=0.25.3
|
|
24
24
|
Requires-Dist: mcp<2,>=1.27.0
|
|
25
25
|
Requires-Dist: numpy<2.5,>=1.26.4
|
|
@@ -103,7 +103,7 @@ java-codebase-rag install
|
|
|
103
103
|
java-codebase-rag install --non-interactive --agent claude-code
|
|
104
104
|
```
|
|
105
105
|
|
|
106
|
-
After `pip install --upgrade java-codebase-rag`, run `java-codebase-rag update` to refresh shipped artifacts.
|
|
106
|
+
After `pip install --upgrade java-codebase-rag`, run `java-codebase-rag update` to refresh shipped artifacts and catch up the index (Lance + graph).
|
|
107
107
|
|
|
108
108
|
### Manual registration
|
|
109
109
|
|
|
@@ -235,7 +235,7 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi
|
|
|
235
235
|
| Group | Subcommand | What it does |
|
|
236
236
|
|---|---|---|
|
|
237
237
|
| Setup | `install` | Interactive setup wizard: config, MCP registration, skill/agent deployment, indexing. |
|
|
238
|
-
| Setup | `update` | Refresh shipped artifacts (skill, agent, MCP entry) after pip upgrade. |
|
|
238
|
+
| Setup | `update` | Refresh shipped artifacts (skill, agent, MCP entry) + incremental Lance/graph catch-up after pip upgrade. |
|
|
239
239
|
| Lifecycle | `init` | First-time index. Refuses if artifacts already exist. |
|
|
240
240
|
| Lifecycle | `increment` | CocoIndex catch-up + incremental Ladybug update. `--vectors-only` for Lance only. |
|
|
241
241
|
| Lifecycle | `reprocess` | Full Lance + Ladybug rebuild. `--vectors-only` / `--graph-only` for a single phase. |
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
ast_java.py,sha256=TMesuv4SYqzkwfKxf_Pps0KaPLZNZOrhU8mL20bwqeQ,98882
|
|
2
|
+
brownfield_events.py,sha256=yxXkKDgMb3VPtaiakGzncHM_EGnda8xIue6w90yYp8s,2055
|
|
3
|
+
build_ast_graph.py,sha256=GNbjiIAwsXaJQ9Je6gbR-dB9SbnaLThya2pEw3tggQs,152396
|
|
4
|
+
chunk_heuristics.py,sha256=aQk2NOKxzUdqoUAJUO3G3LE0MN_bYZWNLQ0tkmj5uts,1813
|
|
5
|
+
graph_enrich.py,sha256=POT4LwSkTsrjUmP67bsm2UezUam70cunuPDYDh-v1Bs,63332
|
|
6
|
+
index_common.py,sha256=HT6FKHFJ084eFvd3fR1j8z8gf4eWoPHVW8GXLpw464I,285
|
|
7
|
+
java_index_flow_lancedb.py,sha256=MH9iTNF6HDHDTt5Jn7TOVE5hQ4WUPNt7PlQoh1tuh9o,13212
|
|
8
|
+
java_index_v1_common.py,sha256=nF1KrSqboF_RRvWerG9knRRFmWwsrG_CvhgnsoZ8KqA,1154
|
|
9
|
+
java_ontology.py,sha256=FcnOq1XWhUP03OfnTkRStslqrNyukzUKH7VNuK6Bme4,16425
|
|
10
|
+
ladybug_queries.py,sha256=912j9VAYDjcU4ReVorWQ6R4DZl0tteKic-Pqu0jyBS0,90837
|
|
11
|
+
mcp_hints.py,sha256=3swh05LSiWur3tm3-yssndBsLxIxFhy501kBtJI8jJ0,42509
|
|
12
|
+
mcp_v2.py,sha256=64UDrQ27hAQtlz3pFp9A3Xlk95bUjYZ4VBscsyAPCIY,79116
|
|
13
|
+
path_filtering.py,sha256=-oX16SYLWYwX9pcV1fu3vbVTIhY1GzFflT7J1E2tqPY,17122
|
|
14
|
+
pr_analysis.py,sha256=3-5L8_G5XupdJsl9RN73Lq-ejPoK11B3m_VzAx2fGG8,18413
|
|
15
|
+
search_lancedb.py,sha256=scG6HBUrsgIeSWFrGcLcGdhWv1qODOx4JOBMAlLDY_E,36793
|
|
16
|
+
server.py,sha256=uGKT0PdM-bVrzIsfbxF6ZuHGcuRMSSlvkJk0e7Ff43Y,30556
|
|
17
|
+
java_codebase_rag/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
18
|
+
java_codebase_rag/_fdlimit.py,sha256=WroFdfSNbcriKok6q8znTf74dqlznxea_1Fd5bHl_3o,1930
|
|
19
|
+
java_codebase_rag/cli.py,sha256=a5IFLWAsh77mfLv1Z9OdpvLaYvj4i0KR3_kLtL-ans8,34156
|
|
20
|
+
java_codebase_rag/cli_format.py,sha256=arU7P9W6Fvm7X_wzR1wJ8EfyxK1rDP_ESEhdA0ub4Mo,2579
|
|
21
|
+
java_codebase_rag/cli_progress.py,sha256=9jCqEagYOXs32SYVA31_sOCrONvYy7cl1CrdBD2Pg44,3168
|
|
22
|
+
java_codebase_rag/config.py,sha256=u4OomvglTWHUmMpcxN8wPRnRGfXVp3qK_GJ5pY96O98,16267
|
|
23
|
+
java_codebase_rag/installer.py,sha256=DlBuVVWbHXgcjaQkuXUeT9fNdmk7XZefVT3zzw47k18,45965
|
|
24
|
+
java_codebase_rag/lance_optimize.py,sha256=MzACYlgwxmkJCK64qQLyIAdizSq5BARqaMYSZONlc1I,6069
|
|
25
|
+
java_codebase_rag/pipeline.py,sha256=UcgluFAW9Ghnas8u40x45bVic0mQv6rjzcliDKsnYJI,11936
|
|
26
|
+
java_codebase_rag/install_data/agents/explorer-rag-enhanced.md,sha256=APl9d-No12qZNZLjU7mwNRwxHIgnT3ZtQZiD4clWlyU,14413
|
|
27
|
+
java_codebase_rag/install_data/skills/explore-codebase/SKILL.md,sha256=pIM-Xdwq_fXkhhBJCdb-fA2nes5c_mMPcdUXb7Adyxo,12040
|
|
28
|
+
java_codebase_rag-0.6.0.dist-info/licenses/LICENSE,sha256=gxvtiHtuviR_q8ZAjWw-QTcF3DyPzg6ZY-lQrr8OPpw,1068
|
|
29
|
+
java_codebase_rag-0.6.0.dist-info/METADATA,sha256=GoMO3zFTb98w4rVV5SMXpcLK-irlDs7aUH0wBGlv5cQ,16887
|
|
30
|
+
java_codebase_rag-0.6.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
31
|
+
java_codebase_rag-0.6.0.dist-info/entry_points.txt,sha256=mVVQJa0n73OWfhHXYCDoPRrWin_LJhH2Rn0CkJ2iax4,101
|
|
32
|
+
java_codebase_rag-0.6.0.dist-info/top_level.txt,sha256=syQgi8XPBwY2ws_NZ1uRCxTf_s41NpshwEHNdcdnk3A,245
|
|
33
|
+
java_codebase_rag-0.6.0.dist-info/RECORD,,
|
java_index_flow_lancedb.py
CHANGED
|
@@ -4,7 +4,7 @@ CocoIndex 1.0 app: index Java, Flyway SQL, and YAML into LanceDB.
|
|
|
4
4
|
LanceDB requires a single primary key per table; each chunk gets a UUID `id`.
|
|
5
5
|
|
|
6
6
|
Environment:
|
|
7
|
-
JAVA_CODEBASE_RAG_INDEX_DIR — Lance tables +
|
|
7
|
+
JAVA_CODEBASE_RAG_INDEX_DIR — Lance tables + LadybugDB + cocoindex state (default: ./.java-codebase-rag)
|
|
8
8
|
JAVA_CODEBASE_RAG_SOURCE_ROOT — Java repo root for indexing (optional; else cocoindex cwd)
|
|
9
9
|
SBERT_MODEL / SBERT_DEVICE — embedding (optional; YAML also supported via java-codebase-rag CLI)
|
|
10
10
|
|
|
@@ -36,6 +36,7 @@ from cocoindex.ops.text import RecursiveSplitter, detect_code_language
|
|
|
36
36
|
from cocoindex.resources.file import PatternFilePathMatcher
|
|
37
37
|
|
|
38
38
|
from java_codebase_rag.config import resolved_sbert_model_for_process_env
|
|
39
|
+
from java_codebase_rag.lance_optimize import LANCE_TABLE_NAMES
|
|
39
40
|
from java_index_v1_common import (
|
|
40
41
|
JAVA_CHUNK,
|
|
41
42
|
SBERT_MODEL,
|
|
@@ -68,6 +69,20 @@ else:
|
|
|
68
69
|
|
|
69
70
|
splitter = RecursiveSplitter()
|
|
70
71
|
|
|
72
|
+
# cocoindex 1.0.7 schedules ``table.optimize()`` (a LanceDB Rewrite/compaction
|
|
73
|
+
# transaction) as a *background* asyncio task after every
|
|
74
|
+
# ``num_transactions_before_optimize`` mutation batches (default 50). That
|
|
75
|
+
# background Rewrite races the concurrent ``table.delete()`` (Delete)
|
|
76
|
+
# transactions emitted by later batches, and LanceDB does not allow a Rewrite
|
|
77
|
+
# to commit concurrently with a Delete (upstream lancedb#1504), which floods
|
|
78
|
+
# stderr with "Retryable commit conflict ... preempted by concurrent
|
|
79
|
+
# transaction Delete". Setting this effectively to infinity disables the
|
|
80
|
+
# in-flight background optimize; the serialized post-flow optimize in
|
|
81
|
+
# ``lance_optimize.optimize_lance_tables`` then compacts the table with no
|
|
82
|
+
# concurrent writers. ``optimize()`` is pure maintenance (compact/prune/index);
|
|
83
|
+
# upsert/delete correctness via merge_insert does not depend on it.
|
|
84
|
+
_NUM_TXN_BEFORE_OPTIMIZE = 10**12
|
|
85
|
+
|
|
71
86
|
|
|
72
87
|
@dataclass
|
|
73
88
|
class JavaLanceChunk:
|
|
@@ -317,8 +332,9 @@ async def app_main() -> None:
|
|
|
317
332
|
)
|
|
318
333
|
java_table = await lancedb.mount_table_target(
|
|
319
334
|
LANCE_DB,
|
|
320
|
-
|
|
335
|
+
LANCE_TABLE_NAMES[0],
|
|
321
336
|
java_schema,
|
|
337
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
322
338
|
)
|
|
323
339
|
|
|
324
340
|
sql_schema = await lancedb.TableSchema.from_class(
|
|
@@ -327,8 +343,9 @@ async def app_main() -> None:
|
|
|
327
343
|
)
|
|
328
344
|
sql_table = await lancedb.mount_table_target(
|
|
329
345
|
LANCE_DB,
|
|
330
|
-
|
|
346
|
+
LANCE_TABLE_NAMES[1],
|
|
331
347
|
sql_schema,
|
|
348
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
332
349
|
)
|
|
333
350
|
|
|
334
351
|
yaml_schema = await lancedb.TableSchema.from_class(
|
|
@@ -337,8 +354,9 @@ async def app_main() -> None:
|
|
|
337
354
|
)
|
|
338
355
|
yaml_table = await lancedb.mount_table_target(
|
|
339
356
|
LANCE_DB,
|
|
340
|
-
|
|
357
|
+
LANCE_TABLE_NAMES[2],
|
|
341
358
|
yaml_schema,
|
|
359
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
342
360
|
)
|
|
343
361
|
|
|
344
362
|
project_root = coco.use_context(PROJECT_ROOT)
|
java_ontology.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
"""Read-only Cypher helpers over the
|
|
1
|
+
"""Read-only Cypher helpers over the Ladybug AST graph built by `build_ast_graph.py`.
|
|
2
2
|
|
|
3
|
-
Each function opens a
|
|
3
|
+
Each function opens a Ladybug connection on demand and returns plain JSON-ish dicts
|
|
4
4
|
so the MCP server can serialize them without further mapping.
|
|
5
5
|
|
|
6
|
-
The
|
|
6
|
+
The Ladybug database is opened read-only and cached per-process. This module is
|
|
7
7
|
intentionally dependency-light: nothing here imports LanceDB or sentence-transformers.
|
|
8
8
|
|
|
9
9
|
Cypher pitfalls (see also ``AGENTS.md``): avoid ``label(e) IN $list`` in ``WHERE`` for
|
|
@@ -16,17 +16,37 @@ from __future__ import annotations
|
|
|
16
16
|
import json
|
|
17
17
|
import logging
|
|
18
18
|
import os
|
|
19
|
+
import re
|
|
19
20
|
import threading
|
|
20
21
|
from dataclasses import asdict, dataclass
|
|
21
22
|
from pathlib import Path
|
|
22
23
|
from typing import Any, Literal
|
|
23
24
|
|
|
24
|
-
import
|
|
25
|
+
import ladybug
|
|
25
26
|
|
|
26
27
|
from ast_java import ONTOLOGY_VERSION as _ONTOLOGY_VERSION
|
|
27
28
|
|
|
28
29
|
log = logging.getLogger(__name__)
|
|
29
30
|
|
|
31
|
+
|
|
32
|
+
def _parse_ladybug_json(raw: str | dict[str, Any] | None) -> dict[str, Any]:
    """Decode a JSON-ish object payload read from LadybugDB into a dict.

    LadybugDB may hand back objects with unquoted keys, e.g.
    ``{packages: 1, files: 2}``. Standard JSON is tried first so well-formed
    payloads are never rewritten; only if that fails are bare keys quoted and
    the result parsed again.

    Args:
        raw: The stored payload. ``None`` / empty yields ``{}``; an
            already-decoded ``dict`` is returned unchanged.

    Returns:
        The decoded mapping, or ``{}`` when the payload is empty, is not an
        object, or cannot be parsed (a warning is logged in the last two
        cases).
    """
    if not raw:
        return {}
    if isinstance(raw, dict):
        # Already decoded; nothing to parse.
        return raw
    if not isinstance(raw, str):
        log.warning("Failed to parse counts_json: %s", str(raw)[:100])
        return {}

    def _quote_key(match: re.Match[str]) -> str:
        # First alternative matched a complete string literal: keep verbatim
        # so colons inside values (e.g. "a:b") are never touched.
        if match.group(2) is None:
            return match.group(0)
        return f'{match.group(1)}"{match.group(2)}"{match.group(3)}'

    # A key is a bare word directly after "{" or "," and before ":".
    # {packages: 1, files: 2} -> {"packages": 1, "files": 2}
    quoted = re.sub(r'"(?:\\.|[^"\\])*"|([{,]\s*)(\w+)(\s*:)', _quote_key, raw)

    for candidate in (raw, quoted):
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed

    log.warning("Failed to parse counts_json: %s", raw[:100])
    return {}
|
|
49
|
+
|
|
30
50
|
# Composed describe / neighbors dot-keys (not stored graph edge labels).
|
|
31
51
|
_MEMBER_EDGE_COMPOSED_REL_MAP: tuple[tuple[str, str], ...] = (
|
|
32
52
|
("DECLARES.DECLARES_CLIENT", "DECLARES_CLIENT"),
|
|
@@ -46,7 +66,7 @@ OVERRIDE_AXIS_COMPOSED_EDGE_TYPES: frozenset[str] = frozenset(_OVERRIDE_AXIS_COM
|
|
|
46
66
|
|
|
47
67
|
|
|
48
68
|
def _coerce_id_list(raw: Any) -> list[str]:
|
|
49
|
-
"""Normalize
|
|
69
|
+
"""Normalize Ladybug ``collect(DISTINCT ...)`` list results to string ids."""
|
|
50
70
|
if raw is None:
|
|
51
71
|
return []
|
|
52
72
|
if isinstance(raw, list):
|
|
@@ -56,8 +76,8 @@ def _coerce_id_list(raw: Any) -> list[str]:
|
|
|
56
76
|
|
|
57
77
|
|
|
58
78
|
__all__ = [
|
|
59
|
-
"
|
|
60
|
-
"
|
|
79
|
+
"LadybugGraph",
|
|
80
|
+
"resolve_ladybug_path",
|
|
61
81
|
"SymbolHit",
|
|
62
82
|
"EdgeHit",
|
|
63
83
|
"CallEdge",
|
|
@@ -68,14 +88,14 @@ __all__ = [
|
|
|
68
88
|
]
|
|
69
89
|
|
|
70
90
|
|
|
71
|
-
def
|
|
72
|
-
"""Resolve the
|
|
91
|
+
def resolve_ladybug_path(explicit: str | None = None) -> str:
    """Resolve the filesystem path of the Ladybug graph database.

    Resolution order:

    1. *explicit*, with ``~`` expanded, when given.
    2. ``$JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.lbug`` when the variable is
       set to a local path (``s3://``, ``gs://`` and ``az://`` values are
       ignored here).
    3. ``<cwd>/.java-codebase-rag/code_graph.lbug``, resolved to an absolute
       path.

    NOTE(review): the original docstring says this mirrors the graph
    builder's resolution; the builder is not visible here — confirm the two
    stay in sync.

    Args:
        explicit: Optional caller-supplied database path.

    Returns:
        The database path as a string.
    """
    if explicit:
        return str(Path(explicit).expanduser())
    idx = os.environ.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
    if idx and not idx.startswith(("s3://", "gs://", "az://")):
        # Path() already drops trailing separators. Stripping "/" by hand
        # turned a root index dir ("/" or "C:/") into "" or "C:", which made
        # the result relative / drive-relative instead of absolute.
        return str(Path(os.path.expanduser(idx)) / "code_graph.lbug")
    return str((Path.cwd() / ".java-codebase-rag" / "code_graph.lbug").resolve())
|
|
79
99
|
|
|
80
100
|
|
|
81
101
|
@dataclass
|
|
@@ -165,10 +185,10 @@ class RouteCaller:
|
|
|
165
185
|
|
|
166
186
|
|
|
167
187
|
def _symbol_return_for(alias: str) -> str:
|
|
168
|
-
"""
|
|
188
|
+
"""Ladybug RETURN projection for Symbol properties, using the given node alias.
|
|
169
189
|
|
|
170
190
|
Centralised so queries that bind Symbol under a non-`s` alias (e.g. `n` in
|
|
171
|
-
graph-expansion / flow-tracing) don't emit `s.*` references that
|
|
191
|
+
graph-expansion / flow-tracing) don't emit `s.*` references that Ladybug
|
|
172
192
|
rejects with `Variable s is not in scope`.
|
|
173
193
|
"""
|
|
174
194
|
return (
|
|
@@ -198,7 +218,7 @@ def _scope_filters(
|
|
|
198
218
|
|
|
199
219
|
Mutates `params` to bind `$module` / `$microservice` only when the
|
|
200
220
|
corresponding filter is set, so unused names don't leak into the
|
|
201
|
-
|
|
221
|
+
Ladybug plan.
|
|
202
222
|
"""
|
|
203
223
|
out: list[str] = []
|
|
204
224
|
if module:
|
|
@@ -274,7 +294,7 @@ _SYM_COLS = (
|
|
|
274
294
|
|
|
275
295
|
|
|
276
296
|
def find_symbols_in_file_range(
|
|
277
|
-
graph: "
|
|
297
|
+
graph: "LadybugGraph",
|
|
278
298
|
*,
|
|
279
299
|
filename: str,
|
|
280
300
|
start_line: int,
|
|
@@ -324,25 +344,25 @@ def _call_graph_needle_phantom_arity_alt(needle: str) -> str | None:
|
|
|
324
344
|
return needle[:i] + "(?)"
|
|
325
345
|
|
|
326
346
|
|
|
327
|
-
class
|
|
328
|
-
"""Thin wrapper around a read-only
|
|
347
|
+
class LadybugGraph:
|
|
348
|
+
"""Thin wrapper around a read-only Ladybug connection.
|
|
329
349
|
|
|
330
350
|
Safe to share across threads: we hold a single `Connection`, guarded by a lock.
|
|
331
351
|
"""
|
|
332
352
|
|
|
333
353
|
_lock = threading.Lock()
|
|
334
|
-
_instance: "
|
|
354
|
+
_instance: "LadybugGraph | None" = None
|
|
335
355
|
_instance_path: str | None = None
|
|
336
356
|
|
|
337
357
|
def __init__(self, db_path: str) -> None:
|
|
338
358
|
self.db_path = db_path
|
|
339
|
-
self._db =
|
|
340
|
-
self._conn =
|
|
359
|
+
self._db = ladybug.Database(db_path, read_only=True)
|
|
360
|
+
self._conn = ladybug.Connection(self._db)
|
|
341
361
|
self._conn_lock = threading.Lock()
|
|
342
362
|
|
|
343
363
|
@classmethod
|
|
344
|
-
def get(cls, db_path: str | None = None) -> "
|
|
345
|
-
resolved =
|
|
364
|
+
def get(cls, db_path: str | None = None) -> "LadybugGraph":
|
|
365
|
+
resolved = resolve_ladybug_path(db_path)
|
|
346
366
|
with cls._lock:
|
|
347
367
|
if cls._instance is None or cls._instance_path != resolved:
|
|
348
368
|
instance = cls(resolved)
|
|
@@ -354,7 +374,7 @@ class KuzuGraph:
|
|
|
354
374
|
f"required version {_ONTOLOGY_VERSION}. "
|
|
355
375
|
"Rebuild the graph: `python build_ast_graph.py --source-root <repo>`, "
|
|
356
376
|
"or run `java-codebase-rag reprocess --source-root <repo>` for a full "
|
|
357
|
-
"Lance+
|
|
377
|
+
"Lance+Ladybug re-index."
|
|
358
378
|
)
|
|
359
379
|
cls._instance = instance
|
|
360
380
|
cls._instance_path = resolved
|
|
@@ -362,11 +382,11 @@ class KuzuGraph:
|
|
|
362
382
|
|
|
363
383
|
@classmethod
|
|
364
384
|
def exists(cls, db_path: str | None = None) -> bool:
|
|
365
|
-
resolved =
|
|
385
|
+
resolved = resolve_ladybug_path(db_path)
|
|
366
386
|
p = Path(resolved)
|
|
367
387
|
if not p.exists():
|
|
368
388
|
return False
|
|
369
|
-
#
|
|
389
|
+
# Ladybug represents DB as a directory; allow file form too (single-file DBs).
|
|
370
390
|
return True
|
|
371
391
|
|
|
372
392
|
# ---- low-level ----
|
|
@@ -481,11 +501,15 @@ class KuzuGraph:
|
|
|
481
501
|
if not rows:
|
|
482
502
|
return {"error": "no GraphMeta node"}
|
|
483
503
|
row = rows[0]
|
|
484
|
-
counts: dict[str, Any]
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
504
|
+
counts: dict[str, Any] = _parse_ladybug_json(row.get("counts_json"))
|
|
505
|
+
# Ensure counts has expected keys even if empty
|
|
506
|
+
if not counts:
|
|
507
|
+
counts = {
|
|
508
|
+
"packages": 0, "files": 0, "types": 0, "members": 0, "phantoms": 0,
|
|
509
|
+
"extends": 0, "implements": 0, "injects": 0, "declares": 0, "overrides": 0,
|
|
510
|
+
"calls": 0, "routes": 0, "exposes": 0, "clients": 0, "declares_client": 0,
|
|
511
|
+
"producers": 0, "declares_producer": 0, "http_calls": 0, "async_calls": 0,
|
|
512
|
+
}
|
|
489
513
|
routes_total = exposes_total = 0
|
|
490
514
|
routes_resolved_pct = 0.0
|
|
491
515
|
routes_by_framework: dict[str, Any] = {}
|
|
@@ -507,10 +531,7 @@ class KuzuGraph:
|
|
|
507
531
|
cross_service_resolution: str | None = None
|
|
508
532
|
if meta_mode != "legacy":
|
|
509
533
|
rfw_raw = row.get("routes_by_framework") or "{}"
|
|
510
|
-
|
|
511
|
-
routes_by_framework = json.loads(rfw_raw) if isinstance(rfw_raw, str) else (rfw_raw or {})
|
|
512
|
-
except Exception:
|
|
513
|
-
routes_by_framework = {}
|
|
534
|
+
routes_by_framework = _parse_ladybug_json(rfw_raw) if isinstance(rfw_raw, str) else (rfw_raw or {})
|
|
514
535
|
if not isinstance(routes_by_framework, dict):
|
|
515
536
|
routes_by_framework = {}
|
|
516
537
|
routes_total = int(row.get("routes_total") or 0)
|
|
@@ -519,26 +540,17 @@ class KuzuGraph:
|
|
|
519
540
|
if meta_mode in ("pr_f1", "pr_e3", "pre_e3"):
|
|
520
541
|
routes_from_brownfield_pct = float(row.get("routes_from_brownfield_pct") or 0.0)
|
|
521
542
|
rbl_raw = row.get("routes_by_layer") or "{}"
|
|
522
|
-
|
|
523
|
-
routes_by_layer = json.loads(rbl_raw) if isinstance(rbl_raw, str) else (rbl_raw or {})
|
|
524
|
-
except Exception:
|
|
525
|
-
routes_by_layer = {}
|
|
543
|
+
routes_by_layer = _parse_ladybug_json(rbl_raw) if isinstance(rbl_raw, str) else (rbl_raw or {})
|
|
526
544
|
if not isinstance(routes_by_layer, dict):
|
|
527
545
|
routes_by_layer = {}
|
|
528
546
|
http_calls_total = int(row.get("http_calls_total") or 0)
|
|
529
547
|
async_calls_total = int(row.get("async_calls_total") or 0)
|
|
530
548
|
hbs_raw = row.get("http_calls_by_strategy") or "{}"
|
|
531
549
|
abs_raw = row.get("async_calls_by_strategy") or "{}"
|
|
532
|
-
|
|
533
|
-
http_calls_by_strategy = json.loads(hbs_raw) if isinstance(hbs_raw, str) else (hbs_raw or {})
|
|
534
|
-
except Exception:
|
|
535
|
-
http_calls_by_strategy = {}
|
|
550
|
+
http_calls_by_strategy = _parse_ladybug_json(hbs_raw) if isinstance(hbs_raw, str) else (hbs_raw or {})
|
|
536
551
|
if not isinstance(http_calls_by_strategy, dict):
|
|
537
552
|
http_calls_by_strategy = {}
|
|
538
|
-
|
|
539
|
-
async_calls_by_strategy = json.loads(abs_raw) if isinstance(abs_raw, str) else (abs_raw or {})
|
|
540
|
-
except Exception:
|
|
541
|
-
async_calls_by_strategy = {}
|
|
553
|
+
async_calls_by_strategy = _parse_ladybug_json(abs_raw) if isinstance(abs_raw, str) else (abs_raw or {})
|
|
542
554
|
if not isinstance(async_calls_by_strategy, dict):
|
|
543
555
|
async_calls_by_strategy = {}
|
|
544
556
|
http_calls_resolved_pct = float(row.get("http_calls_resolved_pct") or 0.0)
|
|
@@ -547,16 +559,10 @@ class KuzuGraph:
|
|
|
547
559
|
async_producers_from_brownfield_pct = float(row.get("async_producers_from_brownfield_pct") or 0.0)
|
|
548
560
|
hmb_raw = row.get("http_calls_match_breakdown") or "{}"
|
|
549
561
|
amb_raw = row.get("async_calls_match_breakdown") or "{}"
|
|
550
|
-
|
|
551
|
-
http_calls_match_breakdown = json.loads(hmb_raw) if isinstance(hmb_raw, str) else (hmb_raw or {})
|
|
552
|
-
except Exception:
|
|
553
|
-
http_calls_match_breakdown = {}
|
|
562
|
+
http_calls_match_breakdown = _parse_ladybug_json(hmb_raw) if isinstance(hmb_raw, str) else (hmb_raw or {})
|
|
554
563
|
if not isinstance(http_calls_match_breakdown, dict):
|
|
555
564
|
http_calls_match_breakdown = {}
|
|
556
|
-
|
|
557
|
-
async_calls_match_breakdown = json.loads(amb_raw) if isinstance(amb_raw, str) else (amb_raw or {})
|
|
558
|
-
except Exception:
|
|
559
|
-
async_calls_match_breakdown = {}
|
|
565
|
+
async_calls_match_breakdown = _parse_ladybug_json(amb_raw) if isinstance(amb_raw, str) else (amb_raw or {})
|
|
560
566
|
if not isinstance(async_calls_match_breakdown, dict):
|
|
561
567
|
async_calls_match_breakdown = {}
|
|
562
568
|
cross_service_calls_total = int(row.get("cross_service_calls_total") or 0)
|
|
@@ -1013,7 +1019,7 @@ class KuzuGraph:
|
|
|
1013
1019
|
microservice: str | None = None,
|
|
1014
1020
|
capability: str | None = None,
|
|
1015
1021
|
limit: int = 100) -> list[SymbolHit]:
|
|
1016
|
-
#
|
|
1022
|
+
# Ladybug supports `list_contains` for STRING[].
|
|
1017
1023
|
filters = ["list_contains(s.annotations, $ann)"]
|
|
1018
1024
|
params: dict[str, Any] = {"ann": annotation}
|
|
1019
1025
|
if capability:
|
|
@@ -1454,7 +1460,7 @@ class KuzuGraph:
|
|
|
1454
1460
|
))
|
|
1455
1461
|
if entry_roles:
|
|
1456
1462
|
params["entry_roles"] = list(entry_roles)
|
|
1457
|
-
#
|
|
1463
|
+
# Ladybug 0.17.x does not support parameterized lists inside ANY
|
|
1458
1464
|
# comprehensions, so we expand the fixed capability set as
|
|
1459
1465
|
# individual list_contains predicates ORed together.
|
|
1460
1466
|
cap_predicates = " OR ".join(
|