java-codebase-rag 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +24 -7
- build_ast_graph.py +153 -94
- graph_enrich.py +3 -3
- java_codebase_rag/_fdlimit.py +48 -0
- java_codebase_rag/cli.py +31 -28
- java_codebase_rag/config.py +40 -10
- java_codebase_rag/installer.py +99 -10
- java_codebase_rag/lance_optimize.py +148 -0
- java_codebase_rag/pipeline.py +63 -9
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/METADATA +6 -5
- java_codebase_rag-0.6.1.dist-info/RECORD +36 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/top_level.txt +1 -1
- java_index_flow_lancedb.py +22 -4
- java_ontology.py +5 -2
- ladybug_queries.py +1995 -0
- mcp_v2.py +51 -26
- pr_analysis.py +1 -1
- search_lancedb.py +8 -8
- server.py +116 -68
- user_rag/__init__.py +1 -0
- user_rag/cli.py +175 -0
- java_codebase_rag-0.5.3.dist-info/RECORD +0 -31
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/WHEEL +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/entry_points.txt +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Serialized post-flow LanceDB optimize with commit-conflict retry.
|
|
2
|
+
|
|
3
|
+
cocoindex 1.0.7 schedules ``table.optimize()`` (a LanceDB **Rewrite**/compaction
|
|
4
|
+
transaction) as a *background* ``asyncio`` task that races concurrent
|
|
5
|
+
``table.delete()`` (**Delete**) transactions emitted by later mutation batches.
|
|
6
|
+
LanceDB does not allow a Rewrite to commit concurrently with a Delete
|
|
7
|
+
(upstream lancedb#1504 — "We do not support concurrent deletes right now"),
|
|
8
|
+
which surfaces as a flood of::
|
|
9
|
+
|
|
10
|
+
RuntimeError: lance error: Retryable commit conflict for version N: \
|
|
11
|
+
This Rewrite transaction was preempted by concurrent transaction Delete ...
|
|
12
|
+
|
|
13
|
+
To eliminate the race, the flow (``java_index_flow_lancedb.py``) disables the
|
|
14
|
+
in-flight background optimize entirely by raising
|
|
15
|
+
``num_transactions_before_optimize`` to a value that is effectively never
|
|
16
|
+
reached. This module then performs a *single*, serialized optimize after the
|
|
17
|
+
flow returns (exit 0 → no concurrent writers), retrying the rare residual
|
|
18
|
+
commit conflict that two internal compaction passes can still produce.
|
|
19
|
+
"""
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import asyncio
|
|
23
|
+
import sys
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
# Single source of truth for the three Lance table names created by the flow.
|
|
27
|
+
# Keep in sync with ``search_lancedb.TABLES`` (the values there mirror these).
|
|
28
|
+
LANCE_TABLE_NAMES: tuple[str, ...] = (
|
|
29
|
+
"javacodeindex_java_code",
|
|
30
|
+
"sqlschemaindex_sql_schema",
|
|
31
|
+
"yamlconfigindex_yaml_config",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Commit conflicts are transient; a handful of exponential-backoff retries is
|
|
35
|
+
# enough because, post-flow, there are no concurrent writers — only successive
|
|
36
|
+
# optimize/compaction passes within this single serialized call can still
|
|
37
|
+
# transiently preempt one another.
|
|
38
|
+
_MAX_ATTEMPTS = 6
|
|
39
|
+
_BASE_BACKOFF_S = 0.1
|
|
40
|
+
|
|
41
|
+
# Substrings identifying the retryable Lance commit-conflict error. LanceDB
|
|
42
|
+
# wraps the underlying lance error text into the raised ``RuntimeError`` str,
|
|
43
|
+
# so a substring match is the robust detector (no dedicated exception type).
|
|
44
|
+
_RETRYABLE_MARKERS = (
|
|
45
|
+
"Retryable commit conflict",
|
|
46
|
+
"preempted by concurrent transaction",
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _is_retryable(exc: BaseException) -> bool:
    """Report whether *exc* looks like a retryable Lance commit conflict.

    The check is purely textual: the exception is stringified and scanned for
    each entry of ``_RETRYABLE_MARKERS``; the first hit is enough.
    """
    message = str(exc)
    for needle in _RETRYABLE_MARKERS:
        if needle in message:
            return True
    return False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def _list_table_names(db: object) -> set[str]:
    """Return the set of table names reported by the connection *db*.

    ``list_tables`` is used when the connection object has it; otherwise the
    call falls back to ``table_names``. The ``list_tables`` result may either
    carry the names on a ``tables`` attribute or be the iterable of names
    itself — both shapes are accepted.
    """
    if not hasattr(db, "list_tables"):
        # Connection without ``list_tables``: only ``table_names`` is available.
        legacy_names = await db.table_names()
        return set(legacy_names)

    listing = await db.list_tables()
    # Unwrap ``.tables`` when present; otherwise treat the response as the names.
    names = getattr(listing, "tables", listing)
    return set(names)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
async def _optimize_with_retry(table: object) -> BaseException | None:
    """Run ``table.optimize()`` under the module's retry policy.

    Returns ``None`` on success. Otherwise returns the exception that ended
    the loop: a non-retryable error, or the last retryable one once
    ``_MAX_ATTEMPTS`` attempts have been used.
    """
    failure: BaseException | None = None
    for attempt in range(_MAX_ATTEMPTS):
        try:
            await table.optimize()
        except Exception as exc:
            failure = exc
            out_of_retries = attempt >= _MAX_ATTEMPTS - 1
            if not _is_retryable(exc) or out_of_retries:
                # Hand the failure back to the caller instead of swallowing it.
                break
            # Exponential backoff: base delay doubled on every further attempt.
            await asyncio.sleep(_BASE_BACKOFF_S * (2**attempt))
        else:
            failure = None
            break
    return failure


async def optimize_lance_tables(index_dir: Path, *, quiet: bool = False) -> dict[str, str]:
    """Run a serial, retrying ``optimize()`` over the known Lance tables.

    Each name in :data:`LANCE_TABLE_NAMES` that exists in the database at
    *index_dir* is opened and optimized one after another. Retryable commit
    conflicts are retried with exponential backoff. Any other failure, or
    running out of attempts, is recorded for that table and reported on
    **stderr**. Nothing is ever written to stdout.

    Args:
        index_dir: directory passed (as a string) to ``lancedb.connect_async``.
        quiet: when True, the per-table "ok" / "absent, skipped" lines are
            suppressed; error lines are printed regardless.

    Returns:
        Mapping of table name → status: ``"ok"``, ``"skipped"`` (table not
        present), or a string starting with ``"error: "``. If listing the
        tables fails, every known table maps to ``"error: list failed: ..."``.
    """
    # Imported lazily so that importing this module does not import lancedb.
    import lancedb

    def _log(message: str) -> None:
        print(message, file=sys.stderr)

    statuses: dict[str, str] = {}
    db = await lancedb.connect_async(str(index_dir))
    try:
        try:
            present = await _list_table_names(db)
        except Exception as exc:
            _log(
                f"java-codebase-rag: optimize: failed to list tables in "
                f"{index_dir}: {exc}"
            )
            return {name: f"error: list failed: {exc}" for name in LANCE_TABLE_NAMES}

        for name in LANCE_TABLE_NAMES:
            if name not in present:
                statuses[name] = "skipped"
                if not quiet:
                    _log(f"java-codebase-rag: optimize: {name} absent, skipped")
                continue

            try:
                table = await db.open_table(name)
            except Exception as exc:
                statuses[name] = f"error: open failed: {exc}"
                _log(f"java-codebase-rag: optimize: {name} open failed: {exc}")
                continue

            failure = await _optimize_with_retry(table)
            if failure is not None:
                statuses[name] = f"error: {failure}"
                _log(f"java-codebase-rag: optimize: {name} failed: {failure}")
                continue

            statuses[name] = "ok"
            if not quiet:
                _log(f"java-codebase-rag: optimize: {name} ok")
    finally:
        # Called without ``await``: the connection's ``close`` is used as a
        # plain synchronous method here.
        db.close()
    return statuses
|
java_codebase_rag/pipeline.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
"""Subprocess helpers for cocoindex + graph builder (no heavy ML imports at import time)."""
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
+
import asyncio
|
|
4
5
|
import os
|
|
5
6
|
import shutil
|
|
6
7
|
import subprocess
|
|
@@ -11,6 +12,7 @@ from pathlib import Path
|
|
|
11
12
|
|
|
12
13
|
from java_codebase_rag.cli_format import Spinner, is_noise_line, stderr_is_tty
|
|
13
14
|
from java_codebase_rag.cli_progress import emit_vectors_finish, emit_vectors_start
|
|
15
|
+
from java_codebase_rag.config import cocoindex_subprocess_env_defaults
|
|
14
16
|
|
|
15
17
|
COCOINDEX_TARGET = "java_index_flow_lancedb.py:JavaCodeIndexLance"
|
|
16
18
|
|
|
@@ -110,6 +112,57 @@ def run_cocoindex_update(
|
|
|
110
112
|
quiet: bool,
|
|
111
113
|
verbose: bool = True,
|
|
112
114
|
lance_project_root: Path | None = None,
|
|
115
|
+
) -> subprocess.CompletedProcess[str]:
|
|
116
|
+
result = _run_cocoindex_update_impl(
|
|
117
|
+
env,
|
|
118
|
+
full_reprocess=full_reprocess,
|
|
119
|
+
quiet=quiet,
|
|
120
|
+
verbose=verbose,
|
|
121
|
+
lance_project_root=lance_project_root,
|
|
122
|
+
)
|
|
123
|
+
# After cocoindex returns exit 0 there are no concurrent writers, so this
|
|
124
|
+
# is the safe window to compact the Lance tables. The flow disabled its
|
|
125
|
+
# in-flight background optimize (see java_index_flow_lancedb.py), making
|
|
126
|
+
# this serialized pass the sole optimizer. Optimize failure does not flip
|
|
127
|
+
# the cocoindex CompletedProcess (a successful index is still usable, just
|
|
128
|
+
# not compacted); the outcome is logged to stderr only.
|
|
129
|
+
if result.returncode == 0:
|
|
130
|
+
_maybe_run_serialized_optimize(env, quiet=quiet)
|
|
131
|
+
return result
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _maybe_run_serialized_optimize(env: dict[str, str], *, quiet: bool) -> None:
    """Run the serialized Lance optimize for the index dir named in *env*.

    The directory is read from ``JAVA_CODEBASE_RAG_INDEX_DIR``. When that
    entry is missing or blank, the optimize is skipped with a warning on
    stderr. Any exception raised while importing or running the optimize is
    reported on stderr and otherwise ignored, so this function never raises
    an ``Exception`` to its caller.

    Args:
        env: environment mapping to read the index directory from.
        quiet: forwarded to ``optimize_lance_tables``.
    """
    index_dir_text = env.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
    if index_dir_text:
        try:
            from java_codebase_rag.lance_optimize import optimize_lance_tables

            asyncio.run(optimize_lance_tables(Path(index_dir_text), quiet=quiet))
        except Exception as exc:
            # Best effort only: a failed optimize is logged, never propagated.
            print(f"java-codebase-rag: optimize failed: {exc}", file=sys.stderr)
        return

    print(
        "java-codebase-rag: optimize skipped — JAVA_CODEBASE_RAG_INDEX_DIR "
        "not set in subprocess env",
        file=sys.stderr,
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _run_cocoindex_update_impl(
|
|
160
|
+
env: dict[str, str],
|
|
161
|
+
*,
|
|
162
|
+
full_reprocess: bool,
|
|
163
|
+
quiet: bool,
|
|
164
|
+
verbose: bool = True,
|
|
165
|
+
lance_project_root: Path | None = None,
|
|
113
166
|
) -> subprocess.CompletedProcess[str]:
|
|
114
167
|
exe = cocoindex_bin()
|
|
115
168
|
if not exe.is_file():
|
|
@@ -128,10 +181,11 @@ def run_cocoindex_update(
|
|
|
128
181
|
stdout="",
|
|
129
182
|
stderr=f"java_index_flow_lancedb.py not found under {bd}",
|
|
130
183
|
)
|
|
131
|
-
#
|
|
132
|
-
# See: https://github.com/HumanBean17/java-codebase-rag/issues/
|
|
184
|
+
# Cap CocoIndex concurrency to avoid EMFILE ("too many open files") under
|
|
185
|
+
# default OS fd limits. See: https://github.com/HumanBean17/java-codebase-rag/issues/306
|
|
133
186
|
env = env.copy()
|
|
134
|
-
|
|
187
|
+
for _k, _v in cocoindex_subprocess_env_defaults().items():
|
|
188
|
+
env.setdefault(_k, _v)
|
|
135
189
|
cmd: list[str] = [str(exe), "update", COCOINDEX_TARGET]
|
|
136
190
|
if full_reprocess:
|
|
137
191
|
cmd.extend(["--full-reprocess", "-f"])
|
|
@@ -201,7 +255,7 @@ def run_cocoindex_drop(env: dict[str, str], *, quiet: bool) -> subprocess.Comple
|
|
|
201
255
|
def run_build_ast_graph(
|
|
202
256
|
*,
|
|
203
257
|
source_root: Path,
|
|
204
|
-
|
|
258
|
+
ladybug_path: Path,
|
|
205
259
|
verbose: bool,
|
|
206
260
|
quiet: bool = False,
|
|
207
261
|
env: dict[str, str] | None = None,
|
|
@@ -219,8 +273,8 @@ def run_build_ast_graph(
|
|
|
219
273
|
str(builder),
|
|
220
274
|
"--source-root",
|
|
221
275
|
str(source_root),
|
|
222
|
-
"--
|
|
223
|
-
str(
|
|
276
|
+
"--ladybug-path",
|
|
277
|
+
str(ladybug_path),
|
|
224
278
|
]
|
|
225
279
|
# Three-tier: --quiet (silent) / default (filtered progress) / --verbose (raw).
|
|
226
280
|
# Default passes --verbose so the builder emits per-pass progress lines,
|
|
@@ -254,7 +308,7 @@ def run_build_ast_graph(
|
|
|
254
308
|
def run_incremental_graph(
|
|
255
309
|
*,
|
|
256
310
|
source_root: Path,
|
|
257
|
-
|
|
311
|
+
ladybug_path: Path,
|
|
258
312
|
verbose: bool,
|
|
259
313
|
quiet: bool = False,
|
|
260
314
|
env: dict[str, str] | None = None,
|
|
@@ -273,8 +327,8 @@ def run_incremental_graph(
|
|
|
273
327
|
str(builder),
|
|
274
328
|
"--source-root",
|
|
275
329
|
str(source_root),
|
|
276
|
-
"--
|
|
277
|
-
str(
|
|
330
|
+
"--ladybug-path",
|
|
331
|
+
str(ladybug_path),
|
|
278
332
|
"--incremental",
|
|
279
333
|
]
|
|
280
334
|
# Three-tier: --quiet (silent) / default (filtered progress) / --verbose (raw).
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: java-codebase-rag
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.6.1
|
|
4
4
|
Summary: MCP server for semantic + structural search over Java codebases
|
|
5
5
|
Author: HumanBean17
|
|
6
6
|
License-Expression: MIT
|
|
7
7
|
Project-URL: Homepage, https://github.com/HumanBean17/java-codebase-rag
|
|
8
8
|
Project-URL: Repository, https://github.com/HumanBean17/java-codebase-rag
|
|
9
9
|
Project-URL: Issues, https://github.com/HumanBean17/java-codebase-rag/issues
|
|
10
|
-
Keywords: mcp,java,rag,code-search,graph,lancedb,
|
|
10
|
+
Keywords: mcp,java,rag,code-search,graph,lancedb,ladybug
|
|
11
11
|
Classifier: Development Status :: 3 - Alpha
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -19,7 +19,7 @@ Requires-Python: >=3.11
|
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
21
|
Requires-Dist: cocoindex[lancedb]<2,>=1.0.0a43
|
|
22
|
-
Requires-Dist:
|
|
22
|
+
Requires-Dist: ladybug<0.18,>=0.17.1
|
|
23
23
|
Requires-Dist: lancedb<0.31,>=0.25.3
|
|
24
24
|
Requires-Dist: mcp<2,>=1.27.0
|
|
25
25
|
Requires-Dist: numpy<2.5,>=1.26.4
|
|
@@ -35,6 +35,7 @@ Requires-Dist: unidiff<1,>=0.7.3
|
|
|
35
35
|
Provides-Extra: dev
|
|
36
36
|
Requires-Dist: pytest>=7; extra == "dev"
|
|
37
37
|
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-xdist>=3; extra == "dev"
|
|
38
39
|
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
39
40
|
Dynamic: license-file
|
|
40
41
|
|
|
@@ -103,7 +104,7 @@ java-codebase-rag install
|
|
|
103
104
|
java-codebase-rag install --non-interactive --agent claude-code
|
|
104
105
|
```
|
|
105
106
|
|
|
106
|
-
After `pip install --upgrade java-codebase-rag`, run `java-codebase-rag update` to refresh shipped artifacts.
|
|
107
|
+
After `pip install --upgrade java-codebase-rag`, run `java-codebase-rag update` to refresh shipped artifacts and catch up the index (Lance + graph).
|
|
107
108
|
|
|
108
109
|
### Manual registration
|
|
109
110
|
|
|
@@ -235,7 +236,7 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi
|
|
|
235
236
|
| Group | Subcommand | What it does |
|
|
236
237
|
|---|---|---|
|
|
237
238
|
| Setup | `install` | Interactive setup wizard: config, MCP registration, skill/agent deployment, indexing. |
|
|
238
|
-
| Setup | `update` | Refresh shipped artifacts (skill, agent, MCP entry) after pip upgrade. |
|
|
239
|
+
| Setup | `update` | Refresh shipped artifacts (skill, agent, MCP entry) + incremental Lance/graph catch-up after pip upgrade. |
|
|
239
240
|
| Lifecycle | `init` | First-time index. Refuses if artifacts already exist. |
|
|
240
241
|
| Lifecycle | `increment` | CocoIndex catch-up + incremental Kuzu update. `--vectors-only` for Lance only. |
|
|
241
242
|
| Lifecycle | `reprocess` | Full Lance + Kuzu rebuild. `--vectors-only` / `--graph-only` for a single phase. |
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
ast_java.py,sha256=NQgZzstbsMq-PdowoD6r_ixJKxEEFzTP9xUzqDpiXeU,99661
|
|
2
|
+
brownfield_events.py,sha256=yxXkKDgMb3VPtaiakGzncHM_EGnda8xIue6w90yYp8s,2055
|
|
3
|
+
build_ast_graph.py,sha256=OKigswkUmWwUAKXXRNH4zplw2VonIdWUWzVjC-t5roo,152893
|
|
4
|
+
chunk_heuristics.py,sha256=aQk2NOKxzUdqoUAJUO3G3LE0MN_bYZWNLQ0tkmj5uts,1813
|
|
5
|
+
graph_enrich.py,sha256=POT4LwSkTsrjUmP67bsm2UezUam70cunuPDYDh-v1Bs,63332
|
|
6
|
+
index_common.py,sha256=HT6FKHFJ084eFvd3fR1j8z8gf4eWoPHVW8GXLpw464I,285
|
|
7
|
+
java_index_flow_lancedb.py,sha256=MH9iTNF6HDHDTt5Jn7TOVE5hQ4WUPNt7PlQoh1tuh9o,13212
|
|
8
|
+
java_index_v1_common.py,sha256=nF1KrSqboF_RRvWerG9knRRFmWwsrG_CvhgnsoZ8KqA,1154
|
|
9
|
+
java_ontology.py,sha256=71bCLDNvMy0SpZPzSR5apJ0qJXNd6y5ggkLdBEw_PFo,16682
|
|
10
|
+
kuzu_queries.py,sha256=9bQzrU311AOw_BcUp_KSGiZgPVSaLSU7y63XfcT_vqI,90137
|
|
11
|
+
ladybug_queries.py,sha256=912j9VAYDjcU4ReVorWQ6R4DZl0tteKic-Pqu0jyBS0,90837
|
|
12
|
+
mcp_hints.py,sha256=3swh05LSiWur3tm3-yssndBsLxIxFhy501kBtJI8jJ0,42509
|
|
13
|
+
mcp_v2.py,sha256=o94GJI7j6dLJDIA3R_1ZiQhjzQfMAEW3etdeZYnHOUc,80637
|
|
14
|
+
path_filtering.py,sha256=-oX16SYLWYwX9pcV1fu3vbVTIhY1GzFflT7J1E2tqPY,17122
|
|
15
|
+
pr_analysis.py,sha256=3-5L8_G5XupdJsl9RN73Lq-ejPoK11B3m_VzAx2fGG8,18413
|
|
16
|
+
search_lancedb.py,sha256=scG6HBUrsgIeSWFrGcLcGdhWv1qODOx4JOBMAlLDY_E,36793
|
|
17
|
+
server.py,sha256=Js3XDpV7ThAtj352StH6QdhHutf1D5qUkbR-8k3jO8g,31303
|
|
18
|
+
java_codebase_rag/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
19
|
+
java_codebase_rag/_fdlimit.py,sha256=WroFdfSNbcriKok6q8znTf74dqlznxea_1Fd5bHl_3o,1930
|
|
20
|
+
java_codebase_rag/cli.py,sha256=a5IFLWAsh77mfLv1Z9OdpvLaYvj4i0KR3_kLtL-ans8,34156
|
|
21
|
+
java_codebase_rag/cli_format.py,sha256=arU7P9W6Fvm7X_wzR1wJ8EfyxK1rDP_ESEhdA0ub4Mo,2579
|
|
22
|
+
java_codebase_rag/cli_progress.py,sha256=9jCqEagYOXs32SYVA31_sOCrONvYy7cl1CrdBD2Pg44,3168
|
|
23
|
+
java_codebase_rag/config.py,sha256=Gn3LgxkTOtAvsL-3U2Xn7atOIhyOT2aGmY8SBBTLoQg,16975
|
|
24
|
+
java_codebase_rag/installer.py,sha256=DlBuVVWbHXgcjaQkuXUeT9fNdmk7XZefVT3zzw47k18,45965
|
|
25
|
+
java_codebase_rag/lance_optimize.py,sha256=MzACYlgwxmkJCK64qQLyIAdizSq5BARqaMYSZONlc1I,6069
|
|
26
|
+
java_codebase_rag/pipeline.py,sha256=UcgluFAW9Ghnas8u40x45bVic0mQv6rjzcliDKsnYJI,11936
|
|
27
|
+
java_codebase_rag/install_data/agents/explorer-rag-enhanced.md,sha256=APl9d-No12qZNZLjU7mwNRwxHIgnT3ZtQZiD4clWlyU,14413
|
|
28
|
+
java_codebase_rag/install_data/skills/explore-codebase/SKILL.md,sha256=pIM-Xdwq_fXkhhBJCdb-fA2nes5c_mMPcdUXb7Adyxo,12040
|
|
29
|
+
java_codebase_rag-0.6.1.dist-info/licenses/LICENSE,sha256=gxvtiHtuviR_q8ZAjWw-QTcF3DyPzg6ZY-lQrr8OPpw,1068
|
|
30
|
+
user_rag/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
|
31
|
+
user_rag/cli.py,sha256=TVcyfzwvmdYXJW6KrEYTKMHm7z2JSXMmz2uB-8kkjxY,5604
|
|
32
|
+
java_codebase_rag-0.6.1.dist-info/METADATA,sha256=aPiLbGD8xE-P3B_RI9gx7VuqrTd-VUriZ--ZPYNK02I,16934
|
|
33
|
+
java_codebase_rag-0.6.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
34
|
+
java_codebase_rag-0.6.1.dist-info/entry_points.txt,sha256=mVVQJa0n73OWfhHXYCDoPRrWin_LJhH2Rn0CkJ2iax4,101
|
|
35
|
+
java_codebase_rag-0.6.1.dist-info/top_level.txt,sha256=syQgi8XPBwY2ws_NZ1uRCxTf_s41NpshwEHNdcdnk3A,245
|
|
36
|
+
java_codebase_rag-0.6.1.dist-info/RECORD,,
|
java_index_flow_lancedb.py
CHANGED
|
@@ -4,7 +4,7 @@ CocoIndex 1.0 app: index Java, Flyway SQL, and YAML into LanceDB.
|
|
|
4
4
|
LanceDB requires a single primary key per table; each chunk gets a UUID `id`.
|
|
5
5
|
|
|
6
6
|
Environment:
|
|
7
|
-
JAVA_CODEBASE_RAG_INDEX_DIR — Lance tables +
|
|
7
|
+
JAVA_CODEBASE_RAG_INDEX_DIR — Lance tables + LadybugDB + cocoindex state (default: ./.java-codebase-rag)
|
|
8
8
|
JAVA_CODEBASE_RAG_SOURCE_ROOT — Java repo root for indexing (optional; else cocoindex cwd)
|
|
9
9
|
SBERT_MODEL / SBERT_DEVICE — embedding (optional; YAML also supported via java-codebase-rag CLI)
|
|
10
10
|
|
|
@@ -36,6 +36,7 @@ from cocoindex.ops.text import RecursiveSplitter, detect_code_language
|
|
|
36
36
|
from cocoindex.resources.file import PatternFilePathMatcher
|
|
37
37
|
|
|
38
38
|
from java_codebase_rag.config import resolved_sbert_model_for_process_env
|
|
39
|
+
from java_codebase_rag.lance_optimize import LANCE_TABLE_NAMES
|
|
39
40
|
from java_index_v1_common import (
|
|
40
41
|
JAVA_CHUNK,
|
|
41
42
|
SBERT_MODEL,
|
|
@@ -68,6 +69,20 @@ else:
|
|
|
68
69
|
|
|
69
70
|
splitter = RecursiveSplitter()
|
|
70
71
|
|
|
72
|
+
# cocoindex 1.0.7 schedules ``table.optimize()`` (a LanceDB Rewrite/compaction
|
|
73
|
+
# transaction) as a *background* asyncio task after every
|
|
74
|
+
# ``num_transactions_before_optimize`` mutation batches (default 50). That
|
|
75
|
+
# background Rewrite races the concurrent ``table.delete()`` (Delete)
|
|
76
|
+
# transactions emitted by later batches, and LanceDB does not allow a Rewrite
|
|
77
|
+
# to commit concurrently with a Delete (upstream lancedb#1504), which floods
|
|
78
|
+
# stderr with "Retryable commit conflict ... preempted by concurrent
|
|
79
|
+
# transaction Delete". Setting this effectively to infinity disables the
|
|
80
|
+
# in-flight background optimize; the serialized post-flow optimize in
|
|
81
|
+
# ``lance_optimize.optimize_lance_tables`` then compacts the table with no
|
|
82
|
+
# concurrent writers. ``optimize()`` is pure maintenance (compact/prune/index);
|
|
83
|
+
# upsert/delete correctness via merge_insert does not depend on it.
|
|
84
|
+
_NUM_TXN_BEFORE_OPTIMIZE = 10**12
|
|
85
|
+
|
|
71
86
|
|
|
72
87
|
@dataclass
|
|
73
88
|
class JavaLanceChunk:
|
|
@@ -317,8 +332,9 @@ async def app_main() -> None:
|
|
|
317
332
|
)
|
|
318
333
|
java_table = await lancedb.mount_table_target(
|
|
319
334
|
LANCE_DB,
|
|
320
|
-
|
|
335
|
+
LANCE_TABLE_NAMES[0],
|
|
321
336
|
java_schema,
|
|
337
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
322
338
|
)
|
|
323
339
|
|
|
324
340
|
sql_schema = await lancedb.TableSchema.from_class(
|
|
@@ -327,8 +343,9 @@ async def app_main() -> None:
|
|
|
327
343
|
)
|
|
328
344
|
sql_table = await lancedb.mount_table_target(
|
|
329
345
|
LANCE_DB,
|
|
330
|
-
|
|
346
|
+
LANCE_TABLE_NAMES[1],
|
|
331
347
|
sql_schema,
|
|
348
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
332
349
|
)
|
|
333
350
|
|
|
334
351
|
yaml_schema = await lancedb.TableSchema.from_class(
|
|
@@ -337,8 +354,9 @@ async def app_main() -> None:
|
|
|
337
354
|
)
|
|
338
355
|
yaml_table = await lancedb.mount_table_target(
|
|
339
356
|
LANCE_DB,
|
|
340
|
-
|
|
357
|
+
LANCE_TABLE_NAMES[2],
|
|
341
358
|
yaml_schema,
|
|
359
|
+
num_transactions_before_optimize=_NUM_TXN_BEFORE_OPTIMIZE,
|
|
342
360
|
)
|
|
343
361
|
|
|
344
362
|
project_root = coco.use_context(PROJECT_ROOT)
|
java_ontology.py
CHANGED
|
@@ -15,7 +15,10 @@ from ast_java import (
|
|
|
15
15
|
_TYPE_ANN_TO_CAPABILITY,
|
|
16
16
|
)
|
|
17
17
|
|
|
18
|
-
# Roles: Spring stereotype values plus DTO
|
|
18
|
+
# Roles assignable by indexing: Spring stereotype values plus DTO. ``OTHER`` is the
|
|
19
|
+
# built-in inference fallback (ast_java.infer_role when nothing matches) and is
|
|
20
|
+
# deliberately excluded here — it is a read-side value (the mcp_v2 ``Role`` enum
|
|
21
|
+
# includes it) but not a role a user may set via @CodebaseRole / role_overrides.
|
|
19
22
|
VALID_ROLES: frozenset[str] = frozenset((*ROLE_ANNOTATIONS.values(), "DTO"))
|
|
20
23
|
|
|
21
24
|
VALID_CAPABILITIES: frozenset[str] = frozenset(
|
|
@@ -141,7 +144,7 @@ Cardinality = Literal["many_to_many", "many_to_one", "one_to_many", "one_to_one"
|
|
|
141
144
|
@dataclass(frozen=True)
|
|
142
145
|
class EdgeAttr:
|
|
143
146
|
name: str
|
|
144
|
-
|
|
147
|
+
graph_type: str
|
|
145
148
|
purpose: str
|
|
146
149
|
|
|
147
150
|
|