java-codebase-rag 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ ast_java.py,sha256=QIldCwZVFlJUu3BwBjjoHYAhu5Eas4dxMaAb3MSBWDg,98174
2
+ brownfield_events.py,sha256=yxXkKDgMb3VPtaiakGzncHM_EGnda8xIue6w90yYp8s,2055
3
+ build_ast_graph.py,sha256=jF3EzHxZUWOYfghI0RQHZskKmYOzBzi8eFebqlq4kNg,118019
4
+ chunk_heuristics.py,sha256=aQk2NOKxzUdqoUAJUO3G3LE0MN_bYZWNLQ0tkmj5uts,1813
5
+ graph_enrich.py,sha256=2-njD2alm7FFpLn217ZG3f3ln-zqbdtGwTghOpd44oo,62021
6
+ index_common.py,sha256=HT6FKHFJ084eFvd3fR1j8z8gf4eWoPHVW8GXLpw464I,285
7
+ java_index_flow_lancedb.py,sha256=LMmfMSdE2d-ujxuJ2-hss7BhkrUMxHNyZuqsiGITuAI,12057
8
+ java_index_v1_common.py,sha256=nF1KrSqboF_RRvWerG9knRRFmWwsrG_CvhgnsoZ8KqA,1154
9
+ java_ontology.py,sha256=nM-oY8_91rmUudv9hAss1AMus9BFY9s5tTpAWjlCz00,16424
10
+ kuzu_queries.py,sha256=9bQzrU311AOw_BcUp_KSGiZgPVSaLSU7y63XfcT_vqI,90137
11
+ mcp_hints.py,sha256=myzwhmXCtHzUcEjM8PbrCpxyEo1WfMEXJ5i6O-Syb7k,31686
12
+ mcp_v2.py,sha256=939iHxEAxCRrvXIKJLOrpF_KMm0JLBkojFpMKeOnQIQ,76837
13
+ path_filtering.py,sha256=-oX16SYLWYwX9pcV1fu3vbVTIhY1GzFflT7J1E2tqPY,17122
14
+ pr_analysis.py,sha256=Zaq90xYgMgrReV3vCGcFhOkK61gIRMAAIgs7ev-rJG4,18410
15
+ search_lancedb.py,sha256=-XgtpbJ_3zDLiZ_vGKXjaLpl7RlvgyzUb7oAGoWkXO0,36754
16
+ server.py,sha256=lAsO3c8DMPsrB8tY7q9F9jbieqeDBI4vLnYcIJScHzk,25705
17
+ java_codebase_rag/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
18
+ java_codebase_rag/cli.py,sha256=hCjlmAXkS80noTX_bxm6BMiLIYEz_P5xfrw9C7LvkBE,27678
19
+ java_codebase_rag/cli_progress.py,sha256=Vtio3RqJ3LkRoNpxrv8iGbEiX4klkTlJX-mR4l6oeBM,1586
20
+ java_codebase_rag/config.py,sha256=h07zJrV8QoLv9hIhJZ2JgUI0Rh6uPBZUiPkGDEmTg_w,11687
21
+ java_codebase_rag/pipeline.py,sha256=QyKNCrBsjdFU71N9Xygti-DdtMQQsrZ8aySisux46lI,5311
22
+ java_codebase_rag-0.1.0.dist-info/licenses/LICENSE,sha256=gxvtiHtuviR_q8ZAjWw-QTcF3DyPzg6ZY-lQrr8OPpw,1068
23
+ java_codebase_rag-0.1.0.dist-info/METADATA,sha256=cR732vM6PVtmldy-CwOuailN5-fwMMJQ4Bi3mE5IcKo,51720
24
+ java_codebase_rag-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
25
+ java_codebase_rag-0.1.0.dist-info/entry_points.txt,sha256=mVVQJa0n73OWfhHXYCDoPRrWin_LJhH2Rn0CkJ2iax4,101
26
+ java_codebase_rag-0.1.0.dist-info/top_level.txt,sha256=5aIYoMkvJvvfXvf4iHn2OeSIM7PZXP-0j94eNESnwMw,242
27
+ java_codebase_rag-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ java-codebase-rag = java_codebase_rag.cli:main
3
+ java-codebase-rag-mcp = server:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 HumanBean17
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,17 @@
1
+ ast_java
2
+ brownfield_events
3
+ build_ast_graph
4
+ chunk_heuristics
5
+ graph_enrich
6
+ index_common
7
+ java_codebase_rag
8
+ java_index_flow_lancedb
9
+ java_index_v1_common
10
+ java_ontology
11
+ kuzu_queries
12
+ mcp_hints
13
+ mcp_v2
14
+ path_filtering
15
+ pr_analysis
16
+ search_lancedb
17
+ server
@@ -0,0 +1,398 @@
1
+ """
2
+ CocoIndex 1.0 app: index Java, Flyway SQL, and YAML into LanceDB.
3
+
4
+ LanceDB requires a single primary key per table; each chunk gets a UUID `id`.
5
+
6
+ Environment:
7
+ JAVA_CODEBASE_RAG_INDEX_DIR — Lance tables + Kuzu + cocoindex state (default: ./.java-codebase-rag)
8
+ JAVA_CODEBASE_RAG_SOURCE_ROOT — Java repo root for indexing (optional; else cocoindex cwd)
9
+ SBERT_MODEL / SBERT_DEVICE — embedding (optional; YAML also supported via java-codebase-rag CLI)
10
+
11
+ Dependencies:
12
+ pip install "cocoindex[lancedb]" sentence-transformers
13
+
14
+ Usage:
15
+ cocoindex update java_index_flow_lancedb.py:JavaCodeIndexLance --full-reprocess
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import inspect
20
+ import os
21
+ import uuid
22
+ from collections.abc import AsyncIterator
23
+ from contextlib import asynccontextmanager
24
+ from dataclasses import dataclass
25
+ from pathlib import Path
26
+ from typing import Annotated, Any
27
+
28
+ import cocoindex as coco
29
+ import numpy as np
30
+ import numpy.typing as npt
31
+ import pyarrow as pa
32
+ from cocoindex.connectors import lancedb, localfs
33
+ from cocoindex.connectors.lancedb import LanceType
34
+ from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
35
+ from cocoindex.ops.text import RecursiveSplitter, detect_code_language
36
+ from cocoindex.resources.file import PatternFilePathMatcher
37
+
38
+ from java_codebase_rag.config import resolved_sbert_model_for_process_env
39
+ from java_index_v1_common import (
40
+ JAVA_CHUNK,
41
+ SBERT_MODEL,
42
+ SQL_CHUNK,
43
+ YAML_CHUNK,
44
+ chunk_key_range,
45
+ position_to_json,
46
+ )
47
+ from path_filtering import LayeredIgnore
48
+ from ast_java import ONTOLOGY_VERSION, parse_java
49
+ from graph_enrich import enrich_chunk
50
+
51
# ``coco.ContextKey`` changed its constructor across cocoindex releases: older
# builds (e.g. 1.0.0a43) take ``tracked=False``, while newer ones renamed the
# flag to ``detect_change`` (default False) and reject ``tracked``. Pass
# ``tracked=False`` only when the installed signature still has it and does not
# yet know ``detect_change``; otherwise rely on the constructor defaults.
_ck_params = inspect.signature(coco.ContextKey.__init__).parameters
_ck_kwargs: dict[str, Any] = (
    {"tracked": False}
    if "tracked" in _ck_params and "detect_change" not in _ck_params
    else {}
)

# Context keys resolved at runtime through ``coco.use_context`` / the lifespan.
PROJECT_ROOT = coco.ContextKey[Path]("java_lance_project_root", **_ck_kwargs)
LANCE_DB = coco.ContextKey("java_lance_async_conn", **_ck_kwargs)
EMBEDDER = coco.ContextKey[SentenceTransformerEmbedder](
    "java_lance_embedder", **_ck_kwargs
)

# Single splitter instance shared by all per-file processors in this module.
splitter = RecursiveSplitter()
70
+
71
+
72
@dataclass
class JavaLanceChunk:
    """Row type for the ``javacodeindex_java_code`` LanceDB table.

    One instance is declared per chunk of a Java source file. The table schema
    is derived from this class via ``lancedb.TableSchema.from_class`` with
    ``id`` as the primary key.
    """

    # Primary key; a freshly generated UUID string per declared chunk.
    id: str
    # POSIX-style path of the source file as reported by the directory walk.
    filename: str
    # Language detected from the file name, or "text" when detection fails.
    language: str
    # Chunk text that was embedded.
    text: str
    # Byte offsets of the chunk start/end as returned by ``chunk_key_range``.
    range_start: int
    range_end: int
    # Start/end positions serialised by ``position_to_json``.
    start: dict[str, Any]
    end: dict[str, Any]
    # Embedding vector; the EMBEDDER context key is attached to the annotation.
    embedding: Annotated[npt.NDArray[np.float32], EMBEDDER]
    # The fields below are copied from the ``enrich_chunk`` result for the chunk.
    package: str
    module: str
    microservice: str
    primary_type_fqn: str
    primary_type_kind: str
    role: str
    # Native PyArrow lists: without the LanceType override CocoIndex would JSON-encode
    # `list[str]` into a STRING column, which caller code then iterates character-by-character.
    capabilities: Annotated[list[str], LanceType(pa.list_(pa.string()))]
    annotations_on_type: Annotated[list[str], LanceType(pa.list_(pa.string()))]
    symbols: Annotated[list[str], LanceType(pa.list_(pa.string()))]
    # Value of ``ast_java.ONTOLOGY_VERSION`` at indexing time.
    ontology_version: int
95
+
96
+
97
@dataclass
class SqlLanceChunk:
    """Row type for the ``sqlschemaindex_sql_schema`` LanceDB table.

    One instance is declared per chunk of a SQL migration file; the schema is
    derived via ``lancedb.TableSchema.from_class`` with ``id`` as primary key.
    """

    # Primary key; a freshly generated UUID string per declared chunk.
    id: str
    # POSIX-style path of the source file as reported by the directory walk.
    filename: str
    # Chunk text that was embedded.
    text: str
    # Byte offsets of the chunk start/end as returned by ``chunk_key_range``.
    range_start: int
    range_end: int
    # Start/end positions serialised by ``position_to_json``.
    start: dict[str, Any]
    end: dict[str, Any]
    # Embedding vector; the EMBEDDER context key is attached to the annotation.
    embedding: Annotated[npt.NDArray[np.float32], EMBEDDER]
107
+
108
+
109
@dataclass
class YamlLanceChunk:
    """Row type for the ``yamlconfigindex_yaml_config`` LanceDB table.

    One instance is declared per chunk of a YAML configuration file; the schema
    is derived via ``lancedb.TableSchema.from_class`` with ``id`` as primary key.
    """

    # Primary key; a freshly generated UUID string per declared chunk.
    id: str
    # POSIX-style path of the source file as reported by the directory walk.
    filename: str
    # Chunk text that was embedded.
    text: str
    # Byte offsets of the chunk start/end as returned by ``chunk_key_range``.
    range_start: int
    range_end: int
    # Start/end positions serialised by ``position_to_json``.
    start: dict[str, Any]
    end: dict[str, Any]
    # Embedding vector; the EMBEDDER context key is attached to the annotation.
    embedding: Annotated[npt.NDArray[np.float32], EMBEDDER]
119
+
120
+
121
@coco.lifespan
async def coco_lifespan(builder: coco.EnvironmentBuilder) -> AsyncIterator[None]:
    """Prepare the CocoIndex environment for this app.

    Resolves the local index directory and the source root from the
    environment, points the CocoIndex state database at
    ``<index_dir>/cocoindex.db``, and provides the project root, the
    sentence-transformers embedder and an async LanceDB connection through the
    module's context keys.

    ``JAVA_CODEBASE_RAG_INDEX_DIR`` is ignored when it is empty or starts with
    ``s3://``, ``gs://`` or ``az://``; in that case ``./.java-codebase-rag``
    under the current working directory is used instead.
    """
    configured_index = os.environ.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
    looks_remote = configured_index.startswith(("s3://", "gs://", "az://"))
    if not configured_index or looks_remote:
        index_dir = (Path(".").resolve() / ".java-codebase-rag").resolve()
    else:
        index_dir = Path(configured_index).expanduser().resolve()
    index_dir.mkdir(parents=True, exist_ok=True)
    builder.settings.db_path = index_dir / "cocoindex.db"

    configured_root = os.environ.get("JAVA_CODEBASE_RAG_SOURCE_ROOT", "").strip()
    source_root = (
        Path(configured_root).expanduser().resolve()
        if configured_root
        else Path(".").resolve()
    )
    builder.provide(PROJECT_ROOT, source_root)

    # NOTE(review): trust_remote_code=True presumably allows the model
    # repository to run its own Python code while loading — confirm that
    # SBERT_MODEL only ever points at trusted models.
    builder.provide(
        EMBEDDER,
        SentenceTransformerEmbedder(
            resolved_sbert_model_for_process_env(SBERT_MODEL),
            device=os.environ.get("SBERT_DEVICE") or None,
            trust_remote_code=True,
        ),
    )

    lance_uri = str(index_dir)

    @asynccontextmanager
    async def _open_lance() -> AsyncIterator[Any]:
        # Keep the connection open for the lifetime of the environment and
        # close it when the context manager exits, even on error.
        connection = await lancedb.connect_async(lance_uri)
        try:
            yield connection
        finally:
            connection.close()

    await builder.provide_async_with(LANCE_DB, _open_lance())
    yield
157
+
158
+
159
@coco.fn(memo=True)
async def process_java_file(
    file: localfs.File,
    table: lancedb.TableTarget[JavaLanceChunk],
) -> None:
    """Chunk, enrich and embed one Java file and declare its rows on ``table``.

    The file is skipped (nothing is declared) when the layered ignore rules
    match it, when it cannot be decoded as text, or when it contains only
    whitespace.

    Args:
        file: File yielded by the directory walk.
        table: LanceDB table target that receives one row per chunk.
    """
    embedder = coco.use_context(EMBEDDER)
    project_root = coco.use_context(PROJECT_ROOT)
    relative_path = file.file_path.path

    ignore_verdict = LayeredIgnore(project_root).is_ignored(
        (project_root / relative_path).resolve()
    )
    if ignore_verdict[0]:
        return

    try:
        source_text = await file.read_text()
    except UnicodeDecodeError:
        return
    if not source_text.strip():
        return

    language = detect_code_language(filename=relative_path.name) or "text"
    chunk_size, min_size, overlap = JAVA_CHUNK
    pieces = splitter.split(
        source_text,
        chunk_size,
        min_chunk_size=min_size,
        chunk_overlap=overlap,
        language=language,
    )
    posix_name = relative_path.as_posix()
    # The AST is parsed once per file and reused for every chunk below.
    parsed = parse_java(source_text.encode("utf-8", errors="replace"))

    for piece in pieces:
        first_byte, last_byte = chunk_key_range(piece)
        meta = enrich_chunk(
            parsed,
            chunk_start_byte=piece.start.byte_offset,
            chunk_end_byte=piece.end.byte_offset,
            file_path=posix_name,
            project_root=project_root,
        )
        vector = await embedder.embed(piece.text)
        row = JavaLanceChunk(
            id=str(uuid.uuid4()),
            filename=posix_name,
            language=language,
            text=piece.text,
            range_start=first_byte,
            range_end=last_byte,
            start=position_to_json(piece.start),
            end=position_to_json(piece.end),
            embedding=vector,
            package=meta.package,
            module=meta.module,
            microservice=meta.microservice,
            primary_type_fqn=meta.primary_type_fqn,
            primary_type_kind=meta.primary_type_kind,
            role=meta.role,
            capabilities=list(meta.capabilities),
            annotations_on_type=meta.annotations_on_type,
            symbols=meta.symbols,
            ontology_version=ONTOLOGY_VERSION,
        )
        table.declare_row(row=row)
221
+
222
+
223
@coco.fn(memo=True)
async def process_sql_file(
    file: localfs.File,
    table: lancedb.TableTarget[SqlLanceChunk],
) -> None:
    """Chunk and embed one SQL file and declare its rows on ``table``.

    The file is skipped (nothing is declared) when the layered ignore rules
    match it, when it cannot be decoded as text, or when it contains only
    whitespace.

    Args:
        file: File yielded by the directory walk.
        table: LanceDB table target that receives one row per chunk.
    """
    embedder = coco.use_context(EMBEDDER)
    project_root = coco.use_context(PROJECT_ROOT)
    relative_path = file.file_path.path

    ignore_verdict = LayeredIgnore(project_root).is_ignored(
        (project_root / relative_path).resolve()
    )
    if ignore_verdict[0]:
        return

    try:
        sql_text = await file.read_text()
    except UnicodeDecodeError:
        return
    if not sql_text.strip():
        return

    language = "sql"
    chunk_size, min_size, overlap = SQL_CHUNK
    pieces = splitter.split(
        sql_text,
        chunk_size,
        min_chunk_size=min_size,
        chunk_overlap=overlap,
        language=language,
    )
    posix_name = relative_path.as_posix()

    for piece in pieces:
        first_byte, last_byte = chunk_key_range(piece)
        vector = await embedder.embed(piece.text)
        row = SqlLanceChunk(
            id=str(uuid.uuid4()),
            filename=posix_name,
            text=piece.text,
            range_start=first_byte,
            range_end=last_byte,
            start=position_to_json(piece.start),
            end=position_to_json(piece.end),
            embedding=vector,
        )
        table.declare_row(row=row)
265
+
266
+
267
@coco.fn(memo=True)
async def process_yaml_file(
    file: localfs.File,
    table: lancedb.TableTarget[YamlLanceChunk],
) -> None:
    """Chunk and embed one YAML file and declare its rows on ``table``.

    The file is skipped (nothing is declared) when the layered ignore rules
    match it, when it cannot be decoded as text, or when it contains only
    whitespace. Files whose suffix is neither ``.yml`` nor ``.yaml`` are split
    as plain text.

    Args:
        file: File yielded by the directory walk.
        table: LanceDB table target that receives one row per chunk.
    """
    embedder = coco.use_context(EMBEDDER)
    project_root = coco.use_context(PROJECT_ROOT)
    relative_path = file.file_path.path

    ignore_verdict = LayeredIgnore(project_root).is_ignored(
        (project_root / relative_path).resolve()
    )
    if ignore_verdict[0]:
        return

    try:
        yaml_text = await file.read_text()
    except UnicodeDecodeError:
        return
    if not yaml_text.strip():
        return

    suffix = relative_path.suffix.lower()
    if suffix in (".yml", ".yaml"):
        language = "yaml"
    else:
        language = "text"

    chunk_size, min_size, overlap = YAML_CHUNK
    pieces = splitter.split(
        yaml_text,
        chunk_size,
        min_chunk_size=min_size,
        chunk_overlap=overlap,
        language=language,
    )
    posix_name = relative_path.as_posix()

    for piece in pieces:
        first_byte, last_byte = chunk_key_range(piece)
        vector = await embedder.embed(piece.text)
        row = YamlLanceChunk(
            id=str(uuid.uuid4()),
            filename=posix_name,
            text=piece.text,
            range_start=first_byte,
            range_end=last_byte,
            start=position_to_json(piece.start),
            end=position_to_json(piece.end),
            embedding=vector,
        )
        table.declare_row(row=row)
310
+
311
+
312
@coco.fn
async def app_main() -> None:
    """Mount the three LanceDB tables and fan out per-file processing.

    Tables are mounted in the order Java, SQL, YAML, each with ``id`` as the
    primary key. The source tree under PROJECT_ROOT is then walked three times
    (Java sources, Flyway-style SQL migrations, ``application*.yml|yaml``),
    always applying the exclude patterns from ``LayeredIgnore``, and every
    matching file is handed to its processor.
    """

    async def _mount(row_type: Any, table_name: str) -> Any:
        # Schema derivation and table mounting always come as a pair.
        schema = await lancedb.TableSchema.from_class(
            row_type,
            primary_key=["id"],
        )
        return await lancedb.mount_table_target(LANCE_DB, table_name, schema)

    java_table = await _mount(JavaLanceChunk, "javacodeindex_java_code")
    sql_table = await _mount(SqlLanceChunk, "sqlschemaindex_sql_schema")
    yaml_table = await _mount(YamlLanceChunk, "yamlconfigindex_yaml_config")

    project_root = coco.use_context(PROJECT_ROOT)
    walk_excludes = LayeredIgnore(project_root).cocoindex_excluded_patterns()

    def _walk(included: list[str]) -> Any:
        # All walks share the root, recursion flag and exclude patterns.
        return localfs.walk_dir(
            PROJECT_ROOT,
            recursive=True,
            path_matcher=PatternFilePathMatcher(
                included_patterns=included,
                excluded_patterns=walk_excludes,
            ),
        )

    java_files = _walk(["**/*.java"])
    sql_files = _walk(["**/src/main/resources/db/migration/*.sql"])
    yaml_files = _walk(
        [
            "**/src/main/resources/application*.yml",
            "**/src/main/resources/application*.yaml",
        ]
    )

    pipelines = (
        ("java_files", process_java_file, java_files, java_table),
        ("sql_files", process_sql_file, sql_files, sql_table),
        ("yaml_files", process_yaml_file, yaml_files, yaml_table),
    )
    for label, processor, files, target in pipelines:
        await coco.mount_each(
            coco.component_subpath(coco.Symbol(label)),
            processor,
            files.items(),
            target,
        )
393
+
394
+
395
# CocoIndex application object: registered under the name "JavaCodeIndexLance"
# with ``app_main`` as its entry function.
app = coco.App(coco.AppConfig(name="JavaCodeIndexLance"), app_main)
@@ -0,0 +1,33 @@
1
+ """Shared helpers for Java/SQL/YAML CocoIndex 1.0 apps (no ContextKeys here)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from typing import Any
7
+
8
+ from cocoindex.resources.chunk import Chunk, TextPosition
9
+
10
# Hub id or absolute path to a local model dir (config.json + weights). Override with env SBERT_MODEL.
_DEFAULT_HUB = "sentence-transformers/all-MiniLM-L6-v2"
# An unset, empty or whitespace-only SBERT_MODEL falls back to the default hub
# id; previously an exported-but-empty variable produced an empty model name.
# ``~`` and ``$VAR`` references in the value are expanded.
SBERT_MODEL = os.path.expandvars(
    os.path.expanduser(os.environ.get("SBERT_MODEL", "").strip() or _DEFAULT_HUB)
)

# Larger window + overlap so chunks carry more behavioural context (method bodies
# rarely split mid-statement, fewer "orphan" import-only hits at chunk edges).
# Requires re-index to apply.
# Each tuple is (chunk_size, min_chunk_size, chunk_overlap), in the order the
# indexing flow unpacks it before calling the splitter.
JAVA_CHUNK = (1500, 350, 220)
SQL_CHUNK = (800, 100, 80)
YAML_CHUNK = (600, 100, 60)
20
+
21
+
22
def position_to_json(pos: TextPosition) -> dict[str, Any]:
    """Return the four offsets of *pos* as a plain, JSON-friendly dict.

    The keys are ``byte_offset``, ``char_offset``, ``line`` and ``column``, in
    that order, each mapped to the attribute of the same name on *pos*.
    """
    exported_fields = ("byte_offset", "char_offset", "line", "column")
    return {name: getattr(pos, name) for name in exported_fields}
29
+
30
+
31
def chunk_key_range(chunk: Chunk) -> tuple[int, int]:
    """Return ``(start, end)`` byte offsets of *chunk* for use as a stable key range.

    The start offset is inclusive and the end offset exclusive.
    """
    first_byte = chunk.start.byte_offset
    end_byte = chunk.end.byte_offset
    return first_byte, end_byte