java-codebase-rag 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ast_java.py +24 -7
- build_ast_graph.py +153 -94
- graph_enrich.py +3 -3
- java_codebase_rag/_fdlimit.py +48 -0
- java_codebase_rag/cli.py +31 -28
- java_codebase_rag/config.py +40 -10
- java_codebase_rag/installer.py +99 -10
- java_codebase_rag/lance_optimize.py +148 -0
- java_codebase_rag/pipeline.py +63 -9
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/METADATA +6 -5
- java_codebase_rag-0.6.1.dist-info/RECORD +36 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/top_level.txt +1 -1
- java_index_flow_lancedb.py +22 -4
- java_ontology.py +5 -2
- ladybug_queries.py +1995 -0
- mcp_v2.py +51 -26
- pr_analysis.py +1 -1
- search_lancedb.py +8 -8
- server.py +116 -68
- user_rag/__init__.py +1 -0
- user_rag/cli.py +175 -0
- java_codebase_rag-0.5.3.dist-info/RECORD +0 -31
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/WHEEL +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/entry_points.txt +0 -0
- {java_codebase_rag-0.5.3.dist-info → java_codebase_rag-0.6.1.dist-info}/licenses/LICENSE +0 -0
ast_java.py
CHANGED
|
@@ -13,6 +13,7 @@ Python with no tree-sitter dependency.
|
|
|
13
13
|
from __future__ import annotations
|
|
14
14
|
|
|
15
15
|
import posixpath
|
|
16
|
+
import sys
|
|
16
17
|
from dataclasses import dataclass, field
|
|
17
18
|
from functools import lru_cache
|
|
18
19
|
from typing import Iterable
|
|
@@ -325,7 +326,7 @@ class RouteDecl:
|
|
|
325
326
|
filename: str
|
|
326
327
|
start_line: int
|
|
327
328
|
end_line: int
|
|
328
|
-
# brownfield / B2a composition (graph_enrich.resolve_routes_for_method); not a
|
|
329
|
+
# brownfield / B2a composition (graph_enrich.resolve_routes_for_method); not a graph column.
|
|
329
330
|
route_source_layer: str = "builtin"
|
|
330
331
|
|
|
331
332
|
|
|
@@ -1642,9 +1643,17 @@ def _parse_codebase_http_client_annotation(
|
|
|
1642
1643
|
pairs, _ = _annotation_kv_nodes(ann, src)
|
|
1643
1644
|
client_kind = ""
|
|
1644
1645
|
if "clientKind" in pairs:
|
|
1645
|
-
val,
|
|
1646
|
-
if val and
|
|
1647
|
-
|
|
1646
|
+
val, vkind = _annotation_value(pairs["clientKind"], src)
|
|
1647
|
+
if val and vkind == "enum":
|
|
1648
|
+
kind_val = str(val)
|
|
1649
|
+
from java_ontology import VALID_CLIENT_KINDS # deferred: java_ontology imports ast_java
|
|
1650
|
+
if kind_val in VALID_CLIENT_KINDS:
|
|
1651
|
+
client_kind = kind_val
|
|
1652
|
+
else:
|
|
1653
|
+
print(
|
|
1654
|
+
f"[lancedb-mcp] CodebaseHttpClient: invalid clientKind {kind_val!r} — ignored",
|
|
1655
|
+
file=sys.stderr,
|
|
1656
|
+
)
|
|
1648
1657
|
target_service = ""
|
|
1649
1658
|
if "targetService" in pairs:
|
|
1650
1659
|
atoms = _string_value_atoms(pairs["targetService"], src, ctx)
|
|
@@ -1714,9 +1723,17 @@ def _parse_codebase_producer_annotation(
|
|
|
1714
1723
|
client_kind = "kafka_send"
|
|
1715
1724
|
kind_node = pairs.get("producerKind") or pairs.get("clientKind")
|
|
1716
1725
|
if kind_node is not None:
|
|
1717
|
-
val,
|
|
1718
|
-
if val and
|
|
1719
|
-
|
|
1726
|
+
val, vkind = _annotation_value(kind_node, src)
|
|
1727
|
+
if val and vkind == "enum":
|
|
1728
|
+
kind_val = str(val)
|
|
1729
|
+
from java_ontology import VALID_PRODUCER_KINDS # deferred: java_ontology imports ast_java
|
|
1730
|
+
if kind_val in VALID_PRODUCER_KINDS:
|
|
1731
|
+
client_kind = kind_val
|
|
1732
|
+
else:
|
|
1733
|
+
print(
|
|
1734
|
+
f"[lancedb-mcp] CodebaseProducer: invalid producerKind {kind_val!r} — ignored",
|
|
1735
|
+
file=sys.stderr,
|
|
1736
|
+
)
|
|
1720
1737
|
topic = ""
|
|
1721
1738
|
if "topic" in pairs:
|
|
1722
1739
|
atoms = _string_value_atoms(pairs["topic"], src, ctx)
|
build_ast_graph.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""Four-pass AST-derived Knowledge Base builder (
|
|
2
|
+
"""Four-pass AST-derived Knowledge Base builder (LadybugDB).
|
|
3
3
|
|
|
4
4
|
Walks a Java source tree with `tree_sitter_java`, writes a deterministic graph of:
|
|
5
5
|
Symbol nodes: package, file, class, interface, enum, record, annotation, method, constructor
|
|
@@ -13,14 +13,14 @@ Pass 3 resolves static call sites into confidence-scored CALLS edges and DECLARE
|
|
|
13
13
|
Pass 4 emits Route rows plus Symbol→Route EXPOSES edges from literal annotation metadata.
|
|
14
14
|
|
|
15
15
|
Usage:
|
|
16
|
-
build_ast_graph.py --source-root <repo> [--
|
|
16
|
+
build_ast_graph.py --source-root <repo> [--ladybug-path <path>] [--verbose]
|
|
17
17
|
|
|
18
|
-
Default
|
|
19
|
-
--
|
|
20
|
-
JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.
|
|
21
|
-
./.java-codebase-rag/code_graph.
|
|
18
|
+
Default LadybugDB database path resolution order:
|
|
19
|
+
--ladybug-path CLI arg (path passed to ladybug.Database(...))
|
|
20
|
+
JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.lbug (if set and local)
|
|
21
|
+
./.java-codebase-rag/code_graph.lbug under cwd
|
|
22
22
|
|
|
23
|
-
The
|
|
23
|
+
The LadybugDB DB is dropped and rebuilt on every run (Phase 1 is a full rebuild).
|
|
24
24
|
"""
|
|
25
25
|
from __future__ import annotations
|
|
26
26
|
|
|
@@ -37,7 +37,7 @@ from collections import defaultdict
|
|
|
37
37
|
from dataclasses import asdict, dataclass, field, replace
|
|
38
38
|
from pathlib import Path
|
|
39
39
|
|
|
40
|
-
import
|
|
40
|
+
import ladybug
|
|
41
41
|
|
|
42
42
|
from ast_java import (
|
|
43
43
|
ONTOLOGY_VERSION,
|
|
@@ -76,7 +76,7 @@ _PASS3_START = "[graph] pass 3 · call resolution (outgoing calls per site)"
|
|
|
76
76
|
_PASS4_START = "[graph] pass 4 · route and EXPOSES extraction"
|
|
77
77
|
_PASS5_START = "[graph] pass 5 · imperative HTTP_CALLS / ASYNC_CALLS edges"
|
|
78
78
|
_PASS6_START = "[graph] pass 6 · cross-service call-edge matching"
|
|
79
|
-
_WRITE_START = "[graph] writing ·
|
|
79
|
+
_WRITE_START = "[graph] writing · LadybugDB graph to disk"
|
|
80
80
|
|
|
81
81
|
|
|
82
82
|
def _verbose_stderr_line(content: str) -> None:
|
|
@@ -230,7 +230,7 @@ class RouteRow:
|
|
|
230
230
|
start_line: int
|
|
231
231
|
end_line: int
|
|
232
232
|
resolved: bool
|
|
233
|
-
# B2a brownfield composition (PR-A3); not persisted on
|
|
233
|
+
# B2a brownfield composition (PR-A3); not persisted on LadybugDB `Route` nodes.
|
|
234
234
|
source_layer: str = "builtin"
|
|
235
235
|
|
|
236
236
|
|
|
@@ -499,8 +499,8 @@ def _hash_file(abs_path: Path) -> str:
|
|
|
499
499
|
# ---------- incremental rebuild helpers ----------
|
|
500
500
|
|
|
501
501
|
|
|
502
|
-
def _load_existing_types(conn:
|
|
503
|
-
"""Load type entries from existing
|
|
502
|
+
def _load_existing_types(conn: ladybug.Connection, tables: GraphTables, exclude_files: set[str] | None = None) -> None:
|
|
503
|
+
"""Load type entries from existing LadybugDB graph into tables for cross-file resolution.
|
|
504
504
|
|
|
505
505
|
When exclude_files is provided, only load types from files NOT in the set.
|
|
506
506
|
"""
|
|
@@ -543,8 +543,8 @@ def _load_existing_types(conn: kuzu.Connection, tables: GraphTables, exclude_fil
|
|
|
543
543
|
tables.by_package.setdefault(package, []).append(entry)
|
|
544
544
|
|
|
545
545
|
|
|
546
|
-
def _load_existing_members(conn:
|
|
547
|
-
"""Load member entries from existing
|
|
546
|
+
def _load_existing_members(conn: ladybug.Connection, tables: GraphTables, exclude_files: set[str] | None = None) -> None:
|
|
547
|
+
"""Load member entries from existing LadybugDB graph into tables.members.
|
|
548
548
|
|
|
549
549
|
When exclude_files is provided, only load members from files NOT in the set.
|
|
550
550
|
"""
|
|
@@ -588,15 +588,25 @@ def _load_existing_members(conn: kuzu.Connection, tables: GraphTables, exclude_f
|
|
|
588
588
|
))
|
|
589
589
|
|
|
590
590
|
|
|
591
|
-
|
|
591
|
+
# Every Symbol->Symbol REL TABLE type in the graph schema. A Symbol node can
|
|
592
|
+
# only have an INCOMING edge of one of these types, so `_find_dependents` MUST
|
|
593
|
+
# walk all of them: that completeness is what makes the changed-node DETACH
|
|
594
|
+
# DELETE in `_delete_file_scope` Phase 3 safe (every real caller of a changed
|
|
595
|
+
# node is pulled into scope, so Phase 1 removes the edge before the node delete).
|
|
596
|
+
# If you add a new Symbol->Symbol edge type to the schema, add it here too —
|
|
597
|
+
# otherwise changed-node deletion would silently drop its surviving edges.
|
|
598
|
+
_SYMBOL_TO_SYMBOL_EDGE_TYPES = (
|
|
599
|
+
"EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES",
|
|
600
|
+
)
|
|
601
|
+
|
|
602
|
+
|
|
603
|
+
def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> set[str]:
|
|
592
604
|
"""Find files whose nodes have edges pointing into changed nodes. Returns set of filenames."""
|
|
593
605
|
dependent_files: set[str] = set()
|
|
594
606
|
|
|
595
|
-
# Query each Symbol-to-Symbol edge table for incoming edges
|
|
596
|
-
edge_types = ["EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES"]
|
|
597
607
|
params = {"changed_ids": list(changed_node_ids)}
|
|
598
608
|
|
|
599
|
-
for edge_type in
|
|
609
|
+
for edge_type in _SYMBOL_TO_SYMBOL_EDGE_TYPES:
|
|
600
610
|
query = f"""
|
|
601
611
|
MATCH (src:Symbol)-[e:{edge_type}]->(dst:Symbol)
|
|
602
612
|
WHERE dst.id IN $changed_ids
|
|
@@ -612,23 +622,52 @@ def _find_dependents(conn: kuzu.Connection, changed_node_ids: set[str]) -> set[s
|
|
|
612
622
|
return dependent_files
|
|
613
623
|
|
|
614
624
|
|
|
615
|
-
def _delete_file_scope(
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
in
|
|
625
|
+
def _delete_file_scope(
|
|
626
|
+
conn: ladybug.Connection,
|
|
627
|
+
changed_files: set[str],
|
|
628
|
+
dependent_files: set[str],
|
|
629
|
+
) -> None:
|
|
630
|
+
"""Delete nodes and edges for a scope split into changed vs dependent files.
|
|
631
|
+
|
|
632
|
+
``changed_files`` are files whose content actually changed (added/modified/
|
|
633
|
+
removed): their Symbol nodes are deleted (and re-created by ``_scoped_write``).
|
|
634
|
+
``dependent_files`` are files pulled in only to re-resolve their OUTGOING
|
|
635
|
+
edges against the changed nodes; their node definitions did not change, so
|
|
636
|
+
their nodes are deliberately PRESERVED (they re-MERGE in place on the same
|
|
637
|
+
deterministic ``symbol_id``). Skipping phantom nodes (filename="").
|
|
638
|
+
|
|
639
|
+
Why dependents are preserved (issue #305): the orchestrator computes
|
|
640
|
+
dependents from the *changed* nodes only, so a dependent file's node can
|
|
641
|
+
have an incoming CALLS edge from an out-of-scope caller. The ``source_file``
|
|
642
|
+
on every Symbol->Symbol edge is the CALLER's file (pinned by
|
|
643
|
+
``test_source_file_value_matches_symbol_filename``), so Phase 1 below only
|
|
644
|
+
deletes edges ORIGINATING in scope; incoming edges from out-of-scope callers
|
|
645
|
+
survive. If we then tried to DELETE the dependent node, LadybugDB rejects it
|
|
646
|
+
("Node ... has connected edges in table CALLS in the bwd direction, ...
|
|
647
|
+
Please delete the edges first or try DETACH DELETE") and the rebuild falls
|
|
648
|
+
back to a full rebuild. A naive fix (DETACH DELETE on dependents, or an
|
|
649
|
+
extra incoming-edge pass) would silence the crash but permanently drop those
|
|
650
|
+
out-of-scope edges, corrupting the graph. Preserving dependent nodes keeps
|
|
651
|
+
both the nodes and their incoming edges intact.
|
|
652
|
+
|
|
653
|
+
Phase 1 deletes ALL edge types across the whole scope (changed + dependent)
|
|
654
|
+
first to avoid LadybugDB "has connected edges" errors when edges from one
|
|
655
|
+
file point to nodes in another file within the same scope. Route/Client/
|
|
656
|
+
Producer nodes use DETACH DELETE as a safety net for any edges missed in
|
|
657
|
+
Phase 1.
|
|
625
658
|
"""
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
#
|
|
631
|
-
#
|
|
659
|
+
scope_files = changed_files | dependent_files
|
|
660
|
+
scope_list = list(scope_files)
|
|
661
|
+
changed_list = list(changed_files)
|
|
662
|
+
|
|
663
|
+
# Phase 1: Delete ALL edges ORIGINATING from any scope file (changed +
|
|
664
|
+
# dependent). Because `source_file` is the caller's file, this deletes edges
|
|
665
|
+
# whose source is in scope (including dependents' outgoing edges to changed
|
|
666
|
+
# nodes) while intentionally leaving incoming edges from out-of-scope callers
|
|
667
|
+
# intact — those must survive so the dependent nodes below can be preserved.
|
|
668
|
+
# This list is a superset of `_SYMBOL_TO_SYMBOL_EDGE_TYPES` (it also covers
|
|
669
|
+
# Symbol->Route/Client/Producer/UCS and Client/Producer->Route edges); keep
|
|
670
|
+
# both lists in sync with the schema.
|
|
632
671
|
edge_tables = [
|
|
633
672
|
"EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES",
|
|
634
673
|
"UNRESOLVED_AT", "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER",
|
|
@@ -640,7 +679,7 @@ def _delete_file_scope(conn: kuzu.Connection, filenames: set[str]) -> None:
|
|
|
640
679
|
WHERE e.source_file IN $filenames
|
|
641
680
|
DELETE e
|
|
642
681
|
"""
|
|
643
|
-
conn.execute(query, {"filenames":
|
|
682
|
+
conn.execute(query, {"filenames": scope_list})
|
|
644
683
|
|
|
645
684
|
# Phase 2: Collect all Symbol node IDs for UnresolvedCallSite cleanup.
|
|
646
685
|
symbol_ids: list[str] = []
|
|
@@ -649,12 +688,15 @@ def _delete_file_scope(conn: kuzu.Connection, filenames: set[str]) -> None:
|
|
|
649
688
|
WHERE s.filename IN $filenames
|
|
650
689
|
RETURN s.id
|
|
651
690
|
"""
|
|
652
|
-
result = conn.execute(symbol_ids_query, {"filenames":
|
|
691
|
+
result = conn.execute(symbol_ids_query, {"filenames": scope_list})
|
|
653
692
|
while result.has_next():
|
|
654
693
|
row = result.get_next()
|
|
655
694
|
symbol_ids.append(row[0])
|
|
656
695
|
|
|
657
|
-
# Delete UnresolvedCallSite nodes whose caller_id is in the collected set
|
|
696
|
+
# Delete UnresolvedCallSite nodes whose caller_id is in the collected set.
|
|
697
|
+
# These are children of scope symbols (including preserved dependents);
|
|
698
|
+
# deleting them is safe because every scope file — dependents included — is
|
|
699
|
+
# reprocessed and re-emits its UnresolvedCallSite nodes in `_scoped_write`.
|
|
658
700
|
if symbol_ids:
|
|
659
701
|
unresolved_query = """
|
|
660
702
|
MATCH (u:UnresolvedCallSite)
|
|
@@ -663,27 +705,37 @@ def _delete_file_scope(conn: kuzu.Connection, filenames: set[str]) -> None:
|
|
|
663
705
|
"""
|
|
664
706
|
conn.execute(unresolved_query, {"symbol_ids": symbol_ids})
|
|
665
707
|
|
|
666
|
-
# Phase 3: Delete Symbol nodes.
|
|
708
|
+
# Phase 3: Delete Symbol nodes ONLY for changed files (not dependents).
|
|
709
|
+
# Dependent-file nodes are deliberately PRESERVED so their incoming edges
|
|
710
|
+
# from out-of-scope callers survive; the dependents are re-MERGEd in place
|
|
711
|
+
# by `_scoped_write` on the same deterministic node id. A changed node's
|
|
712
|
+
# real incoming edges all come from dependent files (callers pulled into
|
|
713
|
+
# scope by `_find_dependents`, which walks every type in
|
|
714
|
+
# `_SYMBOL_TO_SYMBOL_EDGE_TYPES`), so Phase 1 already removed them and the
|
|
715
|
+
# dependents re-emit them when reprocessed. DETACH DELETE is only a safety
|
|
716
|
+
# net for the rare surviving edge whose source was NOT pulled into scope
|
|
717
|
+
# (e.g. a phantom caller with filename="", which `_find_dependents` skips);
|
|
718
|
+
# such an edge is stale once the node is recreated, so dropping it is fine.
|
|
667
719
|
delete_symbols_query = """
|
|
668
720
|
MATCH (s:Symbol)
|
|
669
721
|
WHERE s.filename IN $filenames
|
|
670
|
-
DELETE s
|
|
722
|
+
DETACH DELETE s
|
|
671
723
|
"""
|
|
672
|
-
conn.execute(delete_symbols_query, {"filenames":
|
|
724
|
+
conn.execute(delete_symbols_query, {"filenames": changed_list})
|
|
673
725
|
|
|
674
726
|
# Phase 4: Delete Route, Client, Producer nodes.
|
|
675
727
|
# Use DETACH DELETE as a safety net in case any edges were missed in Phase 1.
|
|
676
728
|
for label in ["Route", "Client", "Producer"]:
|
|
677
729
|
conn.execute(
|
|
678
730
|
f"MATCH (n:{label}) WHERE n.filename IN $filenames DETACH DELETE n",
|
|
679
|
-
{"filenames":
|
|
731
|
+
{"filenames": scope_list},
|
|
680
732
|
)
|
|
681
733
|
|
|
682
734
|
|
|
683
|
-
def _scoped_write(conn:
|
|
684
|
-
"""Write nodes and edges to existing
|
|
735
|
+
def _scoped_write(conn: ladybug.Connection, tables: GraphTables, *, project_root: Path, meta_chain: dict[str, frozenset[str]] | None) -> None:
|
|
736
|
+
"""Write nodes and edges to existing LadybugDB database without drop/create schema.
|
|
685
737
|
|
|
686
|
-
Like
|
|
738
|
+
Like write_ladybug() but without _drop_all()/_create_schema(). The caller is
|
|
687
739
|
responsible for calling _populate_declares_rows() and _populate_overrides_rows()
|
|
688
740
|
before invoking this function.
|
|
689
741
|
|
|
@@ -715,13 +767,13 @@ def _scoped_write(conn: kuzu.Connection, tables: GraphTables, *, project_root: P
|
|
|
715
767
|
|
|
716
768
|
|
|
717
769
|
def _write_nodes_merge(
|
|
718
|
-
conn:
|
|
770
|
+
conn: ladybug.Connection,
|
|
719
771
|
tables: GraphTables,
|
|
720
772
|
*,
|
|
721
773
|
project_root: Path,
|
|
722
774
|
meta_chain: dict[str, frozenset[str]] | None,
|
|
723
775
|
) -> None:
|
|
724
|
-
"""Write nodes to existing
|
|
776
|
+
"""Write nodes to existing LadybugDB database using MERGE to handle existing nodes."""
|
|
725
777
|
_write_nodes_impl(conn, tables, project_root=project_root, meta_chain=meta_chain, symbol_query=_MERGE_SYMBOL)
|
|
726
778
|
|
|
727
779
|
|
|
@@ -2664,7 +2716,7 @@ def pass6_match_edges(
|
|
|
2664
2716
|
)
|
|
2665
2717
|
|
|
2666
2718
|
|
|
2667
|
-
# ----------
|
|
2719
|
+
# ---------- LadybugDB write ----------
|
|
2668
2720
|
|
|
2669
2721
|
|
|
2670
2722
|
_SCHEMA_NODE = (
|
|
@@ -2685,7 +2737,7 @@ _SCHEMA_META = (
|
|
|
2685
2737
|
"ontology_version INT64, built_at INT64, source_root STRING, "
|
|
2686
2738
|
"counts_json STRING, parse_errors INT64, "
|
|
2687
2739
|
"routes_total INT64, exposes_total INT64, "
|
|
2688
|
-
# JSON map {framework: count}; STRING avoids
|
|
2740
|
+
# JSON map {framework: count}; STRING avoids LadybugDB Python MAP↔STRUCT binder mismatch.
|
|
2689
2741
|
"routes_by_framework STRING, "
|
|
2690
2742
|
"routes_resolved_pct DOUBLE, "
|
|
2691
2743
|
"routes_from_brownfield_pct DOUBLE, "
|
|
@@ -2798,7 +2850,7 @@ _SCHEMA_ASYNC_CALLS = (
|
|
|
2798
2850
|
)
|
|
2799
2851
|
|
|
2800
2852
|
|
|
2801
|
-
def _drop_all(conn:
|
|
2853
|
+
def _drop_all(conn: ladybug.Connection) -> None:
|
|
2802
2854
|
for stmt in (
|
|
2803
2855
|
"DROP TABLE IF EXISTS DECLARES_CLIENT",
|
|
2804
2856
|
"DROP TABLE IF EXISTS DECLARES_PRODUCER",
|
|
@@ -2825,7 +2877,7 @@ def _drop_all(conn: kuzu.Connection) -> None:
|
|
|
2825
2877
|
pass
|
|
2826
2878
|
|
|
2827
2879
|
|
|
2828
|
-
def _create_schema(conn:
|
|
2880
|
+
def _create_schema(conn: ladybug.Connection) -> None:
|
|
2829
2881
|
for stmt in (
|
|
2830
2882
|
_SCHEMA_NODE,
|
|
2831
2883
|
_SCHEMA_UNRESOLVED_CALL_SITE,
|
|
@@ -2885,7 +2937,7 @@ _MERGE_SYMBOL = (
|
|
|
2885
2937
|
|
|
2886
2938
|
|
|
2887
2939
|
def _write_nodes_impl(
|
|
2888
|
-
conn:
|
|
2940
|
+
conn: ladybug.Connection,
|
|
2889
2941
|
tables: GraphTables,
|
|
2890
2942
|
*,
|
|
2891
2943
|
project_root: Path,
|
|
@@ -2952,7 +3004,7 @@ def _write_nodes_impl(
|
|
|
2952
3004
|
|
|
2953
3005
|
|
|
2954
3006
|
def _write_nodes(
|
|
2955
|
-
conn:
|
|
3007
|
+
conn: ladybug.Connection,
|
|
2956
3008
|
tables: GraphTables,
|
|
2957
3009
|
*,
|
|
2958
3010
|
project_root: Path,
|
|
@@ -3064,7 +3116,7 @@ def _direct_supertype_ids(tables: GraphTables, type_id: str) -> list[str]:
|
|
|
3064
3116
|
def _populate_overrides_rows(tables: GraphTables) -> None:
|
|
3065
3117
|
"""Materialize (subtype_method)-[:OVERRIDES]->(supertype_method) for one supertype hop.
|
|
3066
3118
|
|
|
3067
|
-
Matches ``
|
|
3119
|
+
Matches ``LadybugDBGraph.override_axis_rollup_for`` (direct ``IMPLEMENTS`` / ``EXTENDS``
|
|
3068
3120
|
only, same ``signature``, distinct method ids, non-static instance methods).
|
|
3069
3121
|
"""
|
|
3070
3122
|
by_declaring_type: dict[str, list[MemberEntry]] = defaultdict(list)
|
|
@@ -3099,7 +3151,7 @@ def _build_file_by_node_id(tables: GraphTables) -> dict[str, str]:
|
|
|
3099
3151
|
return lookup
|
|
3100
3152
|
|
|
3101
3153
|
|
|
3102
|
-
def _write_edges(conn:
|
|
3154
|
+
def _write_edges(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id: dict[str, str] | None = None) -> None:
|
|
3103
3155
|
# Build node_id -> file_path lookup for source_file resolution.
|
|
3104
3156
|
if _file_by_node_id is None:
|
|
3105
3157
|
_file_by_node_id = _build_file_by_node_id(tables)
|
|
@@ -3193,7 +3245,7 @@ def _write_edges(conn: kuzu.Connection, tables: GraphTables, _file_by_node_id: d
|
|
|
3193
3245
|
})
|
|
3194
3246
|
|
|
3195
3247
|
|
|
3196
|
-
def _write_routes_and_exposes(conn:
|
|
3248
|
+
def _write_routes_and_exposes(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id: dict[str, str] | None = None) -> None:
|
|
3197
3249
|
# Build node_id -> file_path lookup for source_file resolution (for Symbol sources).
|
|
3198
3250
|
if _file_by_node_id is None:
|
|
3199
3251
|
_file_by_node_id = _build_file_by_node_id(tables)
|
|
@@ -3276,7 +3328,7 @@ def _write_routes_and_exposes(conn: kuzu.Connection, tables: GraphTables, _file_
|
|
|
3276
3328
|
})
|
|
3277
3329
|
|
|
3278
3330
|
|
|
3279
|
-
def _write_meta(conn:
|
|
3331
|
+
def _write_meta(conn: ladybug.Connection, tables: GraphTables, source_root: Path) -> None:
|
|
3280
3332
|
seen_calls: set[tuple[str, str, int, int]] = set()
|
|
3281
3333
|
calls_unique = 0
|
|
3282
3334
|
for row in tables.calls_rows:
|
|
@@ -3392,12 +3444,12 @@ def _write_meta(conn: kuzu.Connection, tables: GraphTables, source_root: Path) -
|
|
|
3392
3444
|
|
|
3393
3445
|
def incremental_rebuild(
|
|
3394
3446
|
source_root: Path,
|
|
3395
|
-
|
|
3447
|
+
ladybug_path: Path,
|
|
3396
3448
|
*,
|
|
3397
3449
|
verbose: bool,
|
|
3398
3450
|
expansion_cap: int = 50,
|
|
3399
3451
|
) -> IncrementalResult:
|
|
3400
|
-
"""Incrementally rebuild the
|
|
3452
|
+
"""Incrementally rebuild the LadybugDB graph, processing only changed files and their dependents.
|
|
3401
3453
|
|
|
3402
3454
|
Returns IncrementalResult with statistics about the rebuild.
|
|
3403
3455
|
Falls back to full rebuild if:
|
|
@@ -3409,7 +3461,7 @@ def incremental_rebuild(
|
|
|
3409
3461
|
t_start = time.time()
|
|
3410
3462
|
|
|
3411
3463
|
# Step 1: Load existing graph and detect changes
|
|
3412
|
-
if not
|
|
3464
|
+
if not ladybug_path.exists():
|
|
3413
3465
|
if verbose:
|
|
3414
3466
|
_verbose_stderr_line("[increment] no existing graph; falling back to full rebuild")
|
|
3415
3467
|
# Fall back to full rebuild
|
|
@@ -3420,7 +3472,7 @@ def incremental_rebuild(
|
|
|
3420
3472
|
pass4_routes(tables, asts, source_root=source_root, verbose=verbose)
|
|
3421
3473
|
pass5_imperative_edges(tables, asts, source_root=source_root, verbose=verbose)
|
|
3422
3474
|
pass6_match_edges(tables, verbose=verbose)
|
|
3423
|
-
|
|
3475
|
+
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=verbose)
|
|
3424
3476
|
|
|
3425
3477
|
return IncrementalResult(
|
|
3426
3478
|
mode="full_fallback",
|
|
@@ -3431,8 +3483,8 @@ def incremental_rebuild(
|
|
|
3431
3483
|
elapsed_sec=time.time() - t_start,
|
|
3432
3484
|
)
|
|
3433
3485
|
|
|
3434
|
-
db =
|
|
3435
|
-
conn =
|
|
3486
|
+
db = ladybug.Database(str(ladybug_path))
|
|
3487
|
+
conn = ladybug.Connection(db)
|
|
3436
3488
|
|
|
3437
3489
|
# Check ontology version
|
|
3438
3490
|
try:
|
|
@@ -3445,7 +3497,7 @@ def incremental_rebuild(
|
|
|
3445
3497
|
_verbose_stderr_line(f"[increment] ontology version {version} < 17; falling back to full rebuild")
|
|
3446
3498
|
conn.close()
|
|
3447
3499
|
del conn, db
|
|
3448
|
-
return _fallback_to_full(source_root,
|
|
3500
|
+
return _fallback_to_full(source_root, ladybug_path, verbose, t_start)
|
|
3449
3501
|
except Exception as e:
|
|
3450
3502
|
if verbose:
|
|
3451
3503
|
_verbose_stderr_line(f"[increment] failed to read ontology version: {e}; falling back to full rebuild")
|
|
@@ -3454,9 +3506,9 @@ def incremental_rebuild(
|
|
|
3454
3506
|
except Exception:
|
|
3455
3507
|
pass
|
|
3456
3508
|
del conn, db
|
|
3457
|
-
return _fallback_to_full(source_root,
|
|
3509
|
+
return _fallback_to_full(source_root, ladybug_path, verbose, t_start)
|
|
3458
3510
|
|
|
3459
|
-
index_dir =
|
|
3511
|
+
index_dir = ladybug_path.parent
|
|
3460
3512
|
tracker = FileHashTracker(index_dir)
|
|
3461
3513
|
tracker.load()
|
|
3462
3514
|
|
|
@@ -3488,7 +3540,7 @@ def incremental_rebuild(
|
|
|
3488
3540
|
_verbose_stderr_line("[increment] crash marker exists; falling back to full rebuild")
|
|
3489
3541
|
conn.close()
|
|
3490
3542
|
crash_marker_path.unlink(missing_ok=True)
|
|
3491
|
-
return _fallback_to_full(source_root,
|
|
3543
|
+
return _fallback_to_full(source_root, ladybug_path, verbose, t_start)
|
|
3492
3544
|
|
|
3493
3545
|
# Write crash marker
|
|
3494
3546
|
crash_marker_path.write_text("", encoding="utf-8")
|
|
@@ -3516,7 +3568,7 @@ def incremental_rebuild(
|
|
|
3516
3568
|
_verbose_stderr_line(f"[increment] dependent expansion cap ({expansion_cap}) exceeded ({len(scope_files)} files); falling back to full rebuild")
|
|
3517
3569
|
conn.close()
|
|
3518
3570
|
crash_marker_path.unlink(missing_ok=True)
|
|
3519
|
-
return _fallback_to_full(source_root,
|
|
3571
|
+
return _fallback_to_full(source_root, ladybug_path, verbose, t_start)
|
|
3520
3572
|
|
|
3521
3573
|
if verbose:
|
|
3522
3574
|
_verbose_stderr_line(f"[increment] processing {len(scope_files)} files ({len(changed_files)} changed + {len(dependent_files)} dependents)")
|
|
@@ -3524,7 +3576,7 @@ def incremental_rebuild(
|
|
|
3524
3576
|
# Step 4: Scoped deletion
|
|
3525
3577
|
if verbose:
|
|
3526
3578
|
_verbose_stderr_line("[increment] deleting outdated nodes and edges")
|
|
3527
|
-
_delete_file_scope(conn,
|
|
3579
|
+
_delete_file_scope(conn, changed_files, dependent_files)
|
|
3528
3580
|
|
|
3529
3581
|
# Force deletion to be applied by running a dummy query
|
|
3530
3582
|
conn.execute("MATCH (s:Symbol) RETURN count(*)")
|
|
@@ -3612,14 +3664,21 @@ def incremental_rebuild(
|
|
|
3612
3664
|
_verbose_stderr_line(f"[increment] error during incremental rebuild: {e}; falling back to full rebuild")
|
|
3613
3665
|
conn.close()
|
|
3614
3666
|
crash_marker_path.unlink(missing_ok=True)
|
|
3615
|
-
return _fallback_to_full(source_root,
|
|
3667
|
+
return _fallback_to_full(source_root, ladybug_path, verbose, t_start)
|
|
3668
|
+
|
|
3616
3669
|
|
|
3670
|
+
def _init_hash_tracker(source_root: Path, ladybug_path: Path) -> int:
|
|
3671
|
+
"""Initialize hash tracker for all Java files. Returns number of files hashed.
|
|
3617
3672
|
|
|
3618
|
-
|
|
3619
|
-
|
|
3620
|
-
|
|
3673
|
+
Called right after a full graph rebuild (``write_ladybug``), so the store must
|
|
3674
|
+
mirror exactly the files that were just indexed. We deliberately do NOT
|
|
3675
|
+
``load()`` the existing store: ``update`` re-hashes every current file anyway,
|
|
3676
|
+
and preserving old entries would leave stale hashes for files that no longer
|
|
3677
|
+
exist (deleted or now-ignored). Those ghosts would be re-detected as "removed"
|
|
3678
|
+
on every subsequent ``increment``, sustaining an endless full-rebuild loop.
|
|
3679
|
+
"""
|
|
3680
|
+
index_dir = ladybug_path.parent
|
|
3621
3681
|
tracker = FileHashTracker(index_dir)
|
|
3622
|
-
tracker.load()
|
|
3623
3682
|
ignore = LayeredIgnore(source_root)
|
|
3624
3683
|
all_files: set[str] = set()
|
|
3625
3684
|
source_root_resolved = source_root.resolve()
|
|
@@ -3635,7 +3694,7 @@ def _init_hash_tracker(source_root: Path, kuzu_path: Path) -> int:
|
|
|
3635
3694
|
return len(all_files)
|
|
3636
3695
|
|
|
3637
3696
|
|
|
3638
|
-
def _fallback_to_full(source_root: Path,
|
|
3697
|
+
def _fallback_to_full(source_root: Path, ladybug_path: Path, verbose: bool, t_start: float) -> IncrementalResult:
|
|
3639
3698
|
"""Fallback to full rebuild."""
|
|
3640
3699
|
tables = GraphTables()
|
|
3641
3700
|
asts = pass1_parse(source_root, tables, verbose=verbose)
|
|
@@ -3644,7 +3703,7 @@ def _fallback_to_full(source_root: Path, kuzu_path: Path, verbose: bool, t_start
|
|
|
3644
3703
|
pass4_routes(tables, asts, source_root=source_root, verbose=verbose)
|
|
3645
3704
|
pass5_imperative_edges(tables, asts, source_root=source_root, verbose=verbose)
|
|
3646
3705
|
pass6_match_edges(tables, verbose=verbose)
|
|
3647
|
-
|
|
3706
|
+
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=verbose)
|
|
3648
3707
|
|
|
3649
3708
|
return IncrementalResult(
|
|
3650
3709
|
mode="full_fallback",
|
|
@@ -3656,12 +3715,12 @@ def _fallback_to_full(source_root: Path, kuzu_path: Path, verbose: bool, t_start
|
|
|
3656
3715
|
)
|
|
3657
3716
|
|
|
3658
3717
|
|
|
3659
|
-
def _write_clients_producers_and_calls(conn:
|
|
3660
|
-
"""Write Route, Client, Producer, and cross-service edges to
|
|
3718
|
+
def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTables) -> None:
|
|
3719
|
+
"""Write Route, Client, Producer, and cross-service edges to LadybugDB.
|
|
3661
3720
|
|
|
3662
3721
|
Used by the incremental rebuild's global pass 5-6 step. Writes phantom
|
|
3663
3722
|
Route nodes (created by pass5 for cross-service calls) that wouldn't
|
|
3664
|
-
otherwise exist in
|
|
3723
|
+
otherwise exist in LadybugDB.
|
|
3665
3724
|
"""
|
|
3666
3725
|
# Write phantom routes that don't already exist (pass5 creates these for cross-service calls)
|
|
3667
3726
|
for row in tables.routes_rows:
|
|
@@ -3690,7 +3749,7 @@ def _write_clients_producers_and_calls(conn: kuzu.Connection, tables: GraphTable
|
|
|
3690
3749
|
|
|
3691
3750
|
# Write declares_client edges
|
|
3692
3751
|
for row in tables.declares_client_rows:
|
|
3693
|
-
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
|
|
3752
|
+
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
|
|
3694
3753
|
conn.execute(_CREATE_DECLARES_CLIENT, {
|
|
3695
3754
|
"sid": row.symbol_id,
|
|
3696
3755
|
"cid": row.client_id,
|
|
@@ -3701,7 +3760,7 @@ def _write_clients_producers_and_calls(conn: kuzu.Connection, tables: GraphTable
|
|
|
3701
3760
|
|
|
3702
3761
|
# Write declares_producer edges
|
|
3703
3762
|
for row in tables.declares_producer_rows:
|
|
3704
|
-
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
|
|
3763
|
+
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
|
|
3705
3764
|
conn.execute(_CREATE_DECLARES_PRODUCER, {
|
|
3706
3765
|
"sid": row.symbol_id,
|
|
3707
3766
|
"pid": row.producer_id,
|
|
@@ -3739,7 +3798,7 @@ def _write_clients_producers_and_calls(conn: kuzu.Connection, tables: GraphTable
|
|
|
3739
3798
|
})
|
|
3740
3799
|
|
|
3741
3800
|
|
|
3742
|
-
def
|
|
3801
|
+
def write_ladybug(
|
|
3743
3802
|
db_path: Path,
|
|
3744
3803
|
tables: GraphTables,
|
|
3745
3804
|
*,
|
|
@@ -3755,8 +3814,8 @@ def write_kuzu(
|
|
|
3755
3814
|
_verbose_stderr_line(_WRITE_START)
|
|
3756
3815
|
with _VerbosePassHeartbeats("[graph] writing", verbose=verbose):
|
|
3757
3816
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
3758
|
-
db =
|
|
3759
|
-
conn =
|
|
3817
|
+
db = ladybug.Database(str(db_path))
|
|
3818
|
+
conn = ladybug.Connection(db)
|
|
3760
3819
|
_drop_all(conn)
|
|
3761
3820
|
_create_schema(conn)
|
|
3762
3821
|
t0 = time.time()
|
|
@@ -3787,22 +3846,22 @@ def write_kuzu(
|
|
|
3787
3846
|
# ---------- CLI ----------
|
|
3788
3847
|
|
|
3789
3848
|
|
|
3790
|
-
def
|
|
3849
|
+
def _default_ladybug_path() -> Path:
|
|
3791
3850
|
idx = os.environ.get("JAVA_CODEBASE_RAG_INDEX_DIR", "").strip()
|
|
3792
3851
|
if idx and not idx.startswith(("s3://", "gs://", "az://")):
|
|
3793
|
-
return Path(os.path.expanduser(idx.rstrip("/"))) / "code_graph.
|
|
3794
|
-
return Path.cwd() / ".java-codebase-rag" / "code_graph.
|
|
3852
|
+
return Path(os.path.expanduser(idx.rstrip("/"))) / "code_graph.lbug"
|
|
3853
|
+
return Path.cwd() / ".java-codebase-rag" / "code_graph.lbug"
|
|
3795
3854
|
|
|
3796
3855
|
|
|
3797
3856
|
def main() -> int:
|
|
3798
|
-
parser = argparse.ArgumentParser(description="Build an AST-derived
|
|
3857
|
+
parser = argparse.ArgumentParser(description="Build an AST-derived LadybugDB graph for Java sources.")
|
|
3799
3858
|
parser.add_argument("--source-root", default=None, help="Repository / monorepo root to scan for .java (defaults to current working directory)")
|
|
3800
3859
|
parser.add_argument(
|
|
3801
|
-
"--
|
|
3860
|
+
"--ladybug-path",
|
|
3802
3861
|
default=None,
|
|
3803
3862
|
help=(
|
|
3804
|
-
"
|
|
3805
|
-
"default: $JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.
|
|
3863
|
+
"LadybugDB database path (file/dir as used by ladybug.Database; "
|
|
3864
|
+
"default: $JAVA_CODEBASE_RAG_INDEX_DIR/code_graph.lbug or ./.java-codebase-rag/code_graph.lbug)"
|
|
3806
3865
|
),
|
|
3807
3866
|
)
|
|
3808
3867
|
parser.add_argument("--verbose", action="store_true")
|
|
@@ -3814,10 +3873,10 @@ def main() -> int:
|
|
|
3814
3873
|
print(f"source-root not a directory: {root}", file=sys.stderr)
|
|
3815
3874
|
return 2
|
|
3816
3875
|
|
|
3817
|
-
|
|
3876
|
+
ladybug_path = Path(args.ladybug_path).expanduser() if args.ladybug_path else _default_ladybug_path()
|
|
3818
3877
|
|
|
3819
3878
|
if args.incremental:
|
|
3820
|
-
result = incremental_rebuild(root,
|
|
3879
|
+
result = incremental_rebuild(root, ladybug_path, verbose=args.verbose)
|
|
3821
3880
|
print(json.dumps({
|
|
3822
3881
|
"mode": result.mode,
|
|
3823
3882
|
"files_changed": result.files_changed,
|
|
@@ -3837,9 +3896,9 @@ def main() -> int:
|
|
|
3837
3896
|
pass4_routes(tables, asts, source_root=root, verbose=args.verbose)
|
|
3838
3897
|
pass5_imperative_edges(tables, asts, source_root=root, verbose=args.verbose)
|
|
3839
3898
|
pass6_match_edges(tables, verbose=args.verbose)
|
|
3840
|
-
|
|
3899
|
+
write_ladybug(ladybug_path, tables, source_root=root, verbose=args.verbose)
|
|
3841
3900
|
if args.verbose:
|
|
3842
|
-
_verbose_stderr_line(f"[graph] done ·
|
|
3901
|
+
_verbose_stderr_line(f"[graph] done · ladybug at {ladybug_path}")
|
|
3843
3902
|
return 0
|
|
3844
3903
|
|
|
3845
3904
|
|