codespine 0.5.1__tar.gz → 0.5.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-0.5.1 → codespine-0.5.3}/PKG-INFO +1 -1
- {codespine-0.5.1 → codespine-0.5.3}/codespine/__init__.py +1 -1
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/community.py +14 -1
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/coupling.py +8 -0
- codespine-0.5.3/codespine/analysis/crossmodule.py +194 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/flow.py +11 -2
- {codespine-0.5.1 → codespine-0.5.3}/codespine/cli.py +37 -11
- {codespine-0.5.1 → codespine-0.5.3}/codespine/db/store.py +25 -10
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/PKG-INFO +1 -1
- {codespine-0.5.1 → codespine-0.5.3}/pyproject.toml +1 -1
- codespine-0.5.1/codespine/analysis/crossmodule.py +0 -173
- {codespine-0.5.1 → codespine-0.5.3}/LICENSE +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/README.md +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/context.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/deadcode.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/analysis/impact.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/config.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/db/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/db/schema.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/diff/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/diff/branch_diff.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/indexer/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/indexer/call_resolver.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/indexer/engine.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/indexer/java_parser.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/mcp/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/mcp/server.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/noise/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/noise/blocklist.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/bm25.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/fuzzy.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/hybrid.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/rrf.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/search/vector.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/watch/__init__.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine/watch/watcher.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/SOURCES.txt +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/requires.txt +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/codespine.egg-info/top_level.txt +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/gindex.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/setup.cfg +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_call_resolver.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_index_and_hybrid.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_java_parser.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_multimodule_index.py +0 -0
- {codespine-0.5.1 → codespine-0.5.3}/tests/test_search_ranking.py +0 -0
|
@@ -3,8 +3,14 @@ from __future__ import annotations
|
|
|
3
3
|
from collections import defaultdict
|
|
4
4
|
|
|
5
5
|
|
|
6
|
-
def detect_communities(store) -> list[dict]:
|
|
6
|
+
def detect_communities(store, progress=None) -> list[dict]:
|
|
7
|
+
def _ping(msg: str) -> None:
|
|
8
|
+
if progress:
|
|
9
|
+
progress(msg)
|
|
10
|
+
|
|
11
|
+
_ping("loading symbols")
|
|
7
12
|
symbols = store.query_records("MATCH (s:Symbol) RETURN s.id as id, s.fqname as fqname")
|
|
13
|
+
_ping(f"{len(symbols)} symbols, loading edges")
|
|
8
14
|
edges = store.query_records(
|
|
9
15
|
"""
|
|
10
16
|
MATCH (a:Method)-[:CALLS]->(b:Method)
|
|
@@ -17,6 +23,7 @@ def detect_communities(store) -> list[dict]:
|
|
|
17
23
|
ids = [s["id"] for s in symbols]
|
|
18
24
|
index_of = {sid: i for i, sid in enumerate(ids)}
|
|
19
25
|
|
|
26
|
+
_ping(f"{len(edges)} edges, clustering")
|
|
20
27
|
membership: dict[str, int] = {}
|
|
21
28
|
try:
|
|
22
29
|
import igraph as ig
|
|
@@ -44,11 +51,17 @@ def detect_communities(store) -> list[dict]:
|
|
|
44
51
|
for sid, cid in membership.items():
|
|
45
52
|
grouped[cid].append(sid)
|
|
46
53
|
|
|
54
|
+
_ping(f"{len(grouped)} clusters, persisting")
|
|
47
55
|
communities: list[dict] = []
|
|
56
|
+
done_clusters = 0
|
|
57
|
+
total_clusters = len(grouped)
|
|
48
58
|
for cid, symbol_ids in grouped.items():
|
|
49
59
|
cohesion = 1.0 / max(len(symbol_ids), 1)
|
|
50
60
|
label = f"community_{cid}"
|
|
51
61
|
store.set_community(str(cid), label, cohesion, symbol_ids)
|
|
62
|
+
done_clusters += 1
|
|
63
|
+
if done_clusters % 200 == 0 or done_clusters == total_clusters:
|
|
64
|
+
_ping(f"persisting {done_clusters}/{total_clusters} clusters")
|
|
52
65
|
communities.append(
|
|
53
66
|
{
|
|
54
67
|
"community_id": str(cid),
|
|
@@ -46,11 +46,18 @@ def compute_coupling(
|
|
|
46
46
|
months: int = SETTINGS.default_coupling_months,
|
|
47
47
|
min_strength: float = SETTINGS.default_min_coupling_strength,
|
|
48
48
|
min_cochanges: int = SETTINGS.default_min_cochanges,
|
|
49
|
+
progress=None,
|
|
49
50
|
) -> list[dict]:
|
|
51
|
+
def _ping(msg: str) -> None:
|
|
52
|
+
if progress:
|
|
53
|
+
progress(msg)
|
|
54
|
+
|
|
55
|
+
_ping("reading git history")
|
|
50
56
|
changesets = _git_changed_file_sets(repo_path, months)
|
|
51
57
|
if not changesets:
|
|
52
58
|
return []
|
|
53
59
|
|
|
60
|
+
_ping(f"{len(changesets)} commits, computing co-changes")
|
|
54
61
|
file_changes = Counter()
|
|
55
62
|
co_changes: Counter[tuple[str, str]] = Counter()
|
|
56
63
|
|
|
@@ -60,6 +67,7 @@ def compute_coupling(
|
|
|
60
67
|
for a, b in itertools.combinations(sorted(cs), 2):
|
|
61
68
|
co_changes[(a, b)] += 1
|
|
62
69
|
|
|
70
|
+
_ping(f"{len(co_changes)} pairs, filtering and persisting")
|
|
63
71
|
results = []
|
|
64
72
|
for (a, b), pair_count in co_changes.items():
|
|
65
73
|
denom = max(file_changes[a], file_changes[b])
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Cross-module call edge linker.
|
|
2
|
+
|
|
3
|
+
After all modules in a workspace have been individually indexed, each module's
|
|
4
|
+
call resolver only sees methods *within that module* (the class/method catalogs
|
|
5
|
+
are project-scoped). This module fills the gap by:
|
|
6
|
+
|
|
7
|
+
1. Building a **global** class-name index across ALL projects.
|
|
8
|
+
2. Scanning every method's signature and return type for class names that
|
|
9
|
+
belong to a DIFFERENT project.
|
|
10
|
+
3. Creating CALLS edges between the referencing method and the methods of
|
|
11
|
+
the referenced class.
|
|
12
|
+
|
|
13
|
+
Two linking strategies are applied:
|
|
14
|
+
|
|
15
|
+
Strategy A — Name + arity match (confidence 0.7)
|
|
16
|
+
The referencing method M_src calls a method with the same name AND
|
|
17
|
+
parameter count as a method M_dst in the referenced class. This catches
|
|
18
|
+
delegation, interface-implementation forwarding, and adapter patterns.
|
|
19
|
+
|
|
20
|
+
Strategy B — Type-reference fallback (confidence 0.4)
|
|
21
|
+
For every *public, non-constructor* method in the referenced class that
|
|
22
|
+
received NO name-match edge, create ONE low-confidence edge from the
|
|
23
|
+
referencing method. This prevents methods that are genuinely used
|
|
24
|
+
cross-module from appearing as dead code.
|
|
25
|
+
"""
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
import re
|
|
30
|
+
from collections import defaultdict
|
|
31
|
+
|
|
32
|
+
LOGGER = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Very short class names produce too many false-positive matches when scanned
|
|
35
|
+
# as substrings of method signatures. Skip names ≤ this length.
|
|
36
|
+
_MIN_CLASS_NAME_LEN = 4
|
|
37
|
+
|
|
38
|
+
# Regex to split a Java signature into word tokens (class names, keywords, etc.)
|
|
39
|
+
_TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _param_count(sig: str) -> int:
    """Count the top-level parameters in a method signature string.

    The parameter list is taken as the text between the first ``(`` and the
    last ``)``. Only commas at nesting depth zero separate parameters, so
    commas inside generic type arguments (``Map<String, Integer>``), nested
    parentheses, brackets or braces are not counted.

    Args:
        sig: Signature text such as ``"put(String key, Map<String, Integer> v)"``.

    Returns:
        The number of parameters; ``0`` when *sig* is empty, has no
        parenthesised parameter list, or the list is blank.
    """
    if not sig or "(" not in sig or ")" not in sig:
        return 0
    arg_str = sig[sig.find("(") + 1: sig.rfind(")")]
    if not arg_str.strip():
        return 0

    depth = 0
    count = 1  # a non-blank argument list holds at least one parameter
    for ch in arg_str:
        if ch in "<([{":
            depth += 1
        elif ch in ">)]}":
            # Clamp at zero so an unbalanced closer cannot hide later commas.
            depth = max(depth - 1, 0)
        elif ch == "," and depth == 0:
            count += 1
    return count
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def link_cross_module_calls(store, project_ids: list[str] | None = None, progress=None) -> int:
    """Create CALLS edges between methods in different projects.

    A method is treated as referencing a class from another project when that
    class's simple name appears as a word token in the method's signature or
    return type. For each such (method, class) pair:

    * Strategy A adds a 0.7-confidence ``cross_module_name_match`` edge to
      every destination method with the same name and parameter count.
    * Strategy B adds a 0.4-confidence ``cross_module_type_ref`` edge to every
      remaining destination method that is neither a constructor nor private.

    Args:
        store: Graph store; this function calls ``store.query_records`` and
            ``store.add_call(src_id, dst_id, confidence, reason)``.
        project_ids: Projects to link. When ``None``, the ids of all
            ``Project`` nodes in the store are used.
        progress: Optional ``(status_str) -> None`` callback for live updates.

    Returns:
        The number of new cross-module call edges created.
    """
    def _ping(msg: str) -> None:
        if progress:
            progress(msg)

    if project_ids is None:
        proj_recs = store.query_records("MATCH (p:Project) RETURN p.id as id")
        project_ids = [r["id"] for r in proj_recs]

    if len(project_ids) < 2:
        LOGGER.info(
            "Only %d project(s) indexed — skipping cross-module linking.",
            len(project_ids),
        )
        return 0

    # ── 1. Global class index ─────────────────────────────────────────
    all_classes = store.query_records(
        """
        MATCH (c:Class), (f:File)
        WHERE c.file_id = f.id
        RETURN c.id as cid, c.name as name, c.fqcn as fqcn, f.project_id as pid
        """
    )

    _ping(f"building class index ({len(all_classes)} classes)")

    # class_name → [(class_id, project_id)], plus per-project name sets.
    # Names of length <= _MIN_CLASS_NAME_LEN are left out of the per-project
    # sets so they never trigger a match.
    name_to_classes: dict[str, list[tuple[str, str]]] = defaultdict(list)
    classes_per_project: dict[str, set[str]] = defaultdict(set)
    for c in all_classes:
        name_to_classes[c["name"]].append((c["cid"], c["pid"]))
        if len(c["name"]) > _MIN_CLASS_NAME_LEN:
            classes_per_project[c["pid"]].add(c["name"])

    # ── 2. Helpers ────────────────────────────────────────────────────
    new_edges = 0
    seen: set[tuple[str, str]] = set()

    # The methods of a destination class do not depend on which source method
    # references it, so fetch them once per class instead of once per
    # (source method, class) pair.
    dst_method_cache: dict[str, list] = {}

    def _dst_methods(cid: str) -> list:
        cached = dst_method_cache.get(cid)
        if cached is None:
            cached = store.query_records(
                """MATCH (m:Method) WHERE m.class_id = $cid
                RETURN m.id as mid, m.name as name, m.signature as sig,
                m.modifiers as modifiers, m.is_constructor as is_ctor""",
                {"cid": cid},
            ) or []
            dst_method_cache[cid] = cached
        return cached

    def _link(src_mid: str, dst_mid: str, confidence: float, reason: str, failure_msg: str) -> None:
        """Create one edge unless this (src, dst) pair was already attempted."""
        nonlocal new_edges
        pair = (src_mid, dst_mid)
        if pair in seen:
            return
        seen.add(pair)
        try:
            store.add_call(src_mid, dst_mid, confidence, reason)
            new_edges += 1
        except Exception as exc:
            # Best-effort: one failed edge must not abort the whole pass.
            LOGGER.debug(failure_msg, exc)

    # ── 3. Scan methods for cross-project type references ─────────────
    for src_pid in project_ids:
        # Class names that belong to any OTHER project in the list.
        other_class_names: set[str] = set()
        for other_pid in project_ids:
            if other_pid != src_pid:
                other_class_names |= classes_per_project.get(other_pid, set())

        if not other_class_names:
            continue

        _ping(f"scanning {src_pid} methods")

        src_methods = store.query_records(
            """
            MATCH (m:Method), (c:Class), (f:File)
            WHERE m.class_id = c.id AND c.file_id = f.id AND f.project_id = $pid
            RETURN m.id as mid, m.name as name, m.signature as sig,
                   m.return_type as rtype, c.id as cid
            """,
            {"pid": src_pid},
        )

        for sm in src_methods:
            sig = sm.get("sig") or ""
            rtype = sm.get("rtype") or ""
            # Tokenize signature + return type into words and intersect with
            # the class names owned by other projects.
            tokens = set(_TOKEN_RE.findall(sig + " " + rtype))
            matched_class_names = tokens & other_class_names
            if not matched_class_names:
                continue

            src_mid = sm["mid"]
            sm_name = sm["name"]
            sm_pc = _param_count(sig)

            for class_name in matched_class_names:
                for dst_cid, dst_pid in name_to_classes.get(class_name, []):
                    if dst_pid == src_pid:
                        continue  # same project — not cross-module

                    dst_methods = _dst_methods(dst_cid)
                    if not dst_methods:
                        continue

                    # Strategy A: name + arity match
                    matched_dst_mids: set[str] = set()
                    for dm in dst_methods:
                        if dm["name"] != sm_name:
                            continue
                        if _param_count(dm.get("sig") or "") != sm_pc:
                            continue
                        _link(
                            src_mid, dm["mid"],
                            0.7, "cross_module_name_match",
                            "Name-match edge failed: %s",
                        )
                        # Recorded even when the pair was already seen, so
                        # Strategy B skips this destination method.
                        matched_dst_mids.add(dm["mid"])

                    # Strategy B: fallback for unmatched, non-private,
                    # non-constructor destination methods
                    for dm in dst_methods:
                        if dm["mid"] in matched_dst_mids:
                            continue
                        if dm.get("is_ctor"):
                            continue
                        mods = dm.get("modifiers") or []
                        mod_strs = {str(m).strip() for m in mods} if mods else set()
                        if "private" in mod_strs:
                            continue
                        _link(
                            src_mid, dm["mid"],
                            0.4, "cross_module_type_ref",
                            "Fallback edge failed: %s",
                        )

    _ping(f"{new_edges} edges created")
    LOGGER.info("Cross-module linking: created %d new call edges.", new_edges)
    return new_edges
|
|
@@ -48,7 +48,12 @@ def _entry_methods(store, project: str | None = None) -> list[str]:
|
|
|
48
48
|
return [r["id"] for r in fallback]
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def trace_execution_flows(store, entry_symbol: str | None = None, max_depth: int = 6, project: str | None = None) -> list[dict]:
|
|
51
|
+
def trace_execution_flows(store, entry_symbol: str | None = None, max_depth: int = 6, project: str | None = None, progress=None) -> list[dict]:
|
|
52
|
+
def _ping(msg: str) -> None:
|
|
53
|
+
if progress:
|
|
54
|
+
progress(msg)
|
|
55
|
+
|
|
56
|
+
_ping("loading call graph")
|
|
52
57
|
edges = store.query_records(
|
|
53
58
|
"""
|
|
54
59
|
MATCH (a:Method)-[:CALLS]->(b:Method)
|
|
@@ -85,8 +90,11 @@ def trace_execution_flows(store, entry_symbol: str | None = None, max_depth: int
|
|
|
85
90
|
else:
|
|
86
91
|
entries = _entry_methods(store, project=project)
|
|
87
92
|
|
|
93
|
+
_ping(f"{len(entries)} entry points, tracing")
|
|
88
94
|
flows = []
|
|
89
|
-
for e in entries:
|
|
95
|
+
for idx, e in enumerate(entries):
|
|
96
|
+
if idx % 50 == 0 and idx > 0:
|
|
97
|
+
_ping(f"traced {idx}/{len(entries)} entry points")
|
|
90
98
|
visited = {e}
|
|
91
99
|
q = deque([(e, 0)])
|
|
92
100
|
nodes_with_depth = [(e, 0)]
|
|
@@ -115,6 +123,7 @@ def trace_execution_flows(store, entry_symbol: str | None = None, max_depth: int
|
|
|
115
123
|
# need a second round-trip to resolve raw method ID hashes.
|
|
116
124
|
# Collect all unique IDs across all flows, resolve in one bulk query.
|
|
117
125
|
# ------------------------------------------------------------------ #
|
|
126
|
+
_ping(f"{len(flows)} flows, enriching metadata")
|
|
118
127
|
all_ids = list({node["symbol"] for flow in flows for node in flow["nodes"]})
|
|
119
128
|
meta = _resolve_method_metadata(store, all_ids)
|
|
120
129
|
|
|
@@ -217,13 +217,25 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
217
217
|
elif parse_state["indexed"] < parse_state["total"]:
|
|
218
218
|
_phase("Parsing code...", f"{parse_state['indexed']}/{parse_state['total']}")
|
|
219
219
|
|
|
220
|
+
# ── Helper for in-place progress updates ────────────────────────────
|
|
221
|
+
def _live_phase(label: str, status: str) -> None:
|
|
222
|
+
"""Overwrite the current line with a status update."""
|
|
223
|
+
click.echo(f"\r{label:<30} {status:<50}", nl=False)
|
|
224
|
+
|
|
225
|
+
def _finish_phase(label: str, result: str) -> None:
|
|
226
|
+
"""Finalise an in-place phase line and move to the next line."""
|
|
227
|
+
click.echo(f"\r{label:<30} {result:<50}")
|
|
228
|
+
|
|
220
229
|
# ── Cross-module call linking ──────────────────────────────────────
|
|
221
|
-
# When multiple modules/projects are indexed, attempt to resolve call
|
|
222
|
-
# edges that span module boundaries using import + REFERENCES_TYPE info.
|
|
223
230
|
if is_multi and len(modules_with_ids) > 1:
|
|
231
|
+
xmod_label = "Cross-module linking..."
|
|
232
|
+
_live_phase(xmod_label, "running")
|
|
224
233
|
xmod_pids = [pid for _, pid in modules_with_ids]
|
|
225
|
-
xmod_edges = link_cross_module_calls(
|
|
226
|
-
|
|
234
|
+
xmod_edges = link_cross_module_calls(
|
|
235
|
+
store, project_ids=xmod_pids,
|
|
236
|
+
progress=lambda s: _live_phase(xmod_label, s),
|
|
237
|
+
)
|
|
238
|
+
_finish_phase(xmod_label, f"{xmod_edges} cross-module call edges")
|
|
227
239
|
else:
|
|
228
240
|
_phase("Cross-module linking...", "skipped (single module)")
|
|
229
241
|
|
|
@@ -234,16 +246,29 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
234
246
|
|
|
235
247
|
should_run_deep = deep or total_files_found <= 1200
|
|
236
248
|
if should_run_deep:
|
|
237
|
-
|
|
238
|
-
|
|
249
|
+
comm_label = "Detecting communities..."
|
|
250
|
+
_live_phase(comm_label, "running")
|
|
251
|
+
communities = detect_communities(
|
|
252
|
+
store,
|
|
253
|
+
progress=lambda s: _live_phase(comm_label, s),
|
|
254
|
+
)
|
|
255
|
+
_finish_phase(comm_label, f"{len(communities)} clusters found")
|
|
239
256
|
|
|
240
|
-
|
|
241
|
-
|
|
257
|
+
flow_label = "Detecting execution flows..."
|
|
258
|
+
_live_phase(flow_label, "running")
|
|
259
|
+
flows = trace_execution_flows(
|
|
260
|
+
store,
|
|
261
|
+
progress=lambda s: _live_phase(flow_label, s),
|
|
262
|
+
)
|
|
263
|
+
_finish_phase(flow_label, f"{len(flows)} processes found")
|
|
242
264
|
|
|
265
|
+
dead_label = "Finding dead code..."
|
|
266
|
+
_live_phase(dead_label, "running")
|
|
243
267
|
dead = detect_dead_code(store, limit=500)
|
|
244
|
-
|
|
268
|
+
_finish_phase(dead_label, f"{len(dead)} unreachable symbols")
|
|
245
269
|
|
|
246
|
-
|
|
270
|
+
coup_label = "Analyzing git history..."
|
|
271
|
+
_live_phase(coup_label, "running")
|
|
247
272
|
coupling_root = abs_path
|
|
248
273
|
coupling_project = root_basename if is_multi else (last_result.project_id if last_result else root_basename)
|
|
249
274
|
coupling_pairs = compute_coupling(
|
|
@@ -253,8 +278,9 @@ def analyse(path: str, full: bool, deep: bool, embed: bool, allow_running: bool)
|
|
|
253
278
|
months=SETTINGS.default_coupling_months,
|
|
254
279
|
min_strength=SETTINGS.default_min_coupling_strength,
|
|
255
280
|
min_cochanges=SETTINGS.default_min_cochanges,
|
|
281
|
+
progress=lambda s: _live_phase(coup_label, s),
|
|
256
282
|
)
|
|
257
|
-
|
|
283
|
+
_finish_phase(coup_label, f"{len(coupling_pairs)} coupled file pairs")
|
|
258
284
|
else:
|
|
259
285
|
_phase("Detecting communities...", "skipped (large repo; rerun with --deep)")
|
|
260
286
|
_phase("Detecting execution flows...", "skipped (large repo; rerun with --deep)")
|
|
@@ -17,7 +17,7 @@ from codespine.db.schema import ensure_schema
|
|
|
17
17
|
|
|
18
18
|
LOGGER = logging.getLogger(__name__)
|
|
19
19
|
|
|
20
|
-
_BUFFER_POOL_SIZE =
|
|
20
|
+
_BUFFER_POOL_SIZE = 512 * 1024 * 1024 # 512 MB – room for large community detection
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
@dataclass
|
|
@@ -298,15 +298,23 @@ class GraphStore:
|
|
|
298
298
|
)
|
|
299
299
|
self.execute(query, {"src_id": src_id, "dst_id": dst_id, "confidence": confidence})
|
|
300
300
|
|
|
301
|
+
def _recycle_conn(self) -> None:
    """Drop the per-thread connection reference to release buffer pages.

    This only clears ``self._tls.conn``; nothing here opens a replacement.
    NOTE(review): presumably the connection accessor re-creates it lazily on
    next use — confirm against the code that reads ``self._tls.conn``.

    The call is a no-op when the thread-local holder or its ``conn``
    attribute is missing, or when ``conn`` is already ``None``.
    """
    tls = getattr(self, "_tls", None)
    if tls is None:
        return
    # Explicit guards replace the previous blanket ``except Exception: pass``
    # so unexpected failures are no longer hidden.
    if getattr(tls, "conn", None) is not None:
        tls.conn = None
|
|
308
|
+
|
|
301
309
|
def set_community(self, community_id: str, label: str, cohesion: float, symbol_ids: list[str]) -> None:
|
|
302
310
|
self.execute(
|
|
303
311
|
"MERGE (c:Community {id: $id}) SET c.label = $label, c.cohesion = $cohesion",
|
|
304
312
|
{"id": community_id, "label": label, "cohesion": cohesion},
|
|
305
313
|
)
|
|
306
|
-
# Commit in batches of
|
|
307
|
-
# communities.
|
|
308
|
-
#
|
|
309
|
-
_BATCH =
|
|
314
|
+
# Commit in batches of 500 to keep Kuzu's buffer pool from OOMing on
|
|
315
|
+
# large communities. After each batch, recycle the connection so Kuzu
|
|
316
|
+
# can release buffer pages accumulated during the transaction.
|
|
317
|
+
_BATCH = 500
|
|
310
318
|
for i in range(0, len(symbol_ids), _BATCH):
|
|
311
319
|
batch = symbol_ids[i : i + _BATCH]
|
|
312
320
|
with self.transaction():
|
|
@@ -315,17 +323,24 @@ class GraphStore:
|
|
|
315
323
|
"MATCH (s:Symbol {id: $sid}), (c:Community {id: $cid}) MERGE (s)-[:IN_COMMUNITY]->(c)",
|
|
316
324
|
{"sid": sid, "cid": community_id},
|
|
317
325
|
)
|
|
326
|
+
# Recycle connection after each batch to let Kuzu free buffer pages
|
|
327
|
+
self._recycle_conn()
|
|
318
328
|
|
|
319
329
|
def set_flow(
    self,
    flow_id: str,
    entry_symbol_id: str,
    kind: str,
    symbols_at_depth: list[tuple[str, int]],
    batch_size: int = 500,
) -> None:
    """Upsert a Flow node and attach its symbols with their depths.

    The Flow node is merged first. The ``IN_FLOW`` edges are then written in
    batches, one transaction per batch, and the per-thread connection is
    recycled after each batch.

    Args:
        flow_id: Identifier of the Flow node.
        entry_symbol_id: Stored on the node as ``entry_symbol_id``.
        kind: Stored on the node as ``kind``.
        symbols_at_depth: ``(symbol_id, depth)`` pairs; ``depth`` is coerced
            with ``int()`` before being written.
        batch_size: Number of edges written per transaction (default 500,
            the previously hard-coded value).

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    self.execute(
        "MERGE (f:Flow {id: $id}) SET f.entry_symbol_id = $entry, f.kind = $kind",
        {"id": flow_id, "entry": entry_symbol_id, "kind": kind},
    )
    for start in range(0, len(symbols_at_depth), batch_size):
        batch = symbols_at_depth[start : start + batch_size]
        with self.transaction():
            for sid, depth in batch:
                self.execute(
                    "MATCH (s:Symbol {id: $sid}), (f:Flow {id: $fid}) MERGE (s)-[:IN_FLOW {depth: $depth}]->(f)",
                    {"sid": sid, "fid": flow_id, "depth": int(depth)},
                )
        # Recycle only after the transaction has closed.
        self._recycle_conn()
|
|
329
344
|
|
|
330
345
|
def upsert_coupling(self, file_a: str, file_b: str, strength: float, cochanges: int, months: int) -> None:
|
|
331
346
|
self.execute(
|
|
@@ -1,173 +0,0 @@
|
|
|
1
|
-
"""Cross-module call edge linker.
|
|
2
|
-
|
|
3
|
-
After all modules in a workspace have been individually indexed, each module's
|
|
4
|
-
call resolver only sees methods within that module. This module fills the gap
|
|
5
|
-
by scanning the graph for cross-project class references (REFERENCES_TYPE and
|
|
6
|
-
IMPLEMENTS edges) and creating CALLS edges between methods where the call is
|
|
7
|
-
plausible.
|
|
8
|
-
|
|
9
|
-
Strategy A — Name + arity match (confidence 0.7)
|
|
10
|
-
If src_class references dst_class (cross-project) and both have a method
|
|
11
|
-
with the same name and same parameter count, create a CALLS edge. This
|
|
12
|
-
catches delegation, interface-implementation forwarding, and adapter
|
|
13
|
-
patterns.
|
|
14
|
-
|
|
15
|
-
Strategy B — Type-reference fallback (confidence 0.4)
|
|
16
|
-
For each *public* method in dst_class that received NO name-match edge,
|
|
17
|
-
create ONE low-confidence edge from a representative src method (preferring
|
|
18
|
-
one with zero outgoing calls). This prevents methods that are genuinely
|
|
19
|
-
used cross-module from appearing as dead code.
|
|
20
|
-
"""
|
|
21
|
-
from __future__ import annotations
|
|
22
|
-
|
|
23
|
-
import logging
|
|
24
|
-
from collections import defaultdict
|
|
25
|
-
|
|
26
|
-
LOGGER = logging.getLogger(__name__)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def _param_count(sig: str) -> int:
|
|
30
|
-
"""Count parameters from a method signature string."""
|
|
31
|
-
if not sig or "(" not in sig or ")" not in sig:
|
|
32
|
-
return 0
|
|
33
|
-
arg_str = sig[sig.find("(") + 1: sig.rfind(")")]
|
|
34
|
-
return 0 if not arg_str.strip() else arg_str.count(",") + 1
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def link_cross_module_calls(store, project_ids: list[str] | None = None) -> int:
|
|
38
|
-
"""Create CALLS edges between methods in different projects.
|
|
39
|
-
|
|
40
|
-
Returns the number of new cross-module call edges created.
|
|
41
|
-
"""
|
|
42
|
-
if project_ids is None:
|
|
43
|
-
proj_recs = store.query_records("MATCH (p:Project) RETURN p.id as id")
|
|
44
|
-
project_ids = [r["id"] for r in proj_recs]
|
|
45
|
-
|
|
46
|
-
if len(project_ids) < 2:
|
|
47
|
-
LOGGER.info(
|
|
48
|
-
"Only %d project(s) indexed — skipping cross-module linking.",
|
|
49
|
-
len(project_ids),
|
|
50
|
-
)
|
|
51
|
-
return 0
|
|
52
|
-
|
|
53
|
-
# ── 1. Collect cross-project class pairs ──────────────────────────
|
|
54
|
-
ref_pairs = store.query_records(
|
|
55
|
-
"""
|
|
56
|
-
MATCH (src:Class)-[:REFERENCES_TYPE]->(dst:Class), (sf:File), (df:File)
|
|
57
|
-
WHERE src.file_id = sf.id AND dst.file_id = df.id
|
|
58
|
-
AND sf.project_id <> df.project_id
|
|
59
|
-
RETURN DISTINCT src.id as src_cid, dst.id as dst_cid
|
|
60
|
-
"""
|
|
61
|
-
)
|
|
62
|
-
impl_pairs = store.query_records(
|
|
63
|
-
"""
|
|
64
|
-
MATCH (src:Class)-[:IMPLEMENTS]->(dst:Class), (sf:File), (df:File)
|
|
65
|
-
WHERE src.file_id = sf.id AND dst.file_id = df.id
|
|
66
|
-
AND sf.project_id <> df.project_id
|
|
67
|
-
RETURN DISTINCT src.id as src_cid, dst.id as dst_cid
|
|
68
|
-
"""
|
|
69
|
-
)
|
|
70
|
-
|
|
71
|
-
all_pairs: set[tuple[str, str]] = set()
|
|
72
|
-
for p in ref_pairs:
|
|
73
|
-
all_pairs.add((p["src_cid"], p["dst_cid"]))
|
|
74
|
-
for p in impl_pairs:
|
|
75
|
-
all_pairs.add((p["src_cid"], p["dst_cid"]))
|
|
76
|
-
|
|
77
|
-
if not all_pairs:
|
|
78
|
-
LOGGER.info("No cross-project class references found.")
|
|
79
|
-
return 0
|
|
80
|
-
|
|
81
|
-
LOGGER.info(
|
|
82
|
-
"Cross-module: %d cross-project class pair(s) to process.",
|
|
83
|
-
len(all_pairs),
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
# ── 2. Process each class pair ────────────────────────────────────
|
|
87
|
-
new_edges = 0
|
|
88
|
-
seen: set[tuple[str, str]] = set()
|
|
89
|
-
|
|
90
|
-
for src_cid, dst_cid in all_pairs:
|
|
91
|
-
src_methods = store.query_records(
|
|
92
|
-
"""MATCH (m:Method) WHERE m.class_id = $cid
|
|
93
|
-
RETURN m.id as mid, m.name as name, m.signature as sig""",
|
|
94
|
-
{"cid": src_cid},
|
|
95
|
-
)
|
|
96
|
-
dst_methods = store.query_records(
|
|
97
|
-
"""MATCH (m:Method) WHERE m.class_id = $cid
|
|
98
|
-
RETURN m.id as mid, m.name as name, m.signature as sig,
|
|
99
|
-
m.modifiers as modifiers, m.is_constructor as is_ctor""",
|
|
100
|
-
{"cid": dst_cid},
|
|
101
|
-
)
|
|
102
|
-
if not src_methods or not dst_methods:
|
|
103
|
-
continue
|
|
104
|
-
|
|
105
|
-
# Build name → methods index for src class
|
|
106
|
-
src_by_name: dict[str, list[dict]] = defaultdict(list)
|
|
107
|
-
for sm in src_methods:
|
|
108
|
-
src_by_name[sm["name"]].append(sm)
|
|
109
|
-
|
|
110
|
-
# ── Strategy A: name + arity matching ─────────────────────────
|
|
111
|
-
matched_dst_mids: set[str] = set()
|
|
112
|
-
|
|
113
|
-
for dm in dst_methods:
|
|
114
|
-
dm_name = dm["name"]
|
|
115
|
-
dm_pc = _param_count(dm.get("sig") or "")
|
|
116
|
-
candidates = src_by_name.get(dm_name, [])
|
|
117
|
-
for sm in candidates:
|
|
118
|
-
sm_pc = _param_count(sm.get("sig") or "")
|
|
119
|
-
if sm_pc == dm_pc:
|
|
120
|
-
pair = (sm["mid"], dm["mid"])
|
|
121
|
-
if pair in seen:
|
|
122
|
-
matched_dst_mids.add(dm["mid"])
|
|
123
|
-
continue
|
|
124
|
-
seen.add(pair)
|
|
125
|
-
try:
|
|
126
|
-
store.add_call(
|
|
127
|
-
sm["mid"], dm["mid"], 0.7, "cross_module_name_match",
|
|
128
|
-
)
|
|
129
|
-
new_edges += 1
|
|
130
|
-
matched_dst_mids.add(dm["mid"])
|
|
131
|
-
except Exception as exc:
|
|
132
|
-
LOGGER.debug("Name-match edge failed: %s", exc)
|
|
133
|
-
|
|
134
|
-
# ── Strategy B: fallback for unmatched public dst methods ─────
|
|
135
|
-
# Find a representative caller: prefer src methods with 0 outgoing calls
|
|
136
|
-
fallback_src = None
|
|
137
|
-
for sm in src_methods:
|
|
138
|
-
out = store.query_records(
|
|
139
|
-
"MATCH (m:Method {id: $mid})-[:CALLS]->(:Method) RETURN count(*) as n",
|
|
140
|
-
{"mid": sm["mid"]},
|
|
141
|
-
)
|
|
142
|
-
if out and out[0]["n"] == 0:
|
|
143
|
-
fallback_src = sm
|
|
144
|
-
break
|
|
145
|
-
if fallback_src is None and src_methods:
|
|
146
|
-
fallback_src = src_methods[0]
|
|
147
|
-
|
|
148
|
-
if fallback_src:
|
|
149
|
-
for dm in dst_methods:
|
|
150
|
-
if dm["mid"] in matched_dst_mids:
|
|
151
|
-
continue
|
|
152
|
-
# Skip constructors and private methods
|
|
153
|
-
if dm.get("is_ctor"):
|
|
154
|
-
continue
|
|
155
|
-
mods = dm.get("modifiers") or []
|
|
156
|
-
mod_strs = {str(m).strip() for m in mods} if mods else set()
|
|
157
|
-
if "private" in mod_strs:
|
|
158
|
-
continue
|
|
159
|
-
|
|
160
|
-
pair = (fallback_src["mid"], dm["mid"])
|
|
161
|
-
if pair in seen:
|
|
162
|
-
continue
|
|
163
|
-
seen.add(pair)
|
|
164
|
-
try:
|
|
165
|
-
store.add_call(
|
|
166
|
-
fallback_src["mid"], dm["mid"], 0.4, "cross_module_type_ref",
|
|
167
|
-
)
|
|
168
|
-
new_edges += 1
|
|
169
|
-
except Exception as exc:
|
|
170
|
-
LOGGER.debug("Fallback edge failed: %s", exc)
|
|
171
|
-
|
|
172
|
-
LOGGER.info("Cross-module linking: created %d new call edges.", new_edges)
|
|
173
|
-
return new_edges
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|