code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,874 @@
|
|
|
1
|
+
"""Community/cluster detection for the code knowledge graph.
|
|
2
|
+
|
|
3
|
+
Detects communities of related code nodes using the Leiden algorithm (via igraph,
|
|
4
|
+
optional) with a file-based grouping fallback when igraph is not installed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import random
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter, defaultdict
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from .graph import GraphEdge, GraphNode, GraphStore, _sanitize_name
|
|
16
|
+
|
|
17
|
+
# Fixed seed for igraph's RNG so Leiden community detection is reproducible
|
|
18
|
+
# across runs. Without this, two builds of the same graph produce different
|
|
19
|
+
# community IDs / sizes, breaking benchmark comparability. Override with
|
|
20
|
+
# CRG_LEIDEN_SEED env var if you need a different seed.
|
|
21
|
+
_LEIDEN_SEED = 42
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
# Stay well under SQLite's default 999-variable limit per statement.
|
|
26
|
+
_SQL_BATCH = 450
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Optional igraph import
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
import igraph as ig # type: ignore[import-untyped]
|
|
34
|
+
|
|
35
|
+
IGRAPH_AVAILABLE = True
|
|
36
|
+
except ImportError:
|
|
37
|
+
ig = None # type: ignore[assignment]
|
|
38
|
+
IGRAPH_AVAILABLE = False
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------------------
|
|
41
|
+
# Edge weight mapping
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
# Weight given to each edge kind when it becomes an igraph edge for Leiden
# clustering; a larger value pulls the two endpoints together more strongly.
# Edge kinds missing from this table are weighted 0.5 by the lookup sites
# (``EDGE_WEIGHTS.get(e.kind, 0.5)``).
EDGE_WEIGHTS: dict[str, float] = {
    "CALLS": 1.0,
    "IMPORTS_FROM": 0.5,
    "INHERITS": 0.8,
    "IMPLEMENTS": 0.7,
    "CONTAINS": 0.3,
    "TESTED_BY": 0.4,
    "DEPENDS_ON": 0.6,
}
|
|
53
|
+
|
|
54
|
+
# Common words to filter when generating community names. Entries are
# lowercase because name fragments are lowercased before being checked
# against this set.
_COMMON_WORDS = frozenset({
    "get", "set", "self", "init", "new", "create", "update", "delete",
    "add", "remove", "make", "build", "from", "to", "for", "with",
    "the", "and", "test", "main", "run", "do", "is", "has", "on",
    "of", "in", "at", "by", "my", "this", "that", "all", "none",
})
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
# Community naming
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _generate_community_name(members: list[GraphNode]) -> str:
    """Derive a short, human-readable label for a group of graph nodes.

    Naming rules, in priority order:

    * An empty group is always labelled ``"empty"``.
    * If a single class name accounts for more than 40% of the members, the
      label is built from that class: ``"{prefix}-{class}"``, or just the
      class slug when no file prefix could be derived.
    * Otherwise the label joins the most common file prefix and the most
      frequent keyword from member names, using whichever of the two exist.
    * ``"cluster"`` is the last-resort label.
    """
    if not members:
        return "empty"

    prefix = _extract_file_prefix([node.file_path for node in members])

    # A class that dominates the group is a better label than any keyword.
    class_tally = Counter(node.name for node in members if node.kind == "Class")
    if class_tally:
        leader, occurrences = class_tally.most_common(1)[0]
        if occurrences > len(members) * 0.4:
            leader_slug = _to_slug(leader)
            return f"{prefix}-{leader_slug}" if prefix else leader_slug

    ranked_keywords = _extract_keywords(members)
    keyword = ranked_keywords[0] if ranked_keywords else ""

    label_parts = [part for part in (prefix, keyword) if part]
    return "-".join(label_parts) if label_parts else "cluster"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_file_prefix(file_paths: list[str]) -> str:
    """Return a slug of the directory (or module) name most paths share.

    Each path contributes its immediate parent directory. A bare file name
    with no directory component contributes its stem (text before the last
    dot) instead. The most frequent contribution is returned as a slug; an
    empty input yields an empty string.
    """
    if not file_paths:
        return ""

    def _label(path: str) -> str:
        # Normalise Windows separators so both path styles split alike.
        pieces = path.replace("\\", "/").split("/")
        if len(pieces) < 2:
            return pieces[-1].rsplit(".", 1)[0]
        return pieces[-2]

    tally = Counter(_label(path) for path in file_paths)
    winner = tally.most_common(1)[0][0]
    return _to_slug(winner)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _extract_keywords(members: list[GraphNode]) -> list[str]:
    """Return up to five of the most frequent meaningful words in member names.

    Only nodes of kind ``Function``, ``Class``, ``Test`` or ``Type`` are
    considered. Each name is split into words, lowercased, and counted
    unless the word is a single character or appears in ``_COMMON_WORDS``.
    The result is ordered from most to least frequent and is empty when
    nothing qualifies.
    """
    nameable_kinds = ("Function", "Class", "Test", "Type")
    lowered_words = (
        fragment.lower()
        for node in members
        if node.kind in nameable_kinds
        for fragment in _split_name(node.name)
    )
    tally = Counter(
        word
        for word in lowered_words
        if len(word) > 1 and word not in _COMMON_WORDS
    )
    return [word for word, _ in tally.most_common(5)]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _split_name(name: str) -> list[str]:
|
|
145
|
+
"""Split a camelCase or snake_case name into words."""
|
|
146
|
+
# Insert boundary before uppercase letters for camelCase
|
|
147
|
+
s = re.sub(r"([a-z])([A-Z])", r"\1_\2", name)
|
|
148
|
+
# Split on underscores, hyphens, dots
|
|
149
|
+
return [p for p in re.split(r"[_\-.\s]+", s) if p]
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _to_slug(s: str) -> str:
|
|
153
|
+
"""Convert a string to a short lowercase slug."""
|
|
154
|
+
return re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")[:30]
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
# Cohesion calculation
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _compute_cohesion_batch(
|
|
163
|
+
community_member_qns: list[set[str]],
|
|
164
|
+
all_edges: list[GraphEdge],
|
|
165
|
+
) -> list[float]:
|
|
166
|
+
"""Compute cohesion for multiple communities in a single O(edges) pass.
|
|
167
|
+
|
|
168
|
+
Builds a ``qualified_name -> community_index`` reverse map (each node
|
|
169
|
+
appears in at most one community since all callers produce partitions),
|
|
170
|
+
then walks every edge exactly once, bucketing it into internal/external
|
|
171
|
+
counters per community.
|
|
172
|
+
|
|
173
|
+
Total work: O(edges + sum(|members|)) instead of
|
|
174
|
+
O(edges * communities) for naive per-community cohesion.
|
|
175
|
+
|
|
176
|
+
Returns a list of cohesion scores aligned with ``community_member_qns``.
|
|
177
|
+
"""
|
|
178
|
+
qn_to_idx: dict[str, int] = {}
|
|
179
|
+
for idx, members in enumerate(community_member_qns):
|
|
180
|
+
for qn in members:
|
|
181
|
+
qn_to_idx[qn] = idx
|
|
182
|
+
|
|
183
|
+
n = len(community_member_qns)
|
|
184
|
+
internal = [0] * n
|
|
185
|
+
external = [0] * n
|
|
186
|
+
|
|
187
|
+
for e in all_edges:
|
|
188
|
+
sc = qn_to_idx.get(e.source_qualified)
|
|
189
|
+
tc = qn_to_idx.get(e.target_qualified)
|
|
190
|
+
if sc is None and tc is None:
|
|
191
|
+
continue
|
|
192
|
+
if sc == tc:
|
|
193
|
+
# Safe: sc is not None here (sc == tc and not both None).
|
|
194
|
+
assert sc is not None
|
|
195
|
+
internal[sc] += 1
|
|
196
|
+
else:
|
|
197
|
+
if sc is not None:
|
|
198
|
+
external[sc] += 1
|
|
199
|
+
if tc is not None:
|
|
200
|
+
external[tc] += 1
|
|
201
|
+
|
|
202
|
+
results: list[float] = []
|
|
203
|
+
for i in range(n):
|
|
204
|
+
total = internal[i] + external[i]
|
|
205
|
+
results.append(internal[i] / total if total > 0 else 0.0)
|
|
206
|
+
return results
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _build_adjacency(edges: list[GraphEdge]) -> dict[str, list[str]]:
|
|
210
|
+
"""Build adjacency list from edges (one pass over all edges)."""
|
|
211
|
+
adj: dict[str, list[str]] = defaultdict(list)
|
|
212
|
+
for e in edges:
|
|
213
|
+
adj[e.source_qualified].append(e.target_qualified)
|
|
214
|
+
adj[e.target_qualified].append(e.source_qualified)
|
|
215
|
+
return adj
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _compute_cohesion(
    member_qns: set[str],
    all_edges: list[GraphEdge],
    adj: dict[str, list[str]] | None = None,
) -> float:
    """Return one community's cohesion: internal / (internal + external) edges.

    This is a single-community convenience wrapper around
    :func:`_compute_cohesion_batch`. When scoring many communities, call the
    batch function directly so the edge list is walked only once.

    ``adj`` is accepted but not consulted by this implementation.
    """
    (score,) = _compute_cohesion_batch([member_qns], all_edges)
    return score
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# ---------------------------------------------------------------------------
|
|
232
|
+
# Leiden-based community detection (igraph)
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def _detect_leiden(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    min_size: int,
    adj: dict[str, list[str]] | None = None,
) -> list[dict[str, Any]]:
    """Detect communities using the Leiden algorithm via igraph.

    Builds an undirected, weighted igraph graph from ``nodes`` and ``edges``
    and runs Leiden with the modularity objective, capped at
    ``n_iterations=2``. No recursive sub-community splitting is done here.

    Args:
        nodes: Candidate nodes. If a ``qualified_name`` occurs more than
            once, only its first node is used.
        edges: All graph edges. Edges with an endpoint outside ``nodes`` and
            self-loops are ignored for clustering, but every edge is still
            considered when cohesion is computed.
        min_size: Clusters with fewer members than this are dropped.
        adj: Not used here; only forwarded to the file-based fallback.

    Returns:
        One dict per kept cluster with keys ``name``, ``level``, ``size``,
        ``cohesion``, ``dominant_language``, ``description``, ``members``
        and ``member_qns``. Returns ``[]`` when igraph is unavailable or
        there are no nodes, and the file-based grouping when no usable edge
        exists.
    """
    import math
    import os

    if ig is None:
        return []

    # igraph vertex ids must be dense (0..n-1). Numbering only the first
    # occurrence of each qualified name keeps ids in range even if the
    # caller passes duplicates.
    qn_to_idx: dict[str, int] = {}
    indexed_nodes: list[GraphNode] = []
    for node in nodes:
        if node.qualified_name not in qn_to_idx:
            qn_to_idx[node.qualified_name] = len(indexed_nodes)
            indexed_nodes.append(node)

    if not qn_to_idx:
        return []

    logger.info("Building igraph with %d nodes...", len(qn_to_idx))

    g = ig.Graph(n=len(qn_to_idx), directed=False)
    edge_list: list[tuple[int, int]] = []
    weights: list[float] = []
    seen_edges: set[tuple[int, int]] = set()

    for e in edges:
        src_idx = qn_to_idx.get(e.source_qualified)
        tgt_idx = qn_to_idx.get(e.target_qualified)
        if src_idx is None or tgt_idx is None or src_idx == tgt_idx:
            continue
        # Collapse parallel and reverse edges into one undirected edge; the
        # first edge seen for a pair decides its weight.
        pair = (min(src_idx, tgt_idx), max(src_idx, tgt_idx))
        if pair in seen_edges:
            continue
        seen_edges.add(pair)
        edge_list.append(pair)
        weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))

    if not edge_list:
        return _detect_file_based(nodes, edges, min_size, adj=adj)

    g.add_edges(edge_list)
    g.es["weight"] = weights

    # Run Leiden -- scale resolution inversely with graph size to get
    # coarser clusters on large repos. Default resolution=1.0 produces
    # thousands of tiny communities for 30k+ node graphs.
    n_nodes = g.vcount()
    resolution = max(0.05, 1.0 / math.log10(max(n_nodes, 10)))

    logger.info(
        "Running Leiden on %d nodes, %d edges...",
        g.vcount(), g.ecount(),
    )

    # A malformed override must not abort community detection; fall back to
    # the built-in seed and say so.
    raw_seed = os.environ.get("CRG_LEIDEN_SEED")
    try:
        seed = int(raw_seed) if raw_seed is not None else _LEIDEN_SEED
    except ValueError:
        logger.warning(
            "Ignoring non-integer CRG_LEIDEN_SEED=%r; using default seed %d",
            raw_seed, _LEIDEN_SEED,
        )
        seed = _LEIDEN_SEED
    # Deterministic seeding for benchmark reproducibility — community
    # detection is not a security-sensitive context. nosec B311.
    ig.set_random_number_generator(random.Random(seed))  # nosec B311
    partition = g.community_leiden(
        objective_function="modularity",
        weights="weight",
        resolution=resolution,
        n_iterations=2,
    )

    logger.info(
        "Leiden complete, found %d partitions. Computing cohesion...",
        len(partition),
    )

    pending: list[tuple[list[GraphNode], set[str]]] = []
    for cluster_ids in partition:
        if len(cluster_ids) < min_size:
            continue
        members = [indexed_nodes[i] for i in cluster_ids]
        pending.append((members, {m.qualified_name for m in members}))

    # One pass over the edges scores every kept cluster.
    cohesions = _compute_cohesion_batch([p[1] for p in pending], edges)

    communities: list[dict[str, Any]] = []
    for (members, member_qns), cohesion in zip(pending, cohesions):
        lang_counts = Counter(m.language for m in members if m.language)
        dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""
        name = _generate_community_name(members)

        communities.append({
            "name": name,
            "level": 0,
            "size": len(members),
            "cohesion": round(cohesion, 4),
            "dominant_language": dominant_lang,
            "description": f"Community of {len(members)} nodes",
            "members": [m.qualified_name for m in members],
            "member_qns": member_qns,
        })

    logger.info("Community detection complete: %d communities", len(communities))
    return communities
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
# ---------------------------------------------------------------------------
|
|
346
|
+
# File-based fallback community detection
|
|
347
|
+
# ---------------------------------------------------------------------------
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _detect_file_based(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    min_size: int,
    adj: dict[str, list[str]] | None = None,
) -> list[dict[str, Any]]:
    """Group nodes into communities by the directory their file lives in.

    The directory prefix shared by every node is ignored. Nodes are then
    grouped by the first ``depth`` remaining directory components, starting
    at depth 1 and going deeper until at least 10 groups reach ``min_size``
    or the deepest directory level has been tried; the last grouping tried
    is used. Nodes with no remaining directory component are grouped by
    their file stem. Groups smaller than ``min_size`` are dropped.

    ``adj`` is accepted but not used by this implementation.

    Returns:
        One dict per kept group with keys ``name``, ``level``, ``size``,
        ``cohesion``, ``dominant_language``, ``description``, ``members``
        and ``member_qns``.
    """

    def _segments(node: GraphNode) -> list[str]:
        # Normalise Windows separators before splitting.
        return node.file_path.replace("\\", "/").split("/")

    # Directory components per node (file name and empty segments removed).
    dir_lists: list[list[str]] = [
        [seg for seg in _segments(node)[:-1] if seg] for node in nodes
    ]

    # Length of the directory prefix common to all nodes; zip() stops at the
    # shallowest path, so no explicit bound is needed.
    shared = 0
    for column in zip(*dir_lists):
        if any(seg != column[0] for seg in column):
            break
        shared += 1

    def _bucket(depth: int) -> dict[str, list[GraphNode]]:
        buckets: dict[str, list[GraphNode]] = defaultdict(list)
        for node, dirs in zip(nodes, dir_lists):
            tail = dirs[shared:]
            if tail:
                label = "/".join(tail[:depth])
            else:
                pieces = _segments(node)
                label = pieces[-1].rsplit(".", 1)[0] if pieces else "root"
            buckets[label].append(node)
        return buckets

    deepest = max((len(dirs) - shared for dirs in dir_lists), default=0)
    by_dir: dict[str, list[GraphNode]] | None = None
    for depth in range(1, deepest + 1):
        by_dir = _bucket(depth)
        if sum(len(group) >= min_size for group in by_dir.values()) >= 10:
            break
    if by_dir is None:
        # No directory level below the shared prefix: group by file stem.
        by_dir = _bucket(1)

    # Collect the surviving groups first so all cohesion scores come from a
    # single pass over the edges.
    kept = [
        (label, group, {n.qualified_name for n in group})
        for label, group in by_dir.items()
        if len(group) >= min_size
    ]
    scores = _compute_cohesion_batch([qns for _, _, qns in kept], edges)

    communities: list[dict[str, Any]] = []
    for (label, group, qns), score in zip(kept, scores):
        languages = Counter(n.language for n in group if n.language)
        communities.append({
            "name": _generate_community_name(group),
            "level": 0,
            "size": len(group),
            "cohesion": round(score, 4),
            "dominant_language": (
                languages.most_common(1)[0][0] if languages else ""
            ),
            "description": f"Directory-based community: {label}",
            "members": [n.qualified_name for n in group],
            "member_qns": qns,
        })

    return communities
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# ---------------------------------------------------------------------------
|
|
437
|
+
# Oversized community splitting
|
|
438
|
+
# ---------------------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def _split_oversized(
    communities: list[dict],
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    threshold_pct: float = 0.25,
    min_split_size: int = 10,
) -> list[dict]:
    """Split communities that hold more than ``threshold_pct`` of all members.

    Runs Leiden once on the subgraph induced by each oversized community.
    This is a single pass: resulting sub-communities are not examined again,
    even if one of them is still above the threshold. A community is kept
    unchanged when it is not oversized, has fewer than ``min_split_size``
    resolvable nodes, has no internal edges, Leiden finds a single cluster,
    or splitting raises.

    Args:
        communities: Community dicts with at least ``members``.
        nodes: All graph nodes, used to resolve member names.
        edges: All graph edges.
        threshold_pct: Fraction of the total member count above which a
            community is considered oversized.
        min_split_size: Lower bound for the size threshold and for the
            number of nodes a community needs to be split.

    Returns:
        A new list in which each split community is replaced by its
        sub-communities (``level`` incremented, ``parent_id`` set, cohesion
        computed against ``edges``). If igraph is unavailable or the total
        size is zero, ``communities`` itself is returned.
    """
    import os

    if not IGRAPH_AVAILABLE:
        return communities

    total = sum(
        c.get("size", len(c.get("members", [])))
        for c in communities
    )
    if total == 0:
        return communities

    threshold = max(int(total * threshold_pct), min_split_size)
    result: list[dict] = []
    next_id = max(
        (c.get("id", 0) for c in communities), default=0
    ) + 1

    # Resolve the seed once. A malformed override falls back to the default
    # instead of making every split attempt fail.
    raw_seed = os.environ.get("CRG_LEIDEN_SEED")
    try:
        seed = int(raw_seed) if raw_seed is not None else _LEIDEN_SEED
    except ValueError:
        logger.warning(
            "Ignoring non-integer CRG_LEIDEN_SEED=%r; using default seed %d",
            raw_seed, _LEIDEN_SEED,
        )
        seed = _LEIDEN_SEED

    for comm in communities:
        members = set(comm.get("members", []))
        if len(members) <= threshold:
            result.append(comm)
            continue

        # Build subgraph for this community
        member_nodes = [
            n for n in nodes
            if n.qualified_name in members
        ]
        if len(member_nodes) < min_split_size:
            result.append(comm)
            continue

        qn_to_idx = {
            n.qualified_name: i
            for i, n in enumerate(member_nodes)
        }
        ig_edges: list[tuple[int, int]] = []
        ig_weights: list[float] = []
        for e in edges:
            si = qn_to_idx.get(e.source_qualified)
            ti = qn_to_idx.get(e.target_qualified)
            # Keep only edges fully inside the community; skip self-loops.
            if si is not None and ti is not None and si != ti:
                ig_edges.append((si, ti))
                ig_weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))

        if not ig_edges:
            result.append(comm)
            continue

        try:
            g = ig.Graph(
                n=len(member_nodes),
                edges=ig_edges,
                directed=False,
            )
            g.es["weight"] = ig_weights
            # Deterministic seeding for benchmark reproducibility — community
            # detection is not a security-sensitive context. nosec B311.
            ig.set_random_number_generator(random.Random(seed))  # nosec B311
            partition = g.community_leiden(
                objective_function="modularity",
                weights="weight",
                resolution=0.5,
            )

            sub_communities: dict[int, list[str]] = {}
            for idx, cid in enumerate(partition.membership):
                sub_communities.setdefault(cid, []).append(
                    member_nodes[idx].qualified_name
                )

            if len(sub_communities) <= 1:
                result.append(comm)
                continue

            # Score the pieces before emitting any of them, so a failure
            # here cannot leave a half-split community in ``result``.
            sub_groups = list(sub_communities.values())
            cohesions = _compute_cohesion_batch(
                [set(group) for group in sub_groups], edges
            )

            parent_id = comm.get("id", 0)
            comm_name = comm.get("name", "")
            for sub_members, cohesion in zip(sub_groups, cohesions):
                sub_comm = {
                    "id": next_id,
                    "name": comm_name + f"-sub{next_id}",
                    "level": comm.get("level", 0) + 1,
                    "parent_id": parent_id,
                    "members": sub_members,
                    "size": len(sub_members),
                    "cohesion": round(cohesion, 4),
                    # Inherited from the parent rather than recomputed.
                    "dominant_language": comm.get(
                        "dominant_language"
                    ),
                    "description": (
                        f"Split from {comm_name}"
                    ),
                }
                result.append(sub_comm)
                next_id += 1

            logger.info(
                "Split oversized community '%s' "
                "(%d members) into %d",
                comm_name,
                len(members),
                len(sub_communities),
            )
        except Exception:
            # Best effort: a failed split must not lose the community.
            logger.warning(
                "Failed to split community '%s', "
                "keeping as-is",
                comm.get("name", ""),
                exc_info=True,
            )
            result.append(comm)

    return result
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# ---------------------------------------------------------------------------
|
|
581
|
+
# Public API
|
|
582
|
+
# ---------------------------------------------------------------------------
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def detect_communities(
    store: GraphStore, min_size: int = 2
) -> list[dict[str, Any]]:
    """Detect communities in the code graph.

    Uses the Leiden algorithm via igraph if available, otherwise falls back
    to file-based grouping. Oversized communities are then split.

    Args:
        store: The GraphStore instance.
        min_size: Minimum number of nodes for a community to be included.

    Returns:
        List of community dicts with keys: name, level, size, cohesion,
        dominant_language, description, members. Sub-communities produced
        by splitting also carry ``id`` and ``parent_id``. The internal
        ``member_qns`` set is removed before returning.
    """
    # Gather all nodes (exclude File nodes to focus on code entities)
    all_edges = store.get_all_edges()
    unique_nodes = store.get_all_nodes(exclude_files=True)

    logger.info(
        "Loaded %d unique nodes, %d edges",
        len(unique_nodes), len(all_edges),
    )

    # No adjacency index is built: neither detector reads one (cohesion is
    # computed in a single pass over the edge list), so building it only
    # cost time and memory.
    if IGRAPH_AVAILABLE:
        logger.info("Detecting communities with Leiden algorithm (igraph)")
        results = _detect_leiden(unique_nodes, all_edges, min_size)
    else:
        logger.info("igraph not available, using file-based community detection")
        results = _detect_file_based(unique_nodes, all_edges, min_size)

    # Split oversized communities
    results = _split_oversized(
        results, unique_nodes, all_edges,
    )

    # ``member_qns`` is an internal set (not JSON-serializable) used only
    # during detection; drop it from what callers receive.
    for comm in results:
        comm.pop("member_qns", None)

    return results
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def incremental_detect_communities(
    store: GraphStore,
    changed_files: list[str],
    min_size: int = 2,
) -> int:
    """Re-detect communities only if changed files affect existing communities.

    If no node belonging to a changed file currently has a ``community_id``,
    re-detection is skipped entirely (the common case for small changes).
    Otherwise full community detection is re-run and the result is stored.

    Args:
        store: The GraphStore instance.
        changed_files: List of file paths that have changed.
        min_size: Minimum number of nodes for a community to be included.

    Returns:
        The value returned by ``store_communities`` after re-detection, or 0
        if detection was skipped.
    """
    if not changed_files:
        return 0

    conn = store._conn

    # Query in batches to stay under SQLite's bound-variable limit, and stop
    # at the first batch that touches a community: only "any affected?"
    # matters, so later batches need not be queried.
    for start in range(0, len(changed_files), _SQL_BATCH):
        batch = changed_files[start:start + _SQL_BATCH]
        placeholders = ",".join("?" * len(batch))
        row = conn.execute(
            f"SELECT COUNT(DISTINCT community_id) FROM nodes "  # nosec B608
            f"WHERE community_id IS NOT NULL AND file_path IN ({placeholders})",
            batch,
        ).fetchone()
        if row and row[0]:
            break
    else:
        return 0  # No communities affected, skip

    # Re-run full community detection (correct and fast enough)
    communities = detect_communities(store, min_size=min_size)
    return store_communities(store, communities)
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
def store_communities(
    store: GraphStore, communities: list[dict[str, Any]]
) -> int:
    """Persist detected communities, replacing whatever was stored before.

    Every row in ``communities`` is deleted and every node's
    ``community_id`` is reset to NULL; each community is then inserted and
    its member nodes are pointed at the freshly created row.  The whole
    sequence runs inside one ``BEGIN IMMEDIATE`` transaction that is rolled
    back if anything raises.

    Args:
        store: The GraphStore instance.
        communities: List of community dicts from detect_communities().

    Returns:
        Number of communities stored.
    """
    # NOTE: the raw connection is used on purpose -- this is a
    # multi-statement batch write (DELETE + INSERT loop + UPDATE loop) that
    # is tightly coupled to the DB transaction lifecycle.
    db = store._conn

    # Discard any pending uncommitted work so the explicit BEGIN below
    # starts a fresh transaction.
    if db.in_transaction:
        logger.warning("Rolling back uncommitted transaction before BEGIN IMMEDIATE")
        db.rollback()

    # One explicit transaction keeps DELETE + INSERT + UPDATE atomic, so a
    # crash cannot leave partial community data behind.
    db.execute("BEGIN IMMEDIATE")
    try:
        db.execute("DELETE FROM communities")
        db.execute("UPDATE nodes SET community_id = NULL")

        stored = 0
        for community in communities:
            row_values = (
                community["name"],
                community.get("level", 0),
                community.get("cohesion", 0.0),
                community["size"],
                community.get("dominant_language", ""),
                community.get("description", ""),
            )
            new_id = db.execute(
                """INSERT INTO communities
                   (name, level, cohesion, size, dominant_language, description)
                   VALUES (?, ?, ?, ?, ?, ?)""",
                row_values,
            ).lastrowid

            # Point the member nodes at the new community row, in batches.
            members = community.get("members", [])
            for offset in range(0, len(members), _SQL_BATCH):
                chunk = members[offset:offset + _SQL_BATCH]
                marks = ",".join("?" * len(chunk))
                db.execute(
                    f"UPDATE nodes SET community_id = ? WHERE qualified_name IN ({marks})",  # nosec B608
                    [new_id] + chunk,
                )
            stored += 1

        db.commit()
    except BaseException:
        # Undo everything, including on interrupts, then propagate.
        db.rollback()
        raise
    return stored
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def get_communities(
    store: GraphStore, sort_by: str = "size", min_size: int = 0
) -> list[dict[str, Any]]:
    """Load stored communities together with their member qualified names.

    Args:
        store: The GraphStore instance.
        sort_by: Column to sort by ("size", "cohesion", "name").  Any other
            value falls back to "size".
        min_size: Minimum community size to include.

    Returns:
        List of community dicts with keys id, name, level, cohesion, size,
        dominant_language, description and members.  "size" and "cohesion"
        are sorted descending, "name" ascending.
    """
    # sort_by is interpolated into the SQL text, so it is first restricted
    # to a fixed set of column names, each with its sort direction.
    order_for = {"size": "DESC", "cohesion": "DESC", "name": "ASC"}
    column = sort_by if sort_by in order_for else "size"
    direction = order_for[column]

    # NOTE: the communities table has no dedicated GraphStore method (it is
    # a domain-specific table managed entirely by the communities module),
    # so the raw connection is used for this query.
    query = f"SELECT * FROM communities WHERE size >= ? ORDER BY {column} {direction}"  # nosec B608
    records = store._conn.execute(query, (min_size,)).fetchall()

    result: list[dict[str, Any]] = []
    for record in records:
        # Member qualified names are fetched per community and sanitised.
        members = list(
            map(_sanitize_name, store.get_community_member_qns(record["id"]))
        )
        entry = {
            "id": record["id"],
            "name": _sanitize_name(record["name"]),
            "level": record["level"],
            "cohesion": record["cohesion"],
            "size": record["size"],
            "dominant_language": record["dominant_language"] or "",
            "description": _sanitize_name(record["description"] or ""),
            "members": members,
        }
        result.append(entry)

    return result
|
|
793
|
+
|
|
794
|
+
|
|
795
|
+
_TEST_COMMUNITY_RE = re.compile(
|
|
796
|
+
r"(^test[-/]|[-/]test([:/]|$)|it:should|describe:|spec[-/]|[-/]spec$)",
|
|
797
|
+
re.IGNORECASE,
|
|
798
|
+
)
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
def _is_test_community(name: str) -> bool:
|
|
802
|
+
"""Return True if a community name indicates it is test-dominated."""
|
|
803
|
+
return bool(_TEST_COMMUNITY_RE.search(name))
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
def get_architecture_overview(
    store: GraphStore, *, coupling_threshold: int = 10
) -> dict[str, Any]:
    """Generate an architecture overview based on community structure.

    Maps every community member to its community, collects the edges whose
    two endpoints belong to different communities, and emits a warning for
    each community pair joined by more than ``coupling_threshold`` such
    edges.

    Args:
        store: The GraphStore instance.
        coupling_threshold: A community pair is reported as highly coupled
            when it shares strictly more than this many cross-community
            edges.  Defaults to 10.

    Returns:
        Dict with keys: communities, cross_community_edges, warnings.
    """
    communities = get_communities(store)

    # Build node -> community_id mapping.
    # NOTE(review): members returned by get_communities() have already been
    # passed through _sanitize_name, while the edge endpoints below are
    # looked up unsanitised; if sanitising ever alters a qualified name,
    # edges touching that node are silently skipped -- confirm this is
    # acceptable.
    node_to_community: dict[str, int] = {
        qn: comm.get("id", 0)
        for comm in communities
        for qn in comm.get("members", [])
    }

    # Count cross-community edges
    cross_edges: list[dict[str, Any]] = []
    cross_counts: Counter[tuple[int, int]] = Counter()

    for e in store.get_all_edges():
        # TESTED_BY edges are expected cross-community coupling (test → code),
        # not an architectural smell.
        if e.kind == "TESTED_BY":
            continue
        src_comm = node_to_community.get(e.source_qualified)
        tgt_comm = node_to_community.get(e.target_qualified)
        # Ignore edges with an endpoint outside every community and edges
        # that stay inside a single community.
        if src_comm is None or tgt_comm is None or src_comm == tgt_comm:
            continue

        # Pairs are keyed order-independently so A->B and B->A share a count.
        pair = (min(src_comm, tgt_comm), max(src_comm, tgt_comm))
        cross_counts[pair] += 1
        cross_edges.append({
            "source_community": src_comm,
            "target_community": tgt_comm,
            "edge_kind": e.kind,
            "source": _sanitize_name(e.source_qualified),
            "target": _sanitize_name(e.target_qualified),
        })

    # Generate warnings for high coupling, skipping test-dominated pairs.
    warnings: list[str] = []
    comm_name_map = {c.get("id", 0): c["name"] for c in communities}
    for (c1, c2), count in cross_counts.most_common():
        if count <= coupling_threshold:
            # most_common() yields counts in descending order, so no later
            # pair can exceed the threshold either.
            break
        name1 = comm_name_map.get(c1, f"community-{c1}")
        name2 = comm_name_map.get(c2, f"community-{c2}")
        # Skip pairs where either community is test-dominated — coupling
        # between test and production code is expected, not architectural.
        if _is_test_community(name1) or _is_test_community(name2):
            continue
        warnings.append(
            f"High coupling ({count} edges) between "
            f"'{name1}' and '{name2}'"
        )

    return {
        "communities": communities,
        "cross_community_edges": cross_edges,
        "warnings": warnings,
    }
|