code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,874 @@
1
+ """Community/cluster detection for the code knowledge graph.
2
+
3
+ Detects communities of related code nodes using the Leiden algorithm (via igraph,
4
+ optional) with a file-based grouping fallback when igraph is not installed.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ import random
11
+ import re
12
+ from collections import Counter, defaultdict
13
+ from typing import Any
14
+
15
+ from .graph import GraphEdge, GraphNode, GraphStore, _sanitize_name
16
+
17
# Default seed handed to igraph's random number generator so that Leiden
# community detection is reproducible: without it, two builds of the same
# graph yield different community IDs / sizes and benchmarks stop being
# comparable. The CRG_LEIDEN_SEED environment variable overrides this value.
_LEIDEN_SEED = 42

logger = logging.getLogger(__name__)

# Chunk size for "IN (?, ?, ...)" parameter lists; kept well below SQLite's
# default limit of 999 bound variables per statement.
_SQL_BATCH = 450
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # Optional igraph import
30
+ # ---------------------------------------------------------------------------
31
+
32
+ try:
33
+ import igraph as ig # type: ignore[import-untyped]
34
+
35
+ IGRAPH_AVAILABLE = True
36
+ except ImportError:
37
+ ig = None # type: ignore[assignment]
38
+ IGRAPH_AVAILABLE = False
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Edge weight mapping
42
+ # ---------------------------------------------------------------------------
43
+
44
# Relative strength of each relationship kind when the graph is clustered.
# Keys are edge kinds; larger values pull the two endpoints closer together.
EDGE_WEIGHTS: dict[str, float] = dict(
    CALLS=1.0,
    IMPORTS_FROM=0.5,
    INHERITS=0.8,
    IMPLEMENTS=0.7,
    CONTAINS=0.3,
    TESTED_BY=0.4,
    DEPENDS_ON=0.6,
)
53
+
54
# Words too generic to be useful in a community name; keyword extraction
# skips any lower-cased name fragment found in this set.
_COMMON_WORDS = frozenset(
    (
        "get set self init new create update delete "
        "add remove make build from to for with "
        "the and test main run do is has on "
        "of in at by my this that all none"
    ).split()
)
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Community naming
65
+ # ---------------------------------------------------------------------------
66
+
67
+
68
def _generate_community_name(members: list[GraphNode]) -> str:
    """Derive a short, human-readable label for a community of nodes.

    The label is assembled from up to two slugs:

    * a *prefix* -- the most common parent directory (or file stem) among
      the members' file paths;
    * a *topic* -- a class name that accounts for more than 40% of the
      member count, or failing that the most frequent meaningful keyword
      found in the members' names.

    Returns ``"empty"`` for an empty member list and ``"cluster"`` when
    neither a prefix nor a topic can be derived.
    """
    if not members:
        return "empty"

    prefix = _extract_file_prefix([node.file_path for node in members])

    # A class name covering more than 40% of the members wins outright.
    class_tally = Counter(node.name for node in members if node.kind == "Class")
    if class_tally:
        dominant, occurrences = class_tally.most_common(1)[0]
        if occurrences > len(members) * 0.4:
            slug = _to_slug(dominant)
            return f"{prefix}-{slug}" if prefix else slug

    # Otherwise use the top-ranked keyword, which may be absent.
    ranked = _extract_keywords(members)
    keyword = ranked[0] if ranked else ""

    label_parts = [part for part in (prefix, keyword) if part]
    return "-".join(label_parts) if label_parts else "cluster"
105
+
106
+
107
def _extract_file_prefix(file_paths: list[str]) -> str:
    """Return a slug for the directory (or module) most paths belong to.

    Each path contributes its immediate parent directory; a bare file name
    with no directory part contributes its stem (the text before the last
    dot) instead. The most frequent candidate is slugified and returned.
    An empty ``file_paths`` yields ``""``.
    """
    if not file_paths:
        return ""

    tally: Counter[str] = Counter()
    for path in file_paths:
        # Normalise Windows separators before splitting.
        pieces = path.replace("\\", "/").split("/")
        if len(pieces) > 1:
            tally[pieces[-2]] += 1
        else:
            tally[pieces[-1].rsplit(".", 1)[0]] += 1

    winner = tally.most_common(1)[0][0]
    return _to_slug(winner)
126
+
127
+
128
def _extract_keywords(members: list[GraphNode]) -> list[str]:
    """Return up to five of the most frequent meaningful words in member names.

    Only nodes of kind Function, Class, Test or Type are considered. Their
    names are split into words and lower-cased; words listed in
    ``_COMMON_WORDS`` and single-character words are ignored. The result is
    ordered most frequent first and is empty when nothing qualifies.
    """
    nameable_kinds = ("Function", "Class", "Test", "Type")
    tally: Counter[str] = Counter(
        lowered
        for node in members
        if node.kind in nameable_kinds
        for lowered in (word.lower() for word in _split_name(node.name))
        if len(lowered) > 1 and lowered not in _COMMON_WORDS
    )
    return [word for word, _ in tally.most_common(5)]
142
+
143
+
144
def _split_name(name: str) -> list[str]:
    """Break an identifier into its component words.

    A lower-case letter directly followed by an upper-case one marks a
    camelCase boundary; underscores, hyphens, dots and whitespace act as
    separators. Empty fragments are dropped and letter case is preserved.
    """
    with_boundaries = re.sub(r"([a-z])([A-Z])", r"\1_\2", name)
    fragments = re.split(r"[_\-.\s]+", with_boundaries)
    return list(filter(None, fragments))
150
+
151
+
152
def _to_slug(s: str) -> str:
    """Convert a string to a short lowercase slug.

    Every run of characters other than ``a-z`` / ``0-9`` collapses to a
    single hyphen, leading and trailing hyphens are removed, and the result
    is capped at 30 characters.

    Args:
        s: Arbitrary text (directory name, class name, ...).

    Returns:
        The slug; empty when ``s`` contains no ASCII letters or digits.
    """
    slug = re.sub(r"[^a-z0-9]+", "-", s.lower()).strip("-")
    # The 30-character cut can land right after a hyphen, so strip the tail
    # again; otherwise names such as "some-long-prefix-" leak out.
    return slug[:30].rstrip("-")
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # Cohesion calculation
159
+ # ---------------------------------------------------------------------------
160
+
161
+
162
def _compute_cohesion_batch(
    community_member_qns: list[set[str]],
    all_edges: list[GraphEdge],
) -> list[float]:
    """Score the cohesion of many communities with one sweep over the edges.

    Cohesion of a community is ``internal / (internal + external)``, where an
    edge is *internal* when both endpoints belong to that community and
    *external* when exactly one endpoint does (or the endpoints belong to
    two different communities, in which case it counts as external for
    both). A community touched by no edge scores ``0.0``.

    A ``qualified_name -> community index`` lookup is built first; if a name
    appears in several member sets the last one wins, so callers are
    expected to pass disjoint sets. Total work is
    O(edges + sum(|members|)) rather than O(edges * communities).

    Returns:
        Scores in the same order as ``community_member_qns``.
    """
    owner: dict[str, int] = {
        qn: position
        for position, member_set in enumerate(community_member_qns)
        for qn in member_set
    }

    size = len(community_member_qns)
    inside = [0] * size
    outside = [0] * size

    for edge in all_edges:
        src = owner.get(edge.source_qualified)
        dst = owner.get(edge.target_qualified)
        if src is not None and src == dst:
            inside[src] += 1
            continue
        # Endpoints that belong to no community contribute nothing.
        if src is not None:
            outside[src] += 1
        if dst is not None:
            outside[dst] += 1

    scores: list[float] = []
    for n_in, n_out in zip(inside, outside):
        touched = n_in + n_out
        scores.append(n_in / touched if touched > 0 else 0.0)
    return scores
207
+
208
+
209
def _build_adjacency(edges: list[GraphEdge]) -> dict[str, list[str]]:
    """Index every edge under both of its endpoints, ignoring direction.

    Returns a ``defaultdict(list)`` that maps each qualified name to the
    qualified names at the other end of its edges, in edge order. Parallel
    edges produce repeated entries.
    """
    neighbours: dict[str, list[str]] = defaultdict(list)
    for edge in edges:
        src, dst = edge.source_qualified, edge.target_qualified
        neighbours[src].append(dst)
        neighbours[dst].append(src)
    return neighbours
216
+
217
+
218
def _compute_cohesion(
    member_qns: set[str],
    all_edges: list[GraphEdge],
    adj: dict[str, list[str]] | None = None,
) -> float:
    """Return the cohesion of a single community.

    Cohesion is ``internal_edges / (internal_edges + external_edges)``. This
    is a convenience wrapper that scores one community through
    :func:`_compute_cohesion_batch`; when scoring several communities call
    the batch function directly so the edge list is walked only once.

    ``adj`` is accepted but not read by this implementation.
    """
    (score,) = _compute_cohesion_batch([member_qns], all_edges)
    return score
229
+
230
+
231
+ # ---------------------------------------------------------------------------
232
+ # Leiden-based community detection (igraph)
233
+ # ---------------------------------------------------------------------------
234
+
235
+
236
def _detect_leiden(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    min_size: int,
    adj: dict[str, list[str]] | None = None,
) -> list[dict[str, Any]]:
    """Detect communities using the Leiden algorithm via igraph.

    Builds an undirected, weighted graph with one vertex per distinct
    qualified name and one edge per distinct connected pair (the first edge
    seen for a pair decides its weight), then runs Leiden with the
    modularity objective. Leiden is capped at ``n_iterations=2`` and no
    recursive sub-community pass is performed here; the original author
    notes that pass caused exponential blow-up on large repos (>100k nodes).

    Args:
        nodes: Candidate nodes to cluster.
        edges: All graph edges; edges touching unknown nodes and self-loops
            are ignored when building the graph.
        min_size: Partitions with fewer members are dropped.
        adj: Not read here; forwarded to the file-based fallback.

    Returns:
        Community dicts (name, level, size, cohesion, dominant_language,
        description, members, member_qns). Empty when igraph is missing or
        ``nodes`` is empty. Falls back to :func:`_detect_file_based` when no
        usable edge exists.
    """
    import math
    import os

    if ig is None:
        return []

    # One vertex per distinct qualified name. Vertex ids are derived from the
    # de-duplicated mapping so they always stay below the vertex count, even
    # if ``nodes`` contains the same qualified name twice (the previous code
    # numbered vertices by position in ``nodes`` but sized the graph by the
    # number of distinct names, which could produce out-of-range ids).
    node_by_qn: dict[str, GraphNode] = {n.qualified_name: n for n in nodes}
    if not node_by_qn:
        return []
    vertex_nodes = list(node_by_qn.values())
    qn_to_idx: dict[str, int] = {qn: i for i, qn in enumerate(node_by_qn)}

    logger.info("Building igraph with %d nodes...", len(vertex_nodes))

    edge_list: list[tuple[int, int]] = []
    weights: list[float] = []
    seen_edges: set[tuple[int, int]] = set()

    for e in edges:
        src_idx = qn_to_idx.get(e.source_qualified)
        tgt_idx = qn_to_idx.get(e.target_qualified)
        if src_idx is None or tgt_idx is None or src_idx == tgt_idx:
            continue
        # Undirected graph: normalise the pair so (a, b) and (b, a) collapse.
        pair = (min(src_idx, tgt_idx), max(src_idx, tgt_idx))
        if pair in seen_edges:
            continue
        seen_edges.add(pair)
        edge_list.append(pair)
        weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))

    if not edge_list:
        return _detect_file_based(nodes, edges, min_size, adj=adj)

    g = ig.Graph(n=len(vertex_nodes), directed=False)
    g.add_edges(edge_list)
    g.es["weight"] = weights

    # Scale resolution inversely with graph size to get coarser clusters on
    # large repos. Default resolution=1.0 produces thousands of tiny
    # communities for 30k+ node graphs.
    n_nodes = g.vcount()
    resolution = max(0.05, 1.0 / math.log10(max(n_nodes, 10)))

    logger.info(
        "Running Leiden on %d nodes, %d edges...",
        g.vcount(), g.ecount(),
    )

    # A malformed override must not abort community detection; fall back to
    # the built-in seed instead.
    raw_seed = os.environ.get("CRG_LEIDEN_SEED")
    try:
        seed = int(raw_seed) if raw_seed is not None else _LEIDEN_SEED
    except ValueError:
        logger.warning(
            "Ignoring non-integer CRG_LEIDEN_SEED=%r; using default seed %d",
            raw_seed, _LEIDEN_SEED,
        )
        seed = _LEIDEN_SEED

    # Deterministic seeding for benchmark reproducibility — community
    # detection is not a security-sensitive context. nosec B311.
    ig.set_random_number_generator(random.Random(seed))  # nosec B311
    partition = g.community_leiden(
        objective_function="modularity",
        weights="weight",
        resolution=resolution,
        n_iterations=2,
    )

    logger.info(
        "Leiden complete, found %d partitions. Computing cohesion...",
        len(partition),
    )

    # Collect qualifying partitions first so cohesion can be computed for
    # all of them in a single pass over the edges.
    pending: list[tuple[list[GraphNode], set[str]]] = []
    for cluster_ids in partition:
        if len(cluster_ids) < min_size:
            continue
        members = [vertex_nodes[i] for i in cluster_ids]
        pending.append((members, {m.qualified_name for m in members}))

    cohesions = _compute_cohesion_batch([p[1] for p in pending], edges)

    communities: list[dict[str, Any]] = []
    for (members, member_qns), cohesion in zip(pending, cohesions):
        lang_counts = Counter(m.language for m in members if m.language)
        dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""

        communities.append({
            "name": _generate_community_name(members),
            "level": 0,
            "size": len(members),
            "cohesion": round(cohesion, 4),
            "dominant_language": dominant_lang,
            "description": f"Community of {len(members)} nodes",
            "members": [m.qualified_name for m in members],
            "member_qns": member_qns,
        })

    logger.info("Community detection complete: %d communities", len(communities))
    return communities
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # File-based fallback community detection
347
+ # ---------------------------------------------------------------------------
348
+
349
+
350
def _detect_file_based(
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    min_size: int,
    adj: dict[str, list[str]] | None = None,
) -> list[dict[str, Any]]:
    """Group nodes by directory when Leiden is unavailable or finds no edges.

    The longest directory prefix shared by every node's file path is
    stripped, then nodes are grouped by the first ``depth`` remaining
    directory segments. ``depth`` starts at 1 and grows until at least 10
    groups reach ``min_size`` or the deepest directory level is reached;
    there is no upper bound on the number of groups. Nodes whose file sits
    directly in the shared prefix directory are grouped by file stem.

    Args:
        nodes: Nodes to group.
        edges: All graph edges, used only for cohesion scoring.
        min_size: Groups with fewer members are dropped.
        adj: Not read by this implementation.

    Returns:
        Community dicts (name, level, size, cohesion, dominant_language,
        description, members, member_qns).
    """
    # Split every path exactly once: (node, directory segments, file stem).
    # ``str.split`` always yields at least one element, so the stem is
    # always available.
    located: list[tuple[GraphNode, list[str], str]] = []
    for node in nodes:
        parts = node.file_path.replace("\\", "/").split("/")
        dir_parts = [p for p in parts[:-1] if p]
        located.append((node, dir_parts, parts[-1].rsplit(".", 1)[0]))

    # Length of the directory prefix common to all nodes.
    prefix_len = 0
    if located:
        first_dirs = located[0][1]
        shortest = min(len(dirs) for _, dirs, _ in located)
        for i in range(shortest):
            if all(dirs[i] == first_dirs[i] for _, dirs, _ in located):
                prefix_len = i + 1
            else:
                break

    # Directory segments below the shared prefix, computed once per node.
    placed = [(node, dirs[prefix_len:], stem) for node, dirs, stem in located]

    def _group_at_depth(depth: int) -> dict[str, list[GraphNode]]:
        groups: dict[str, list[GraphNode]] = defaultdict(list)
        for node, remainder, stem in placed:
            key = "/".join(remainder[:depth]) if remainder else stem
            groups[key].append(node)
        return groups

    def _qualifying(groups: dict[str, list[GraphNode]]) -> int:
        return sum(1 for v in groups.values() if len(v) >= min_size)

    # Deepen the grouping until enough groups qualify or nothing is left to
    # split on. Depth 1 always works thanks to the file-stem fallback.
    max_depth = max((len(remainder) for _, remainder, _ in placed), default=0)
    depth = 1
    by_dir = _group_at_depth(depth)
    while depth < max_depth and _qualifying(by_dir) < 10:
        depth += 1
        by_dir = _group_at_depth(depth)

    # Pre-filter to communities meeting min_size and collect their member
    # sets so all cohesions are computed in a single O(edges) pass; scoring
    # each group separately would walk the edge list once per group.
    pending: list[tuple[str, list[GraphNode], set[str]]] = []
    for dir_path, members in by_dir.items():
        if len(members) < min_size:
            continue
        pending.append((dir_path, members, {m.qualified_name for m in members}))

    cohesions = _compute_cohesion_batch([p[2] for p in pending], edges)

    communities: list[dict[str, Any]] = []
    for (dir_path, members, member_qns), cohesion in zip(pending, cohesions):
        lang_counts = Counter(m.language for m in members if m.language)
        dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""

        communities.append({
            "name": _generate_community_name(members),
            "level": 0,
            "size": len(members),
            "cohesion": round(cohesion, 4),
            "dominant_language": dominant_lang,
            "description": f"Directory-based community: {dir_path}",
            "members": [m.qualified_name for m in members],
            "member_qns": member_qns,
        })

    return communities
434
+
435
+
436
+ # ---------------------------------------------------------------------------
437
+ # Oversized community splitting
438
+ # ---------------------------------------------------------------------------
439
+
440
+
441
def _split_oversized(
    communities: list[dict],
    nodes: list[GraphNode],
    edges: list[GraphEdge],
    threshold_pct: float = 0.25,
    min_split_size: int = 10,
) -> list[dict]:
    """Split communities that hold more than ``threshold_pct`` of all members.

    Makes a single pass over ``communities`` (the resulting pieces are not
    split again). Each oversized community is re-clustered with Leiden
    (modularity objective, resolution 0.5) on the subgraph induced by its
    members. If that yields more than one cluster, the community is
    replaced by one entry per cluster carrying ``id``, ``parent_id``,
    ``level`` (parent level + 1) and a cohesion score computed against the
    full edge list. Otherwise, and on any failure, the community is kept
    unchanged.

    Args:
        communities: Community dicts; ``members`` lists qualified names.
        nodes: All candidate nodes.
        edges: All graph edges.
        threshold_pct: Fraction of the total member count above which a
            community counts as oversized.
        min_split_size: Lower bound for the size threshold and minimum
            number of resolvable member nodes required to attempt a split.

    Returns:
        A new list of community dicts, or ``communities`` itself when
        igraph is unavailable or there are no members at all.
    """
    import os

    if not IGRAPH_AVAILABLE:
        return communities

    total = sum(c.get("size", len(c.get("members", []))) for c in communities)
    if total == 0:
        return communities

    threshold = max(int(total * threshold_pct), min_split_size)
    next_id = max((c.get("id", 0) for c in communities), default=0) + 1

    # Resolve the seed once. A malformed override falls back to the default
    # instead of silently disabling every split via the handler below.
    raw_seed = os.environ.get("CRG_LEIDEN_SEED")
    try:
        seed = int(raw_seed) if raw_seed is not None else _LEIDEN_SEED
    except ValueError:
        logger.warning(
            "Ignoring non-integer CRG_LEIDEN_SEED=%r; using default seed %d",
            raw_seed, _LEIDEN_SEED,
        )
        seed = _LEIDEN_SEED

    result: list[dict] = []
    for comm in communities:
        members = set(comm.get("members", []))
        if len(members) <= threshold:
            result.append(comm)
            continue

        member_nodes = [n for n in nodes if n.qualified_name in members]
        if len(member_nodes) < min_split_size:
            result.append(comm)
            continue

        # Edges of the induced subgraph: both endpoints must resolve to a
        # member vertex, and self-loops are skipped.
        qn_to_idx = {n.qualified_name: i for i, n in enumerate(member_nodes)}
        ig_edges: list[tuple[int, int]] = []
        ig_weights: list[float] = []
        for e in edges:
            si = qn_to_idx.get(e.source_qualified)
            ti = qn_to_idx.get(e.target_qualified)
            if si is not None and ti is not None and si != ti:
                ig_edges.append((si, ti))
                ig_weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))

        if not ig_edges:
            result.append(comm)
            continue

        comm_name = comm.get("name", "")
        try:
            g = ig.Graph(
                n=len(member_nodes),
                edges=ig_edges,
                directed=False,
            )
            g.es["weight"] = ig_weights
            # Deterministic seeding for benchmark reproducibility — community
            # detection is not a security-sensitive context. nosec B311.
            ig.set_random_number_generator(random.Random(seed))  # nosec B311
            partition = g.community_leiden(
                objective_function="modularity",
                weights="weight",
                resolution=0.5,
            )

            sub_communities: dict[int, list[str]] = {}
            for idx, cid in enumerate(partition.membership):
                sub_communities.setdefault(cid, []).append(
                    member_nodes[idx].qualified_name
                )

            if len(sub_communities) <= 1:
                result.append(comm)
                continue

            sub_member_lists = list(sub_communities.values())
            # Score the pieces like every other community instead of
            # storing a placeholder 0.0.
            cohesions = _compute_cohesion_batch(
                [set(sub) for sub in sub_member_lists], edges
            )

            # Build the replacement entries locally so a failure part-way
            # through cannot leave both partial pieces and the parent in
            # ``result``.
            parent_id = comm.get("id", 0)
            sub_id = next_id
            split: list[dict] = []
            for sub_members, cohesion in zip(sub_member_lists, cohesions):
                split.append({
                    "id": sub_id,
                    "name": comm_name + f"-sub{sub_id}",
                    "level": comm.get("level", 0) + 1,
                    "parent_id": parent_id,
                    "members": sub_members,
                    "size": len(sub_members),
                    "cohesion": round(cohesion, 4),
                    "dominant_language": comm.get("dominant_language"),
                    "description": f"Split from {comm_name}",
                })
                sub_id += 1
        except Exception:
            # Best effort: splitting is an optimisation, never fatal.
            logger.warning(
                "Failed to split community '%s', "
                "keeping as-is",
                comm_name,
                exc_info=True,
            )
            result.append(comm)
            continue

        result.extend(split)
        next_id = sub_id
        logger.info(
            "Split oversized community '%s' "
            "(%d members) into %d",
            comm_name,
            len(members),
            len(split),
        )

    return result
578
+
579
+
580
+ # ---------------------------------------------------------------------------
581
+ # Public API
582
+ # ---------------------------------------------------------------------------
583
+
584
+
585
def detect_communities(
    store: GraphStore, min_size: int = 2
) -> list[dict[str, Any]]:
    """Detect communities in the code graph.

    Uses the Leiden algorithm via igraph if available, otherwise falls back
    to file-based grouping, then splits communities that are too large.

    Args:
        store: The GraphStore instance.
        min_size: Minimum number of nodes for a detected community to be
            included.

    Returns:
        List of community dicts with keys: name, level, size, cohesion,
        dominant_language, description, members. Entries created by
        splitting an oversized community additionally carry ``id`` and
        ``parent_id``. The internal ``member_qns`` set is removed before
        returning.
    """
    # File nodes are excluded so clustering focuses on code entities.
    all_edges = store.get_all_edges()
    unique_nodes = store.get_all_nodes(exclude_files=True)

    logger.info(
        "Loaded %d unique nodes, %d edges",
        len(unique_nodes), len(all_edges),
    )

    # No adjacency index is built: both detectors score cohesion in one
    # batched pass over the edge list and never read their ``adj`` argument.
    if IGRAPH_AVAILABLE:
        logger.info("Detecting communities with Leiden algorithm (igraph)")
        results = _detect_leiden(unique_nodes, all_edges, min_size)
    else:
        logger.info("igraph not available, using file-based community detection")
        results = _detect_file_based(unique_nodes, all_edges, min_size)

    results = _split_oversized(results, unique_nodes, all_edges)

    # ``member_qns`` is a working set used during detection; drop it so the
    # returned dicts hold only serialisable values and no internal state.
    for comm in results:
        comm.pop("member_qns", None)

    return results
633
+
634
+
635
def incremental_detect_communities(
    store: GraphStore,
    changed_files: list[str],
    min_size: int = 2,
) -> int:
    """Re-detect communities only if changed files affect existing communities.

    If no node belonging to a stored community lives in one of the changed
    files, re-detection is skipped entirely (the common case for small
    changes). Otherwise full community detection is re-run and the result
    is stored.

    Args:
        store: The GraphStore instance.
        changed_files: List of file paths that have changed.
        min_size: Minimum number of nodes for a community to be included.

    Returns:
        Number of communities stored, or 0 if detection was skipped.
    """
    if not changed_files:
        return 0

    conn = store._conn

    # Only a yes/no answer is needed, so stop at the first batch that hits.
    # Batching keeps each statement under SQLite's bound-variable limit.
    touched = False
    for start in range(0, len(changed_files), _SQL_BATCH):
        batch = changed_files[start:start + _SQL_BATCH]
        placeholders = ",".join("?" * len(batch))
        row = conn.execute(
            f"SELECT COUNT(DISTINCT community_id) FROM nodes "  # nosec B608
            f"WHERE community_id IS NOT NULL AND file_path IN ({placeholders})",
            batch,
        ).fetchone()
        if row and row[0]:
            touched = True
            break

    if not touched:
        return 0  # No communities affected, skip

    # Re-run full community detection (correct and fast enough)
    communities = detect_communities(store, min_size=min_size)
    return store_communities(store, communities)
679
+
680
+
681
def store_communities(
    store: GraphStore, communities: list[dict[str, Any]]
) -> int:
    """Persist detected communities, replacing whatever was stored before.

    Inside one explicit transaction this deletes every row of
    ``communities``, resets ``nodes.community_id`` to NULL, inserts the new
    communities and points each member node at its community row. Any
    error rolls the whole sequence back and is re-raised.

    Args:
        store: The GraphStore instance.
        communities: List of community dicts from detect_communities().

    Returns:
        Number of communities stored.
    """
    # NOTE: the raw connection is used on purpose; the DELETE + INSERT loop
    # + UPDATE loop below is tightly coupled to the transaction lifecycle.
    db = store._conn

    if db.in_transaction:
        logger.warning("Rolling back uncommitted transaction before BEGIN IMMEDIATE")
        db.rollback()

    insert_sql = """INSERT INTO communities
                   (name, level, cohesion, size, dominant_language, description)
                   VALUES (?, ?, ?, ?, ?, ?)"""

    # Explicit transaction: no partial community data survives a crash.
    db.execute("BEGIN IMMEDIATE")
    try:
        db.execute("DELETE FROM communities")
        db.execute("UPDATE nodes SET community_id = NULL")

        stored = 0
        for community in communities:
            row_values = (
                community["name"],
                community.get("level", 0),
                community.get("cohesion", 0.0),
                community["size"],
                community.get("dominant_language", ""),
                community.get("description", ""),
            )
            community_id = db.execute(insert_sql, row_values).lastrowid

            # Tag member nodes in chunks to respect the bound-variable limit.
            qualified_names = community.get("members", [])
            for offset in range(0, len(qualified_names), _SQL_BATCH):
                chunk = qualified_names[offset:offset + _SQL_BATCH]
                marks = ",".join("?" * len(chunk))
                db.execute(
                    f"UPDATE nodes SET community_id = ? WHERE qualified_name IN ({marks})",  # nosec B608
                    [community_id] + chunk,
                )
            stored += 1

        db.commit()
    except BaseException:
        db.rollback()
        raise
    return stored
744
+
745
+
746
def get_communities(
    store: GraphStore, sort_by: str = "size", min_size: int = 0
) -> list[dict[str, Any]]:
    """Load stored communities, with their members, from the database.

    Args:
        store: The GraphStore instance.
        sort_by: Column to sort by ("size", "cohesion", "name"); any other
            value is treated as "size". Size and cohesion sort descending,
            name ascending.
        min_size: Minimum community size to include.

    Returns:
        List of community dicts with keys: id, name, level, cohesion, size,
        dominant_language, description, members. Names, descriptions and
        member names are passed through ``_sanitize_name``.
    """
    # The column name is interpolated into the SQL text, so it is restricted
    # to this fixed whitelist first.
    if sort_by not in {"size", "cohesion", "name"}:
        sort_by = "size"
    direction = "ASC" if sort_by == "name" else "DESC"

    # NOTE: the communities table is managed entirely by this module and has
    # no dedicated GraphStore accessor, hence the direct _conn query.
    records = store._conn.execute(
        f"SELECT * FROM communities WHERE size >= ? ORDER BY {sort_by} {direction}",  # nosec B608
        (min_size,),
    ).fetchall()

    loaded: list[dict[str, Any]] = []
    for record in records:
        community_id = record["id"]
        members = [
            _sanitize_name(qn)
            for qn in store.get_community_member_qns(community_id)
        ]
        entry: dict[str, Any] = {
            "id": community_id,
            "name": _sanitize_name(record["name"]),
            "level": record["level"],
            "cohesion": record["cohesion"],
            "size": record["size"],
            "dominant_language": record["dominant_language"] or "",
            "description": _sanitize_name(record["description"] or ""),
            "members": members,
        }
        loaded.append(entry)

    return loaded
793
+
794
+
795
# Matches community names that look test-dominated: a leading or trailing
# "test"/"tests" or "spec"/"specs" segment (delimited by "-" or "/"), or the
# BDD-style markers "it:should" / "describe:". The plural forms matter
# because a community's name prefix is taken from its most common parent
# directory, which for test code is typically "tests".
_TEST_COMMUNITY_RE = re.compile(
    r"(^tests?[-/]|[-/]tests?([:/]|$)|it:should|describe:|specs?[-/]|[-/]specs?$)",
    re.IGNORECASE,
)


def _is_test_community(name: str) -> bool:
    """Return True if a community name indicates it is test-dominated.

    The check is purely name-based (case-insensitive) and does not inspect
    the community's members.
    """
    return bool(_TEST_COMMUNITY_RE.search(name))
804
+
805
+
806
def get_architecture_overview(store: GraphStore) -> dict[str, Any]:
    """Summarise the architecture in terms of communities and their coupling.

    Maps every member node to its community, records each edge whose
    endpoints fall in two different communities, and emits a warning for
    every community pair joined by more than 10 such edges.

    Args:
        store: The GraphStore instance.

    Returns:
        Dict with keys: communities, cross_community_edges, warnings.
    """
    communities = get_communities(store)

    # qualified name -> community id (a later community wins on duplicates).
    # NOTE(review): member names come back from get_communities() already
    # passed through _sanitize_name, while edges below are looked up by raw
    # qualified name -- confirm sanitising leaves ordinary names unchanged.
    owner: dict[str, int] = {
        qn: comm.get("id", 0)
        for comm in communities
        for qn in comm.get("members", [])
    }

    cross_edges: list[dict[str, Any]] = []
    pair_tally: Counter[tuple[int, int]] = Counter()

    for edge in store.get_all_edges():
        # TESTED_BY edges are expected cross-community coupling (test → code),
        # not an architectural smell.
        if edge.kind == "TESTED_BY":
            continue
        src = owner.get(edge.source_qualified)
        dst = owner.get(edge.target_qualified)
        if src is None or dst is None or src == dst:
            continue
        # Count per unordered pair, but report the edge with its direction.
        pair_tally[(min(src, dst), max(src, dst))] += 1
        cross_edges.append({
            "source_community": src,
            "target_community": dst,
            "edge_kind": edge.kind,
            "source": _sanitize_name(edge.source_qualified),
            "target": _sanitize_name(edge.target_qualified),
        })

    names_by_id = {c.get("id", 0): c["name"] for c in communities}
    warnings: list[str] = []
    for (first, second), n_edges in pair_tally.most_common():
        if n_edges <= 10:
            continue
        first_name = names_by_id.get(first, f"community-{first}")
        second_name = names_by_id.get(second, f"community-{second}")
        # Coupling between test and production code is expected, not an
        # architectural problem, so test-dominated pairs are skipped.
        if _is_test_community(first_name) or _is_test_community(second_name):
            continue
        warnings.append(
            f"High coupling ({n_edges} edges) between "
            f"'{first_name}' and '{second_name}'"
        )

    return {
        "communities": communities,
        "cross_community_edges": cross_edges,
        "warnings": warnings,
    }