code-review-graph 2.2.2__tar.gz → 2.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/PKG-INFO +8 -3
  2. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/README.md +5 -2
  3. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/changes.py +2 -0
  4. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/cli.py +20 -6
  5. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/communities.py +136 -142
  6. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/embeddings.py +4 -1
  7. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/flows.py +39 -33
  8. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/graph.py +17 -3
  9. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/incremental.py +94 -10
  10. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/main.py +4 -1
  11. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/parser.py +543 -12
  12. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/refactor.py +16 -4
  13. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/registry.py +4 -1
  14. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/skills.py +172 -54
  15. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/build.py +21 -7
  16. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/visualization.py +1 -1
  17. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/USAGE.md +4 -0
  18. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/architecture.md +1 -1
  19. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/pyproject.toml +10 -1
  20. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/.gitignore +0 -0
  21. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/LICENSE +0 -0
  22. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code-review-graph-vscode/LICENSE +0 -0
  23. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code-review-graph-vscode/README.md +0 -0
  24. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/__init__.py +0 -0
  25. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/__main__.py +0 -0
  26. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/constants.py +0 -0
  27. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/__init__.py +0 -0
  28. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/__init__.py +0 -0
  29. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/build_performance.py +0 -0
  30. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/flow_completeness.py +0 -0
  31. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/impact_accuracy.py +0 -0
  32. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/search_quality.py +0 -0
  33. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/benchmarks/token_efficiency.py +0 -0
  34. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/express.yaml +0 -0
  35. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/fastapi.yaml +0 -0
  36. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/flask.yaml +0 -0
  37. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/gin.yaml +0 -0
  38. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/httpx.yaml +0 -0
  39. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/configs/nextjs.yaml +0 -0
  40. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/reporter.py +0 -0
  41. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/runner.py +0 -0
  42. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/scorer.py +0 -0
  43. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/eval/token_benchmark.py +0 -0
  44. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/hints.py +0 -0
  45. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/migrations.py +0 -0
  46. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/prompts.py +0 -0
  47. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/search.py +0 -0
  48. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/__init__.py +0 -0
  49. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/_common.py +0 -0
  50. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/community_tools.py +0 -0
  51. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/context.py +0 -0
  52. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/docs.py +0 -0
  53. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/flows_tools.py +0 -0
  54. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/query.py +0 -0
  55. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/refactor_tools.py +0 -0
  56. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/registry_tools.py +0 -0
  57. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tools/review.py +0 -0
  58. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/tsconfig_resolver.py +0 -0
  59. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/code_review_graph/wiki.py +0 -0
  60. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/COMMANDS.md +0 -0
  61. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/FEATURES.md +0 -0
  62. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/INDEX.md +0 -0
  63. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/LEGAL.md +0 -0
  64. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/LLM-OPTIMIZED-REFERENCE.md +0 -0
  65. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/ROADMAP.md +0 -0
  66. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/TROUBLESHOOTING.md +0 -0
  67. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/docs/schema.md +0 -0
  68. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/hooks/hooks.json +0 -0
  69. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/hooks/session-start.sh +0 -0
  70. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/skills/build-graph/SKILL.md +0 -0
  71. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/skills/review-delta/SKILL.md +0 -0
  72. {code_review_graph-2.2.2 → code_review_graph-2.2.3}/skills/review-pr/SKILL.md +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: code-review-graph
3
- Version: 2.2.2
3
+ Version: 2.2.3
4
4
  Summary: Persistent incremental knowledge graph for token-efficient, context-aware code reviews with Claude Code
5
5
  Project-URL: Homepage, https://code-review-graph.com
6
6
  Project-URL: Repository, https://github.com/tirth8205/code-review-graph
@@ -38,8 +38,10 @@ Provides-Extra: communities
38
38
  Requires-Dist: igraph>=0.11.0; extra == 'communities'
39
39
  Provides-Extra: dev
40
40
  Requires-Dist: pytest-asyncio<1,>=0.23; extra == 'dev'
41
+ Requires-Dist: pytest-cov<8,>=4.0; extra == 'dev'
41
42
  Requires-Dist: pytest<9,>=8.0; extra == 'dev'
42
43
  Requires-Dist: ruff<1,>=0.3.0; extra == 'dev'
44
+ Requires-Dist: tomli>=2.0; (python_version < '3.11') and extra == 'dev'
43
45
  Provides-Extra: embeddings
44
46
  Requires-Dist: numpy<3,>=1.26; extra == 'embeddings'
45
47
  Requires-Dist: sentence-transformers<4,>=3.0.0; extra == 'embeddings'
@@ -90,12 +92,13 @@ code-review-graph build # parse your codebase
90
92
  One command sets up everything. `install` detects which AI coding tools you have, writes the correct MCP configuration for each one, and injects graph-aware instructions into your platform rules. It auto-detects whether you installed via `uvx` or `pip`/`pipx` and generates the right config. Restart your editor/tool after installing.
91
93
 
92
94
  <p align="center">
93
- <img src="diagrams/diagram8_supported_platforms.png" alt="One Install, Every Platform: auto-detects Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity" width="85%" />
95
+ <img src="diagrams/diagram8_supported_platforms.png" alt="One Install, Every Platform: auto-detects Codex, Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity" width="85%" />
94
96
  </p>
95
97
 
96
98
  To target a specific platform:
97
99
 
98
100
  ```bash
101
+ code-review-graph install --platform codex # configure only Codex
99
102
  code-review-graph install --platform cursor # configure only Cursor
100
103
  code-review-graph install --platform claude-code # configure only Claude Code
101
104
  ```
@@ -341,6 +344,8 @@ vendor/**
341
344
  node_modules/**
342
345
  ```
343
346
 
347
+ Note: in git repos, only tracked files are indexed (`git ls-files`), so gitignored files are skipped automatically. Use `.code-review-graphignore` to exclude tracked files or when git isn't available.
348
+
344
349
  Optional dependency groups:
345
350
 
346
351
  ```bash
@@ -382,5 +387,5 @@ MIT. See [LICENSE](LICENSE).
382
387
  <br>
383
388
  <a href="https://code-review-graph.com">code-review-graph.com</a><br><br>
384
389
  <code>pip install code-review-graph && code-review-graph install</code><br>
385
- <sub>Works with Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity</sub>
390
+ <sub>Works with Codex, Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity</sub>
386
391
  </p>
@@ -36,12 +36,13 @@ code-review-graph build # parse your codebase
36
36
  One command sets up everything. `install` detects which AI coding tools you have, writes the correct MCP configuration for each one, and injects graph-aware instructions into your platform rules. It auto-detects whether you installed via `uvx` or `pip`/`pipx` and generates the right config. Restart your editor/tool after installing.
37
37
 
38
38
  <p align="center">
39
- <img src="diagrams/diagram8_supported_platforms.png" alt="One Install, Every Platform: auto-detects Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity" width="85%" />
39
+ <img src="diagrams/diagram8_supported_platforms.png" alt="One Install, Every Platform: auto-detects Codex, Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity" width="85%" />
40
40
  </p>
41
41
 
42
42
  To target a specific platform:
43
43
 
44
44
  ```bash
45
+ code-review-graph install --platform codex # configure only Codex
45
46
  code-review-graph install --platform cursor # configure only Cursor
46
47
  code-review-graph install --platform claude-code # configure only Claude Code
47
48
  ```
@@ -287,6 +288,8 @@ vendor/**
287
288
  node_modules/**
288
289
  ```
289
290
 
291
+ Note: in git repos, only tracked files are indexed (`git ls-files`), so gitignored files are skipped automatically. Use `.code-review-graphignore` to exclude tracked files or when git isn't available.
292
+
290
293
  Optional dependency groups:
291
294
 
292
295
  ```bash
@@ -328,5 +331,5 @@ MIT. See [LICENSE](LICENSE).
328
331
  <br>
329
332
  <a href="https://code-review-graph.com">code-review-graph.com</a><br><br>
330
333
  <code>pip install code-review-graph && code-review-graph install</code><br>
331
- <sub>Works with Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity</sub>
334
+ <sub>Works with Codex, Claude Code, Cursor, Windsurf, Zed, Continue, OpenCode, and Antigravity</sub>
332
335
  </p>
@@ -50,6 +50,8 @@ def parse_git_diff_ranges(
50
50
  ["git", "diff", "--unified=0", base, "--"],
51
51
  capture_output=True,
52
52
  text=True,
53
+ encoding="utf-8",
54
+ errors="replace",
53
55
  cwd=repo_root,
54
56
  timeout=_GIT_TIMEOUT,
55
57
  )
@@ -96,7 +96,7 @@ def _print_banner() -> None:
96
96
 
97
97
  def _handle_init(args: argparse.Namespace) -> None:
98
98
  """Set up MCP config for detected AI coding platforms."""
99
- from .incremental import find_repo_root
99
+ from .incremental import ensure_repo_gitignore_excludes_crg, find_repo_root
100
100
  from .skills import install_platform_configs
101
101
 
102
102
  repo_root = Path(args.repo) if args.repo else find_repo_root()
@@ -117,9 +117,18 @@ def _handle_init(args: argparse.Namespace) -> None:
117
117
  print(f"\nConfigured {len(configured)} platform(s): {', '.join(configured)}")
118
118
 
119
119
  if dry_run:
120
+ print("[dry-run] Would ensure .gitignore ignores .code-review-graph/.")
120
121
  print("\n[dry-run] No files were modified.")
121
122
  return
122
123
 
124
+ gitignore_state = ensure_repo_gitignore_excludes_crg(repo_root)
125
+ if gitignore_state == "created":
126
+ print("Created .gitignore and added .code-review-graph/.")
127
+ elif gitignore_state == "updated":
128
+ print("Updated .gitignore with .code-review-graph/.")
129
+ else:
130
+ print(".gitignore already contains .code-review-graph/.")
131
+
123
132
  # Skills and hooks are installed by default so Claude actually uses the
124
133
  # graph tools proactively. Use --no-skills / --no-hooks to opt out.
125
134
  skip_skills = getattr(args, "no_skills", False)
@@ -130,20 +139,25 @@ def _handle_init(args: argparse.Namespace) -> None:
130
139
  generate_skills,
131
140
  inject_claude_md,
132
141
  inject_platform_instructions,
142
+ install_git_hook,
133
143
  install_hooks,
134
144
  )
135
145
 
136
146
  if not skip_skills:
137
147
  skills_dir = generate_skills(repo_root)
138
148
  print(f"Generated skills in {skills_dir}")
139
- inject_claude_md(repo_root)
140
- updated = inject_platform_instructions(repo_root)
149
+ if target in ("claude", "all"):
150
+ inject_claude_md(repo_root)
151
+ updated = inject_platform_instructions(repo_root, target=target)
141
152
  if updated:
142
153
  print(f"Injected graph instructions into: {', '.join(updated)}")
143
154
 
144
- if not skip_hooks:
155
+ if not skip_hooks and target in ("claude", "all"):
145
156
  install_hooks(repo_root)
146
157
  print(f"Installed hooks in {repo_root / '.claude' / 'settings.json'}")
158
+ git_hook = install_git_hook(repo_root)
159
+ if git_hook:
160
+ print(f"Installed git pre-commit hook in {git_hook}")
147
161
 
148
162
  print()
149
163
  print("Next steps:")
@@ -187,7 +201,7 @@ def main() -> None:
187
201
  install_cmd.add_argument(
188
202
  "--platform",
189
203
  choices=[
190
- "claude", "claude-code", "cursor", "windsurf", "zed",
204
+ "codex", "claude", "claude-code", "cursor", "windsurf", "zed",
191
205
  "continue", "opencode", "antigravity", "all",
192
206
  ],
193
207
  default="all",
@@ -217,7 +231,7 @@ def main() -> None:
217
231
  init_cmd.add_argument(
218
232
  "--platform",
219
233
  choices=[
220
- "claude", "claude-code", "cursor", "windsurf", "zed",
234
+ "codex", "claude", "claude-code", "cursor", "windsurf", "zed",
221
235
  "continue", "opencode", "antigravity", "all",
222
236
  ],
223
237
  default="all",
@@ -149,24 +149,62 @@ def _to_slug(s: str) -> str:
149
149
  # ---------------------------------------------------------------------------
150
150
 
151
151
 
152
def _compute_cohesion_batch(
    community_member_qns: list[set[str]],
    all_edges: list[GraphEdge],
) -> list[float]:
    """Compute cohesion for multiple communities in a single pass over the edges.

    The cohesion of a community is ``internal / (internal + external)``,
    where an edge is *internal* when both endpoints belong to the community
    and *external* when exactly one endpoint does.  An edge that joins two
    different communities counts as external for both of them; an edge that
    touches no community is ignored.

    A ``qualified_name -> community_index`` reverse map is built first and
    every edge is then visited exactly once, so the total work is
    O(edges + sum(|members|)) rather than O(edges * communities) for a
    per-community scan.

    Args:
        community_member_qns: One set of qualified names per community.
            The sets are expected to be disjoint (a partition).  If a name
            occurs in several sets, the last set containing it wins and the
            earlier ones do not see that node.
        all_edges: Edges exposing ``source_qualified`` and
            ``target_qualified`` attributes.

    Returns:
        Cohesion scores aligned index-for-index with
        ``community_member_qns``.  A community that no edge touches scores
        ``0.0``.
    """
    qn_to_idx: dict[str, int] = {
        qn: idx
        for idx, members in enumerate(community_member_qns)
        for qn in members
    }

    n = len(community_member_qns)
    internal = [0] * n
    external = [0] * n

    for e in all_edges:
        sc = qn_to_idx.get(e.source_qualified)
        tc = qn_to_idx.get(e.target_qualified)
        # Branch on None explicitly so no runtime assert is needed to narrow
        # the Optional indices before they are used as list subscripts.
        if sc is None:
            if tc is not None:
                external[tc] += 1
        elif tc is None:
            external[sc] += 1
        elif sc == tc:
            internal[sc] += 1
        else:
            # Cross-community edge: external from both sides.
            external[sc] += 1
            external[tc] += 1

    return [
        inside / (inside + outside) if inside + outside else 0.0
        for inside, outside in zip(internal, external)
    ]
197
+
198
+
152
199
  def _compute_cohesion(
153
200
  member_qns: set[str], all_edges: list[GraphEdge]
154
201
  ) -> float:
155
- """Compute cohesion: internal_edges / (internal_edges + external_edges)."""
156
- internal = 0
157
- external = 0
158
- for e in all_edges:
159
- src_in = e.source_qualified in member_qns
160
- tgt_in = e.target_qualified in member_qns
161
- if src_in or tgt_in:
162
- if src_in and tgt_in:
163
- internal += 1
164
- else:
165
- external += 1
166
- total = internal + external
167
- if total == 0:
168
- return 0.0
169
- return internal / total
202
+ """Compute cohesion: internal_edges / (internal_edges + external_edges).
203
+
204
+ For multiple communities, prefer :func:`_compute_cohesion_batch`, which
205
+ runs in O(edges) total instead of O(edges) per community.
206
+ """
207
+ return _compute_cohesion_batch([member_qns], all_edges)[0]
170
208
 
171
209
 
172
210
  # ---------------------------------------------------------------------------
@@ -177,11 +215,15 @@ def _compute_cohesion(
177
215
  def _detect_leiden(
178
216
  nodes: list[GraphNode], edges: list[GraphEdge], min_size: int
179
217
  ) -> list[dict[str, Any]]:
180
- """Detect communities using Leiden algorithm via igraph."""
218
+ """Detect communities using Leiden algorithm via igraph.
219
+
220
+ Caps Leiden at ``n_iterations=2`` (sufficient for code dependency graphs)
221
+ and skips the recursive sub-community splitting pass that caused
222
+ exponential blow-up on large repos (>100k nodes).
223
+ """
181
224
  if ig is None:
182
225
  return []
183
226
 
184
- # Build mapping from qualified_name to index
185
227
  qn_to_idx: dict[str, int] = {}
186
228
  idx_to_node: dict[int, GraphNode] = {}
187
229
  for i, node in enumerate(nodes):
@@ -191,7 +233,8 @@ def _detect_leiden(
191
233
  if not qn_to_idx:
192
234
  return []
193
235
 
194
- # Build igraph graph (undirected, weighted)
236
+ logger.info("Building igraph with %d nodes...", len(qn_to_idx))
237
+
195
238
  g = ig.Graph(n=len(qn_to_idx), directed=False)
196
239
  edge_list: list[tuple[int, int]] = []
197
240
  weights: list[float] = []
@@ -208,20 +251,28 @@ def _detect_leiden(
208
251
  weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))
209
252
 
210
253
  if not edge_list:
211
- # No edges — fall back to file grouping
212
254
  return _detect_file_based(nodes, edges, min_size)
213
255
 
214
256
  g.add_edges(edge_list)
215
257
  g.es["weight"] = weights
216
258
 
217
- # Run Leiden
259
+ logger.info(
260
+ "Running Leiden on %d nodes, %d edges...",
261
+ g.vcount(), g.ecount(),
262
+ )
263
+
218
264
  partition = g.community_leiden(
219
265
  objective_function="modularity",
220
266
  weights="weight",
267
+ n_iterations=2,
221
268
  )
222
269
 
223
- # Build communities from partition
224
- communities: list[dict[str, Any]] = []
270
+ logger.info(
271
+ "Leiden complete, found %d partitions. Computing cohesion...",
272
+ len(partition),
273
+ )
274
+
275
+ pending: list[tuple[list[GraphNode], set[str]]] = []
225
276
  for cluster_ids in partition:
226
277
  if len(cluster_ids) < min_size:
227
278
  continue
@@ -229,7 +280,12 @@ def _detect_leiden(
229
280
  if len(members) < min_size:
230
281
  continue
231
282
  member_qns = {m.qualified_name for m in members}
232
- cohesion = _compute_cohesion(member_qns, edges)
283
+ pending.append((members, member_qns))
284
+
285
+ cohesions = _compute_cohesion_batch([p[1] for p in pending], edges)
286
+
287
+ communities: list[dict[str, Any]] = []
288
+ for (members, member_qns), cohesion in zip(pending, cohesions):
233
289
  lang_counts = Counter(m.language for m in members if m.language)
234
290
  dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""
235
291
  name = _generate_community_name(members)
@@ -245,94 +301,8 @@ def _detect_leiden(
245
301
  "member_qns": member_qns,
246
302
  })
247
303
 
248
- # Second pass: split large communities (>50 nodes)
249
- final: list[dict[str, Any]] = []
250
- for comm in communities:
251
- if comm["size"] > 50:
252
- sub_nodes = [n for n in nodes if n.qualified_name in comm["member_qns"]]
253
- sub_edges = [
254
- e for e in edges
255
- if e.source_qualified in comm["member_qns"]
256
- and e.target_qualified in comm["member_qns"]
257
- ]
258
- subs = _detect_leiden_sub(sub_nodes, sub_edges, min_size, parent_name=comm["name"])
259
- if len(subs) >= 2:
260
- final.extend(subs)
261
- else:
262
- final.append(comm)
263
- else:
264
- final.append(comm)
265
-
266
- return final
267
-
268
-
269
- def _detect_leiden_sub(
270
- nodes: list[GraphNode],
271
- edges: list[GraphEdge],
272
- min_size: int,
273
- parent_name: str,
274
- ) -> list[dict[str, Any]]:
275
- """Second-pass Leiden on a large community for sub-communities."""
276
- if ig is None:
277
- return []
278
-
279
- qn_to_idx: dict[str, int] = {}
280
- idx_to_node: dict[int, GraphNode] = {}
281
- for i, node in enumerate(nodes):
282
- qn_to_idx[node.qualified_name] = i
283
- idx_to_node[i] = node
284
-
285
- g = ig.Graph(n=len(qn_to_idx), directed=False)
286
- edge_list: list[tuple[int, int]] = []
287
- weights: list[float] = []
288
- seen_edges: set[tuple[int, int]] = set()
289
-
290
- for e in edges:
291
- src_idx = qn_to_idx.get(e.source_qualified)
292
- tgt_idx = qn_to_idx.get(e.target_qualified)
293
- if src_idx is not None and tgt_idx is not None and src_idx != tgt_idx:
294
- pair = (min(src_idx, tgt_idx), max(src_idx, tgt_idx))
295
- if pair not in seen_edges:
296
- seen_edges.add(pair)
297
- edge_list.append(pair)
298
- weights.append(EDGE_WEIGHTS.get(e.kind, 0.5))
299
-
300
- if not edge_list:
301
- return []
302
-
303
- g.add_edges(edge_list)
304
- g.es["weight"] = weights
305
-
306
- partition = g.community_leiden(
307
- objective_function="modularity",
308
- weights="weight",
309
- )
310
-
311
- subs: list[dict[str, Any]] = []
312
- for idx, cluster_ids in enumerate(partition):
313
- if len(cluster_ids) < min_size:
314
- continue
315
- members = [idx_to_node[i] for i in cluster_ids if i in idx_to_node]
316
- if len(members) < min_size:
317
- continue
318
- member_qns = {m.qualified_name for m in members}
319
- cohesion = _compute_cohesion(member_qns, edges)
320
- lang_counts = Counter(m.language for m in members if m.language)
321
- dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""
322
- name = _generate_community_name(members)
323
-
324
- subs.append({
325
- "name": f"{parent_name}/{name}",
326
- "level": 1,
327
- "size": len(members),
328
- "cohesion": round(cohesion, 4),
329
- "dominant_language": dominant_lang,
330
- "description": f"Sub-community of {len(members)} nodes within {parent_name}",
331
- "members": [m.qualified_name for m in members],
332
- "member_qns": member_qns,
333
- })
334
-
335
- return subs
304
+ logger.info("Community detection complete: %d communities", len(communities))
305
+ return communities
336
306
 
337
307
 
338
308
  # ---------------------------------------------------------------------------
@@ -348,12 +318,21 @@ def _detect_file_based(
348
318
  for n in nodes:
349
319
  by_file[n.file_path].append(n)
350
320
 
351
- communities: list[dict[str, Any]] = []
321
+ # Pre-filter to communities meeting min_size and collect their member
322
+ # sets so we can batch-compute all cohesions in a single O(edges) pass.
323
+ # Without this, per-community cohesion is O(edges * files), which makes
324
+ # community detection effectively hang on large repos.
325
+ pending: list[tuple[str, list[GraphNode], set[str]]] = []
352
326
  for file_path, members in by_file.items():
353
327
  if len(members) < min_size:
354
328
  continue
355
329
  member_qns = {m.qualified_name for m in members}
356
- cohesion = _compute_cohesion(member_qns, edges)
330
+ pending.append((file_path, members, member_qns))
331
+
332
+ cohesions = _compute_cohesion_batch([p[2] for p in pending], edges)
333
+
334
+ communities: list[dict[str, Any]] = []
335
+ for (file_path, members, member_qns), cohesion in zip(pending, cohesions):
357
336
  lang_counts = Counter(m.language for m in members if m.language)
358
337
  dominant_lang = lang_counts.most_common(1)[0][0] if lang_counts else ""
359
338
  name = _generate_community_name(members)
@@ -397,6 +376,8 @@ def detect_communities(
397
376
  all_edges = store.get_all_edges()
398
377
  all_files = store.get_all_files()
399
378
 
379
+ logger.info("Loading nodes from %d files...", len(all_files))
380
+
400
381
  nodes: list[GraphNode] = []
401
382
  for fp in all_files:
402
383
  nodes.extend(store.get_nodes_by_file(fp))
@@ -416,6 +397,11 @@ def detect_communities(
416
397
  seen_qns.add(n.qualified_name)
417
398
  unique_nodes.append(n)
418
399
 
400
+ logger.info(
401
+ "Loaded %d unique nodes, %d edges",
402
+ len(unique_nodes), len(all_edges),
403
+ )
404
+
419
405
  if IGRAPH_AVAILABLE:
420
406
  logger.info("Detecting communities with Leiden algorithm (igraph)")
421
407
  results = _detect_leiden(unique_nodes, all_edges, min_size)
@@ -493,36 +479,44 @@ def store_communities(
493
479
  # that are tightly coupled to the DB transaction lifecycle.
494
480
  conn = store._conn
495
481
 
496
- # Clear existing data
497
- conn.execute("DELETE FROM communities")
498
- conn.execute("UPDATE nodes SET community_id = NULL")
499
-
500
- count = 0
501
- for comm in communities:
502
- cursor = conn.execute(
503
- """INSERT INTO communities (name, level, cohesion, size, dominant_language, description)
504
- VALUES (?, ?, ?, ?, ?, ?)""",
505
- (
506
- comm["name"],
507
- comm.get("level", 0),
508
- comm.get("cohesion", 0.0),
509
- comm["size"],
510
- comm.get("dominant_language", ""),
511
- comm.get("description", ""),
512
- ),
513
- )
514
- community_id = cursor.lastrowid
515
-
516
- # Update community_id on member nodes
517
- member_qns = comm.get("members", [])
518
- for qn in member_qns:
519
- conn.execute(
520
- "UPDATE nodes SET community_id = ? WHERE qualified_name = ?",
521
- (community_id, qn),
482
+ # Wrap in explicit transaction so the DELETE + INSERT + UPDATE
483
+ # sequence is atomic — no partial community data on crash.
484
+ conn.execute("BEGIN IMMEDIATE")
485
+ try:
486
+ conn.execute("DELETE FROM communities")
487
+ conn.execute("UPDATE nodes SET community_id = NULL")
488
+
489
+ count = 0
490
+ for comm in communities:
491
+ cursor = conn.execute(
492
+ """INSERT INTO communities
493
+ (name, level, cohesion, size, dominant_language, description)
494
+ VALUES (?, ?, ?, ?, ?, ?)""",
495
+ (
496
+ comm["name"],
497
+ comm.get("level", 0),
498
+ comm.get("cohesion", 0.0),
499
+ comm["size"],
500
+ comm.get("dominant_language", ""),
501
+ comm.get("description", ""),
502
+ ),
522
503
  )
523
- count += 1
524
-
525
- conn.commit()
504
+ community_id = cursor.lastrowid
505
+
506
+ # Batch update community_id on member nodes
507
+ member_qns = comm.get("members", [])
508
+ if member_qns:
509
+ placeholders = ",".join("?" * len(member_qns))
510
+ conn.execute(
511
+ f"UPDATE nodes SET community_id = ? WHERE qualified_name IN ({placeholders})", # nosec B608
512
+ [community_id] + member_qns,
513
+ )
514
+ count += 1
515
+
516
+ conn.commit()
517
+ except BaseException:
518
+ conn.rollback()
519
+ raise
526
520
  return count
527
521
 
528
522
 
@@ -366,7 +366,10 @@ class EmbeddingStore:
366
366
  self.provider = get_provider(provider, model=model)
367
367
  self.available = self.provider is not None
368
368
  self.db_path = Path(db_path)
369
- self._conn = sqlite3.connect(str(self.db_path), timeout=30, check_same_thread=False)
369
+ self._conn = sqlite3.connect(
370
+ str(self.db_path), timeout=30, check_same_thread=False,
371
+ isolation_level=None,
372
+ )
370
373
  self._conn.row_factory = sqlite3.Row
371
374
  self._conn.executescript(_EMBEDDINGS_SCHEMA)
372
375
 
@@ -314,41 +314,47 @@ def store_flows(store: GraphStore, flows: list[dict]) -> int:
314
314
  # tightly coupled to the DB transaction lifecycle.
315
315
  conn = store._conn
316
316
 
317
- # Clear old data.
318
- conn.execute("DELETE FROM flow_memberships")
319
- conn.execute("DELETE FROM flows")
320
-
321
- count = 0
322
- for flow in flows:
323
- path_json = json.dumps(flow.get("path", []))
324
- conn.execute(
325
- """INSERT INTO flows
326
- (name, entry_point_id, depth, node_count, file_count,
327
- criticality, path_json)
328
- VALUES (?, ?, ?, ?, ?, ?, ?)""",
329
- (
330
- flow["name"],
331
- flow["entry_point_id"],
332
- flow["depth"],
333
- flow["node_count"],
334
- flow["file_count"],
335
- flow["criticality"],
336
- path_json,
337
- ),
338
- )
339
- flow_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
340
-
341
- # Insert memberships.
342
- node_ids = flow.get("path", [])
343
- for position, node_id in enumerate(node_ids):
317
+ # Wrap the full DELETE + INSERT sequence in an explicit transaction
318
+ # so partial writes cannot occur if an exception interrupts the loop.
319
+ conn.execute("BEGIN IMMEDIATE")
320
+ try:
321
+ conn.execute("DELETE FROM flow_memberships")
322
+ conn.execute("DELETE FROM flows")
323
+
324
+ count = 0
325
+ for flow in flows:
326
+ path_json = json.dumps(flow.get("path", []))
344
327
  conn.execute(
345
- "INSERT OR IGNORE INTO flow_memberships (flow_id, node_id, position) "
346
- "VALUES (?, ?, ?)",
347
- (flow_id, node_id, position),
328
+ """INSERT INTO flows
329
+ (name, entry_point_id, depth, node_count, file_count,
330
+ criticality, path_json)
331
+ VALUES (?, ?, ?, ?, ?, ?, ?)""",
332
+ (
333
+ flow["name"],
334
+ flow["entry_point_id"],
335
+ flow["depth"],
336
+ flow["node_count"],
337
+ flow["file_count"],
338
+ flow["criticality"],
339
+ path_json,
340
+ ),
348
341
  )
349
- count += 1
350
-
351
- conn.commit()
342
+ flow_id = conn.execute("SELECT last_insert_rowid()").fetchone()[0]
343
+
344
+ # Insert memberships.
345
+ node_ids = flow.get("path", [])
346
+ for position, node_id in enumerate(node_ids):
347
+ conn.execute(
348
+ "INSERT OR IGNORE INTO flow_memberships (flow_id, node_id, position) "
349
+ "VALUES (?, ?, ?)",
350
+ (flow_id, node_id, position),
351
+ )
352
+ count += 1
353
+
354
+ conn.commit()
355
+ except BaseException:
356
+ conn.rollback()
357
+ raise
352
358
  return count
353
359
 
354
360