emdash_core-0.1.7-py3-none-any.whl

This diff shows the content of a publicly available package version as published to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in the public registry.
Files changed (187)
  1. emdash_core/__init__.py +3 -0
  2. emdash_core/agent/__init__.py +37 -0
  3. emdash_core/agent/agents.py +225 -0
  4. emdash_core/agent/code_reviewer.py +476 -0
  5. emdash_core/agent/compaction.py +143 -0
  6. emdash_core/agent/context_manager.py +140 -0
  7. emdash_core/agent/events.py +338 -0
  8. emdash_core/agent/handlers.py +224 -0
  9. emdash_core/agent/inprocess_subagent.py +377 -0
  10. emdash_core/agent/mcp/__init__.py +50 -0
  11. emdash_core/agent/mcp/client.py +346 -0
  12. emdash_core/agent/mcp/config.py +302 -0
  13. emdash_core/agent/mcp/manager.py +496 -0
  14. emdash_core/agent/mcp/tool_factory.py +213 -0
  15. emdash_core/agent/prompts/__init__.py +38 -0
  16. emdash_core/agent/prompts/main_agent.py +104 -0
  17. emdash_core/agent/prompts/subagents.py +131 -0
  18. emdash_core/agent/prompts/workflow.py +136 -0
  19. emdash_core/agent/providers/__init__.py +34 -0
  20. emdash_core/agent/providers/base.py +143 -0
  21. emdash_core/agent/providers/factory.py +80 -0
  22. emdash_core/agent/providers/models.py +220 -0
  23. emdash_core/agent/providers/openai_provider.py +463 -0
  24. emdash_core/agent/providers/transformers_provider.py +217 -0
  25. emdash_core/agent/research/__init__.py +81 -0
  26. emdash_core/agent/research/agent.py +143 -0
  27. emdash_core/agent/research/controller.py +254 -0
  28. emdash_core/agent/research/critic.py +428 -0
  29. emdash_core/agent/research/macros.py +469 -0
  30. emdash_core/agent/research/planner.py +449 -0
  31. emdash_core/agent/research/researcher.py +436 -0
  32. emdash_core/agent/research/state.py +523 -0
  33. emdash_core/agent/research/synthesizer.py +594 -0
  34. emdash_core/agent/reviewer_profile.py +475 -0
  35. emdash_core/agent/rules.py +123 -0
  36. emdash_core/agent/runner.py +601 -0
  37. emdash_core/agent/session.py +262 -0
  38. emdash_core/agent/spec_schema.py +66 -0
  39. emdash_core/agent/specification.py +479 -0
  40. emdash_core/agent/subagent.py +397 -0
  41. emdash_core/agent/subagent_prompts.py +13 -0
  42. emdash_core/agent/toolkit.py +482 -0
  43. emdash_core/agent/toolkits/__init__.py +64 -0
  44. emdash_core/agent/toolkits/base.py +96 -0
  45. emdash_core/agent/toolkits/explore.py +47 -0
  46. emdash_core/agent/toolkits/plan.py +55 -0
  47. emdash_core/agent/tools/__init__.py +141 -0
  48. emdash_core/agent/tools/analytics.py +436 -0
  49. emdash_core/agent/tools/base.py +131 -0
  50. emdash_core/agent/tools/coding.py +484 -0
  51. emdash_core/agent/tools/github_mcp.py +592 -0
  52. emdash_core/agent/tools/history.py +13 -0
  53. emdash_core/agent/tools/modes.py +153 -0
  54. emdash_core/agent/tools/plan.py +206 -0
  55. emdash_core/agent/tools/plan_write.py +135 -0
  56. emdash_core/agent/tools/search.py +412 -0
  57. emdash_core/agent/tools/spec.py +341 -0
  58. emdash_core/agent/tools/task.py +262 -0
  59. emdash_core/agent/tools/task_output.py +204 -0
  60. emdash_core/agent/tools/tasks.py +454 -0
  61. emdash_core/agent/tools/traversal.py +588 -0
  62. emdash_core/agent/tools/web.py +179 -0
  63. emdash_core/analytics/__init__.py +5 -0
  64. emdash_core/analytics/engine.py +1286 -0
  65. emdash_core/api/__init__.py +5 -0
  66. emdash_core/api/agent.py +308 -0
  67. emdash_core/api/agents.py +154 -0
  68. emdash_core/api/analyze.py +264 -0
  69. emdash_core/api/auth.py +173 -0
  70. emdash_core/api/context.py +77 -0
  71. emdash_core/api/db.py +121 -0
  72. emdash_core/api/embed.py +131 -0
  73. emdash_core/api/feature.py +143 -0
  74. emdash_core/api/health.py +93 -0
  75. emdash_core/api/index.py +162 -0
  76. emdash_core/api/plan.py +110 -0
  77. emdash_core/api/projectmd.py +210 -0
  78. emdash_core/api/query.py +320 -0
  79. emdash_core/api/research.py +122 -0
  80. emdash_core/api/review.py +161 -0
  81. emdash_core/api/router.py +76 -0
  82. emdash_core/api/rules.py +116 -0
  83. emdash_core/api/search.py +119 -0
  84. emdash_core/api/spec.py +99 -0
  85. emdash_core/api/swarm.py +223 -0
  86. emdash_core/api/tasks.py +109 -0
  87. emdash_core/api/team.py +120 -0
  88. emdash_core/auth/__init__.py +17 -0
  89. emdash_core/auth/github.py +389 -0
  90. emdash_core/config.py +74 -0
  91. emdash_core/context/__init__.py +52 -0
  92. emdash_core/context/models.py +50 -0
  93. emdash_core/context/providers/__init__.py +11 -0
  94. emdash_core/context/providers/base.py +74 -0
  95. emdash_core/context/providers/explored_areas.py +183 -0
  96. emdash_core/context/providers/touched_areas.py +360 -0
  97. emdash_core/context/registry.py +73 -0
  98. emdash_core/context/reranker.py +199 -0
  99. emdash_core/context/service.py +260 -0
  100. emdash_core/context/session.py +352 -0
  101. emdash_core/core/__init__.py +104 -0
  102. emdash_core/core/config.py +454 -0
  103. emdash_core/core/exceptions.py +55 -0
  104. emdash_core/core/models.py +265 -0
  105. emdash_core/core/review_config.py +57 -0
  106. emdash_core/db/__init__.py +67 -0
  107. emdash_core/db/auth.py +134 -0
  108. emdash_core/db/models.py +91 -0
  109. emdash_core/db/provider.py +222 -0
  110. emdash_core/db/providers/__init__.py +5 -0
  111. emdash_core/db/providers/supabase.py +452 -0
  112. emdash_core/embeddings/__init__.py +24 -0
  113. emdash_core/embeddings/indexer.py +534 -0
  114. emdash_core/embeddings/models.py +192 -0
  115. emdash_core/embeddings/providers/__init__.py +7 -0
  116. emdash_core/embeddings/providers/base.py +112 -0
  117. emdash_core/embeddings/providers/fireworks.py +141 -0
  118. emdash_core/embeddings/providers/openai.py +104 -0
  119. emdash_core/embeddings/registry.py +146 -0
  120. emdash_core/embeddings/service.py +215 -0
  121. emdash_core/graph/__init__.py +26 -0
  122. emdash_core/graph/builder.py +134 -0
  123. emdash_core/graph/connection.py +692 -0
  124. emdash_core/graph/schema.py +416 -0
  125. emdash_core/graph/writer.py +667 -0
  126. emdash_core/ingestion/__init__.py +7 -0
  127. emdash_core/ingestion/change_detector.py +150 -0
  128. emdash_core/ingestion/git/__init__.py +5 -0
  129. emdash_core/ingestion/git/commit_analyzer.py +196 -0
  130. emdash_core/ingestion/github/__init__.py +6 -0
  131. emdash_core/ingestion/github/pr_fetcher.py +296 -0
  132. emdash_core/ingestion/github/task_extractor.py +100 -0
  133. emdash_core/ingestion/orchestrator.py +540 -0
  134. emdash_core/ingestion/parsers/__init__.py +10 -0
  135. emdash_core/ingestion/parsers/base_parser.py +66 -0
  136. emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
  137. emdash_core/ingestion/parsers/class_extractor.py +154 -0
  138. emdash_core/ingestion/parsers/function_extractor.py +202 -0
  139. emdash_core/ingestion/parsers/import_analyzer.py +119 -0
  140. emdash_core/ingestion/parsers/python_parser.py +123 -0
  141. emdash_core/ingestion/parsers/registry.py +72 -0
  142. emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
  143. emdash_core/ingestion/parsers/typescript_parser.py +278 -0
  144. emdash_core/ingestion/repository.py +346 -0
  145. emdash_core/models/__init__.py +38 -0
  146. emdash_core/models/agent.py +68 -0
  147. emdash_core/models/index.py +77 -0
  148. emdash_core/models/query.py +113 -0
  149. emdash_core/planning/__init__.py +7 -0
  150. emdash_core/planning/agent_api.py +413 -0
  151. emdash_core/planning/context_builder.py +265 -0
  152. emdash_core/planning/feature_context.py +232 -0
  153. emdash_core/planning/feature_expander.py +646 -0
  154. emdash_core/planning/llm_explainer.py +198 -0
  155. emdash_core/planning/similarity.py +509 -0
  156. emdash_core/planning/team_focus.py +821 -0
  157. emdash_core/server.py +153 -0
  158. emdash_core/sse/__init__.py +5 -0
  159. emdash_core/sse/stream.py +196 -0
  160. emdash_core/swarm/__init__.py +17 -0
  161. emdash_core/swarm/merge_agent.py +383 -0
  162. emdash_core/swarm/session_manager.py +274 -0
  163. emdash_core/swarm/swarm_runner.py +226 -0
  164. emdash_core/swarm/task_definition.py +137 -0
  165. emdash_core/swarm/worker_spawner.py +319 -0
  166. emdash_core/swarm/worktree_manager.py +278 -0
  167. emdash_core/templates/__init__.py +10 -0
  168. emdash_core/templates/defaults/agent-builder.md.template +82 -0
  169. emdash_core/templates/defaults/focus.md.template +115 -0
  170. emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
  171. emdash_core/templates/defaults/pr-review.md.template +80 -0
  172. emdash_core/templates/defaults/project.md.template +85 -0
  173. emdash_core/templates/defaults/research_critic.md.template +112 -0
  174. emdash_core/templates/defaults/research_planner.md.template +85 -0
  175. emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
  176. emdash_core/templates/defaults/reviewer.md.template +81 -0
  177. emdash_core/templates/defaults/spec.md.template +41 -0
  178. emdash_core/templates/defaults/tasks.md.template +78 -0
  179. emdash_core/templates/loader.py +296 -0
  180. emdash_core/utils/__init__.py +45 -0
  181. emdash_core/utils/git.py +84 -0
  182. emdash_core/utils/image.py +502 -0
  183. emdash_core/utils/logger.py +51 -0
  184. emdash_core-0.1.7.dist-info/METADATA +35 -0
  185. emdash_core-0.1.7.dist-info/RECORD +187 -0
  186. emdash_core-0.1.7.dist-info/WHEEL +4 -0
  187. emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
emdash_core/analytics/engine.py
@@ -0,0 +1,1286 @@
+ """Analytics engine for computing graph metrics."""
+
+ import os
+ import networkx as nx
+ from typing import Dict, List, Tuple, Optional, Iterable
+ from datetime import datetime, timedelta
+ from collections import defaultdict
+ import math
+ import community as community_louvain  # python-louvain
+
+ from ..graph.connection import KuzuConnection, get_connection, get_read_connection, write_lock_context
+ from ..agent.providers import get_provider
+ from ..agent.providers.factory import DEFAULT_MODEL
+ from ..utils.logger import log
+
+
+ class AnalyticsEngine:
+     """Computes graph analytics metrics on the knowledge graph."""
+
+     def __init__(self, connection: KuzuConnection = None, read_only: bool = True):
+         """Initialize analytics engine.
+
+         Args:
+             connection: Kuzu connection. If None, uses appropriate global connection.
+             read_only: If True and no connection provided, use read-only connection.
+         """
+         if connection:
+             self.connection = connection
+         elif read_only:
+             self.connection = get_read_connection()
+         else:
+             self.connection = get_connection()
+
+     def compute_pagerank(
+         self,
+         damping: float = 0.85,
+         max_iter: int = 100,
+         write_back: bool = True,
+         use_ast_nodes: bool = False
+     ) -> Dict[str, float]:
+         """Compute PageRank scores for all code entities.
+
+         PageRank identifies the most "important" entities based on
+         how many other entities reference/call them.
+
+         Args:
+             damping: Damping parameter (default 0.85)
+             max_iter: Maximum iterations
+             write_back: Whether to write scores back to database
+             use_ast_nodes: If True, use ASTNodes (legacy). If False (default),
+                 use Function/Class nodes with CALLS relationships.
+
+         Returns:
+             Dictionary mapping entity qualified_name to PageRank score
+         """
+         log.info(f"Computing PageRank scores (use_ast_nodes={use_ast_nodes})...")
+
+         # Build NetworkX graph from Kuzu
+         graph = self._build_code_graph(use_ast_nodes=use_ast_nodes)
+
+         if len(graph.nodes) == 0:
+             log.warning("Graph is empty, cannot compute PageRank")
+             return {}
+
+         # Compute PageRank
+         pagerank_scores = nx.pagerank(
+             graph,
+             alpha=damping,
+             max_iter=max_iter
+         )
+
+         log.info(f"Computed PageRank for {len(pagerank_scores)} entities")
+
+         # Write back to database
+         if write_back:
+             self._write_pagerank_scores(pagerank_scores, use_ast_nodes=use_ast_nodes)
+
+         return pagerank_scores
+
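For illustration, the PageRank pass above is essentially one networkx call over the call graph that _build_code_graph assembles. A minimal stand-alone sketch of the same idea, with invented entity names:

import networkx as nx

# Toy call graph: an edge A -> B means "A calls B".
calls = nx.DiGraph()
calls.add_edges_from([
    ("api.handler", "core.service"),
    ("cli.main", "core.service"),
    ("core.service", "db.query"),
    ("core.service", "utils.log"),
])

# Heavily referenced entities such as "core.service" float to the top.
scores = nx.pagerank(calls, alpha=0.85, max_iter=100)
for name, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name:15} {score:.3f}")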
+     def compute_betweenness_centrality(
+         self,
+         normalized: bool = True,
+         write_back: bool = True
+     ) -> Dict[str, float]:
+         """Compute Betweenness Centrality for all code entities.
+
+         Betweenness identifies "bridge" entities that connect different
+         parts of the codebase.
+
+         Args:
+             normalized: Whether to normalize scores
+             write_back: Whether to write scores back to database
+
+         Returns:
+             Dictionary mapping entity qualified_name to betweenness score
+         """
+         log.info("Computing Betweenness Centrality...")
+
+         # Build NetworkX graph from Kuzu
+         graph = self._build_code_graph()
+
+         if len(graph.nodes) == 0:
+             log.warning("Graph is empty, cannot compute Betweenness")
+             return {}
+
+         # Compute Betweenness Centrality
+         betweenness_scores = nx.betweenness_centrality(
+             graph,
+             normalized=normalized
+         )
+
+         log.info(f"Computed Betweenness for {len(betweenness_scores)} entities")
+
+         # Write back to database
+         if write_back:
+             self._write_betweenness_scores(betweenness_scores)
+
+         return betweenness_scores
+
+     def detect_communities(
+         self,
+         resolution: float = 1.0,
+         write_back: bool = True,
+         describe: bool = False,
+         describe_model: str = DEFAULT_MODEL,
+         overwrite_descriptions: bool = False,
+     ) -> Dict[str, int]:
+         """Detect communities/clusters using Louvain algorithm.
+
+         Identifies modules or clusters of tightly-coupled code entities.
+
+         Args:
+             resolution: Resolution parameter for Louvain algorithm
+             write_back: Whether to write community IDs back to database
+             describe: Whether to auto-generate community descriptions via LLM
+             describe_model: LLM model to use for descriptions
+             overwrite_descriptions: Whether to overwrite existing descriptions
+
+         Returns:
+             Dictionary mapping entity qualified_name to community ID
+         """
+         log.info("Detecting communities with Louvain algorithm...")
+
+         # Build NetworkX graph from Kuzu (undirected for community detection)
+         graph = self._build_code_graph(directed=False)
+
+         if len(graph.nodes) == 0:
+             log.warning("Graph is empty, cannot detect communities")
+             return {}
+
+         # Compute communities using Louvain
+         communities = community_louvain.best_partition(
+             graph,
+             resolution=resolution
+         )
+
+         # Count communities
+         num_communities = len(set(communities.values()))
+         log.info(f"Detected {num_communities} communities")
+
+         # Write back to database
+         if write_back:
+             self._write_community_assignments(communities)
+             self._ensure_community_nodes(set(communities.values()))
+             if describe:
+                 self.generate_community_descriptions(
+                     community_ids=set(communities.values()),
+                     model=describe_model,
+                     overwrite=overwrite_descriptions,
+                 )
+
+         return communities
+
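Likewise, community detection is a direct call into python-louvain on the undirected version of the graph. A small sketch on a hand-built graph (node names invented):

import networkx as nx
import community as community_louvain  # python-louvain

g = nx.Graph()
# Two tight clusters joined by a single bridge edge.
g.add_edges_from([
    ("a1", "a2"), ("a2", "a3"), ("a1", "a3"),
    ("b1", "b2"), ("b2", "b3"), ("b1", "b3"),
    ("a3", "b1"),
])

# best_partition maps each node to an integer community id,
# e.g. {"a1": 0, "a2": 0, "a3": 0, "b1": 1, "b2": 1, "b3": 1}
partition = community_louvain.best_partition(g, resolution=1.0)
print(partition)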
174
+ def get_top_pagerank(
175
+ self,
176
+ limit: int = 20,
177
+ use_ast_nodes: bool = False
178
+ ) -> List[Tuple[str, str, float]]:
179
+ """Get top entities by PageRank score.
180
+
181
+ Args:
182
+ limit: Number of results to return
183
+ use_ast_nodes: If True, query ASTNodes. If False, query Class/Function.
184
+
185
+ Returns:
186
+ List of (qualified_name, entity_type, score) tuples
187
+ """
188
+ if use_ast_nodes:
189
+ results = self.connection.execute("""
190
+ MATCH (n:ASTNode)
191
+ WHERE n.pagerank IS NOT NULL
192
+ RETURN n.id AS name,
193
+ n.ast_type AS type,
194
+ n.pagerank AS score
195
+ ORDER BY n.pagerank DESC
196
+ LIMIT $limit
197
+ """, {"limit": limit})
198
+ else:
199
+ # Query functions and classes separately, then merge
200
+ func_results = self.connection.execute("""
201
+ MATCH (n:Function)
202
+ WHERE n.pagerank IS NOT NULL
203
+ RETURN n.qualified_name AS name,
204
+ 'Function' AS type,
205
+ n.pagerank AS score
206
+ ORDER BY n.pagerank DESC
207
+ LIMIT $limit
208
+ """, {"limit": limit})
209
+
210
+ class_results = self.connection.execute("""
211
+ MATCH (n:Class)
212
+ WHERE n.pagerank IS NOT NULL
213
+ RETURN n.qualified_name AS name,
214
+ 'Class' AS type,
215
+ n.pagerank AS score
216
+ ORDER BY n.pagerank DESC
217
+ LIMIT $limit
218
+ """, {"limit": limit})
219
+
220
+ results = func_results + class_results
221
+ results.sort(key=lambda x: x['score'], reverse=True)
222
+ results = results[:limit]
223
+
224
+ return [(r["name"], r["type"], r["score"]) for r in results]
225
+
226
+ def get_top_betweenness(self, limit: int = 20) -> List[Tuple[str, str, float]]:
227
+ """Get top entities by Betweenness Centrality.
228
+
229
+ Args:
230
+ limit: Number of results to return
231
+
232
+ Returns:
233
+ List of (qualified_name, entity_type, score) tuples
234
+ """
235
+ # Query functions and classes separately
236
+ func_results = self.connection.execute("""
237
+ MATCH (n:Function)
238
+ WHERE n.betweenness IS NOT NULL
239
+ RETURN n.qualified_name AS name,
240
+ 'Function' AS type,
241
+ n.betweenness AS score
242
+ ORDER BY n.betweenness DESC
243
+ LIMIT $limit
244
+ """, {"limit": limit})
245
+
246
+ class_results = self.connection.execute("""
247
+ MATCH (n:Class)
248
+ WHERE n.betweenness IS NOT NULL
249
+ RETURN n.qualified_name AS name,
250
+ 'Class' AS type,
251
+ n.betweenness AS score
252
+ ORDER BY n.betweenness DESC
253
+ LIMIT $limit
254
+ """, {"limit": limit})
255
+
256
+ results = func_results + class_results
257
+ results.sort(key=lambda x: x['score'], reverse=True)
258
+ return [(r["name"], r["type"], r["score"]) for r in results[:limit]]
259
+
260
+ def get_communities_summary(self, max_members: int = 5) -> List[Dict]:
261
+ """Get summary of detected communities.
262
+
263
+ Args:
264
+ max_members: Maximum number of member names to return per community
265
+
266
+ Returns:
267
+ List of community summaries with member counts
268
+ """
269
+ descriptions = self._fetch_community_descriptions()
270
+
271
+ # Query functions and classes with community assignments
272
+ func_results = self.connection.execute("""
273
+ MATCH (n:Function)
274
+ WHERE n.community IS NOT NULL
275
+ RETURN n.community AS community_id,
276
+ n.qualified_name AS qualified_name
277
+ """)
278
+
279
+ class_results = self.connection.execute("""
280
+ MATCH (n:Class)
281
+ WHERE n.community IS NOT NULL
282
+ RETURN n.community AS community_id,
283
+ n.qualified_name AS qualified_name
284
+ """)
285
+
286
+ # Group by community
287
+ communities = defaultdict(list)
288
+ for r in func_results + class_results:
289
+ communities[r['community_id']].append(r['qualified_name'])
290
+
291
+ # Build summary
292
+ summaries = []
293
+ for community_id, members in communities.items():
294
+ summaries.append({
295
+ 'community_id': community_id,
296
+ 'member_count': len(members),
297
+ 'sample_members': members[:max_members],
298
+ 'description': descriptions.get(community_id)
299
+ })
300
+
301
+ summaries.sort(key=lambda x: x['member_count'], reverse=True)
302
+ return summaries
303
+
304
+ def get_community_members(self, community_id: int) -> List[Dict]:
305
+ """Get all members of a specific community.
306
+
307
+ Args:
308
+ community_id: The community ID to query
309
+
310
+ Returns:
311
+ List of member details (name, type, qualified_name)
312
+ """
313
+ func_results = self.connection.execute("""
314
+ MATCH (n:Function)
315
+ WHERE n.community = $community_id
316
+ RETURN n.name AS name,
317
+ 'Function' AS type,
318
+ n.qualified_name AS qualified_name,
319
+ n.file_path AS file_path
320
+ ORDER BY n.name
321
+ """, {"community_id": community_id})
322
+
323
+ class_results = self.connection.execute("""
324
+ MATCH (n:Class)
325
+ WHERE n.community = $community_id
326
+ RETURN n.name AS name,
327
+ 'Class' AS type,
328
+ n.qualified_name AS qualified_name,
329
+ n.file_path AS file_path
330
+ ORDER BY n.name
331
+ """, {"community_id": community_id})
332
+
333
+ return func_results + class_results
334
+
+     def search_communities(
+         self,
+         query: Optional[str] = None,
+         qualified_names: Optional[List[str]] = None,
+         limit: int = 10,
+         max_members: int = 5,
+         min_score: float = 0.3,
+     ) -> List[Dict]:
+         """Search for communities by semantic similarity or member matching.
+
+         Two search modes:
+         1. Query mode: Search community descriptions by embedding similarity
+         2. Member mode: Find communities containing specific code entities
+
+         Args:
+             query: Semantic search query for community descriptions
+             qualified_names: List of code entity qualified names to find communities for
+             limit: Maximum number of communities to return
+             max_members: Maximum sample members per community
+             min_score: Minimum similarity score for semantic search (0-1)
+
+         Returns:
+             List of community dicts with community_id, description, member_count, sample_members
+         """
+         if query:
+             # Semantic search mode - search by community description embedding
+             return self._search_communities_by_embedding(
+                 query=query,
+                 limit=limit,
+                 max_members=max_members,
+                 min_score=min_score,
+             )
+         elif qualified_names:
+             # Member matching mode - find communities containing these entities
+             return self._search_communities_by_members(
+                 qualified_names=qualified_names,
+                 limit=limit,
+                 max_members=max_members,
+             )
+         else:
+             # No search criteria - return top communities by size
+             return self.get_communities_summary(max_members=max_members)[:limit]
+
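A short usage sketch of the two search modes described above. The engine instance is assumed, the qualified name is hypothetical, and semantic search presumes community descriptions and embeddings have already been generated:

engine = AnalyticsEngine(read_only=True)

# Mode 1: semantic search over community descriptions.
semantic_hits = engine.search_communities(query="authentication and session handling", limit=5)

# Mode 2: find the communities that contain specific entities.
member_hits = engine.search_communities(
    qualified_names=["emdash_core.auth.github.exchange_token"],  # hypothetical entity
    limit=5,
)

for hit in semantic_hits + member_hits:
    print(hit["community_id"], hit.get("description"), hit["sample_members"])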
378
+ def _search_communities_by_embedding(
379
+ self,
380
+ query: str,
381
+ limit: int = 10,
382
+ max_members: int = 5,
383
+ min_score: float = 0.3,
384
+ ) -> List[Dict]:
385
+ """Search communities by embedding similarity on descriptions."""
386
+ from ..embeddings.service import EmbeddingService
387
+ from ..planning.similarity import cosine_similarity
388
+
389
+ embedding_service = EmbeddingService()
390
+ if not embedding_service.is_available:
391
+ log.warning("Embedding service not available. Falling back to text search.")
392
+ return self._fallback_community_text_search(query, limit, max_members)
393
+
394
+ # Generate query embedding
395
+ query_embedding = embedding_service.embed_query(query)
396
+ if not query_embedding:
397
+ log.error("Failed to generate query embedding")
398
+ return []
399
+
400
+ # Fetch all communities with embeddings
401
+ communities = self.connection.execute("""
402
+ MATCH (c:Community)
403
+ WHERE c.embedding IS NOT NULL
404
+ RETURN c.community_id AS community_id,
405
+ c.description AS description,
406
+ c.embedding AS embedding
407
+ """)
408
+
409
+ if not communities:
410
+ log.warning("No communities with embeddings found. Run `emdash embed index` first.")
411
+ return self._fallback_community_text_search(query, limit, max_members)
412
+
413
+ # Compute similarity scores
414
+ scored_communities = []
415
+ for comm in communities:
416
+ if comm.get('embedding'):
417
+ similarity = cosine_similarity(query_embedding, comm['embedding'])
418
+ if similarity >= min_score:
419
+ scored_communities.append({
420
+ 'community_id': comm['community_id'],
421
+ 'description': comm['description'],
422
+ 'score': similarity,
423
+ })
424
+
425
+ # Sort by score
426
+ scored_communities.sort(key=lambda x: x['score'], reverse=True)
427
+ scored_communities = scored_communities[:limit]
428
+
429
+ # Enrich with member info
430
+ for comm in scored_communities:
431
+ members = self.get_community_members(comm['community_id'])
432
+ comm['member_count'] = len(members)
433
+ comm['sample_members'] = [m['qualified_name'] for m in members[:max_members]]
434
+
435
+ return scored_communities
436
+
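The ranking above relies on cosine_similarity imported from the planning package, which is not shown in this diff; the conventional definition it presumably implements is:

import math

def cosine_similarity(a, b):
    """Cosine of the angle between two equal-length vectors (1.0 = same direction)."""
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0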
437
+ def _search_communities_by_members(
438
+ self,
439
+ qualified_names: List[str],
440
+ limit: int = 10,
441
+ max_members: int = 5,
442
+ ) -> List[Dict]:
443
+ """Find communities containing the given code entities."""
444
+ if not qualified_names:
445
+ return []
446
+
447
+ # Get community IDs for the given entities
448
+ func_results = self.connection.execute("""
449
+ MATCH (f:Function)
450
+ WHERE f.qualified_name IN $names AND f.community IS NOT NULL
451
+ RETURN f.community AS community_id, count(*) AS match_count
452
+ """, {"names": qualified_names})
453
+
454
+ class_results = self.connection.execute("""
455
+ MATCH (c:Class)
456
+ WHERE c.qualified_name IN $names AND c.community IS NOT NULL
457
+ RETURN c.community AS community_id, count(*) AS match_count
458
+ """, {"names": qualified_names})
459
+
460
+ # Aggregate by community
461
+ community_matches = {}
462
+ for r in func_results + class_results:
463
+ cid = r['community_id']
464
+ community_matches[cid] = community_matches.get(cid, 0) + r['match_count']
465
+
466
+ if not community_matches:
467
+ return []
468
+
469
+ # Sort by match count and fetch details
470
+ sorted_ids = sorted(
471
+ community_matches.keys(),
472
+ key=lambda x: community_matches[x],
473
+ reverse=True
474
+ )[:limit]
475
+
476
+ descriptions = self._fetch_community_descriptions()
477
+ results = []
478
+ for cid in sorted_ids:
479
+ members = self.get_community_members(cid)
480
+ results.append({
481
+ 'community_id': cid,
482
+ 'description': descriptions.get(cid),
483
+ 'member_count': len(members),
484
+ 'sample_members': [m['qualified_name'] for m in members[:max_members]],
485
+ 'match_count': community_matches[cid],
486
+ })
487
+
488
+ return results
489
+
490
+ def _fallback_community_text_search(
491
+ self,
492
+ query: str,
493
+ limit: int = 10,
494
+ max_members: int = 5,
495
+ ) -> List[Dict]:
496
+ """Fallback to text-based community search."""
497
+ results = self.connection.execute("""
498
+ MATCH (c:Community)
499
+ WHERE c.description IS NOT NULL
500
+ AND lower(c.description) CONTAINS lower($query)
501
+ RETURN c.community_id AS community_id,
502
+ c.description AS description
503
+ LIMIT $limit
504
+ """, {"query": query, "limit": limit})
505
+
506
+ # Enrich with member info
507
+ for comm in results:
508
+ members = self.get_community_members(comm['community_id'])
509
+ comm['member_count'] = len(members)
510
+ comm['sample_members'] = [m['qualified_name'] for m in members[:max_members]]
511
+
512
+ return results
513
+
514
+ def get_community_description(self, community_id: int) -> Optional[str]:
515
+ """Get a stored description for a community."""
516
+ results = self.connection.execute("""
517
+ MATCH (c:Community {community_id: $community_id})
518
+ RETURN c.description AS description
519
+ """, {"community_id": community_id})
520
+ if not results:
521
+ return None
522
+ return results[0].get("description")
523
+
524
+ def set_community_description(self, community_id: int, description: str, source: str = "manual") -> None:
525
+ """Set or update the description for a community."""
526
+ if description is None:
527
+ return
528
+
529
+ cleaned = description.strip()
530
+ if not cleaned:
531
+ return
532
+
533
+ write_conn = get_connection()
534
+ with write_lock_context("community_description"):
535
+ write_conn.execute_write("""
536
+ MERGE (c:Community {community_id: $community_id})
537
+ SET c.description = $description,
538
+ c.source = $source
539
+ """, {"community_id": community_id, "description": cleaned, "source": source})
540
+
541
+ def generate_community_descriptions(
542
+ self,
543
+ community_ids: Optional[Iterable[int]] = None,
544
+ model: str = DEFAULT_MODEL,
545
+ overwrite: bool = False,
546
+ max_members: int = 20,
547
+ progress_callback: Optional[callable] = None,
548
+ max_workers: int = 10,
549
+ ) -> Dict[int, str]:
550
+ """Generate and store LLM descriptions for communities.
551
+
552
+ Args:
553
+ community_ids: Specific community IDs to describe. If None, uses all.
554
+ model: LLM model to use for generation.
555
+ overwrite: Whether to overwrite existing descriptions.
556
+ max_members: Max members to include in the prompt.
557
+ progress_callback: Optional callback(current, total, community_id, description)
558
+ called after each community is processed.
559
+ max_workers: Maximum parallel LLM requests (default 10).
560
+ """
561
+ import threading
562
+ from concurrent.futures import ThreadPoolExecutor, as_completed
563
+
564
+ if community_ids is None:
565
+ community_ids = self._get_community_ids()
566
+
567
+ community_ids = list(community_ids)
568
+ if not community_ids:
569
+ return {}
570
+
571
+ existing = self._fetch_community_descriptions()
572
+ if not overwrite:
573
+ community_ids = [cid for cid in community_ids if not existing.get(cid)]
574
+
575
+ if not community_ids:
576
+ return {}
577
+
578
+ try:
579
+ provider = get_provider(model)
580
+ except Exception as e:
581
+ log.warning(f"LLM provider not available for community descriptions: {e}")
582
+ return {}
583
+
584
+ generated = {}
585
+ total = len(community_ids)
586
+ completed = [0] # Use list for mutable counter in closure
587
+ lock = threading.Lock()
588
+
589
+ def process_community(community_id: int) -> tuple[int, Optional[str]]:
590
+ """Process a single community - returns (community_id, description or None)."""
591
+ prompt = self._build_community_description_prompt(community_id, max_members=max_members)
592
+ if not prompt:
593
+ return (community_id, None)
594
+
595
+ try:
596
+ response = provider.chat(
597
+ messages=[{"role": "user", "content": prompt}],
598
+ system=(
599
+ "You are a senior engineer summarizing a code community. "
600
+ "Write 1-2 concise sentences describing what the community does. "
601
+ "Use plain text only. No bullets, no prefixes, no quotes."
602
+ ),
603
+ )
604
+ except Exception as e:
605
+ log.warning(f"Failed to generate description for community {community_id}: {e}")
606
+ return (community_id, None)
607
+
608
+ description = self._clean_description(response.content or "")
609
+ if not description:
610
+ return (community_id, None)
611
+
612
+ if len(description) > 400:
613
+ description = description[:397].rstrip() + "..."
614
+
615
+ return (community_id, description)
616
+
617
+ log.info(f"Generating descriptions for {total} communities with {max_workers} workers...")
618
+
619
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
620
+ futures = {executor.submit(process_community, cid): cid for cid in community_ids}
621
+
622
+ for future in as_completed(futures):
623
+ community_id, description = future.result()
624
+
625
+ with lock:
626
+ completed[0] += 1
627
+ current = completed[0]
628
+
629
+ if description:
630
+ self.set_community_description(community_id, description, source="llm")
631
+ generated[community_id] = description
632
+
633
+ if progress_callback:
634
+ progress_callback(current, total, community_id, description)
635
+
636
+ log.info(f"Generated {len(generated)} community descriptions")
637
+ return generated
638
+
639
+ def _fetch_community_descriptions(self) -> Dict[int, Optional[str]]:
640
+ """Fetch all stored community descriptions."""
641
+ results = self.connection.execute("""
642
+ MATCH (c:Community)
643
+ RETURN c.community_id AS community_id,
644
+ c.description AS description
645
+ """)
646
+ return {row["community_id"]: row.get("description") for row in results}
647
+
648
+ def detect_knowledge_silos(
649
+ self,
650
+ importance_threshold: float = 0.0001,
651
+ max_authors: int = 2
652
+ ) -> List[Dict]:
653
+ """Detect knowledge silos - critical code with few maintainers.
654
+
655
+ A knowledge silo is important code (high PageRank) that only
656
+ 1-2 people have worked on, creating a "bus factor" risk.
657
+
658
+ Args:
659
+ importance_threshold: Minimum PageRank score to consider
660
+ max_authors: Maximum author count to flag as silo
661
+
662
+ Returns:
663
+ List of knowledge silos with risk scores
664
+ """
665
+ log.info("Detecting knowledge silos...")
666
+
667
+ try:
668
+ results = self.connection.execute("""
669
+ MATCH (f:File)-[:CONTAINS_FUNCTION|CONTAINS_CLASS]->(entity)
670
+ WHERE entity.pagerank IS NOT NULL
671
+ AND entity.pagerank >= $importance_threshold
672
+ WITH f, max(entity.pagerank) AS importance,
673
+ collect(entity.qualified_name)[0] AS top_entity
674
+
675
+ MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
676
+ WITH f, importance, top_entity,
677
+ collect(DISTINCT c.author_email) AS authors
678
+ WHERE size(authors) <= $max_authors
679
+
680
+ WITH f, importance, top_entity, authors,
681
+ importance / size(authors) AS risk_score
682
+
683
+ RETURN f.path AS file_path,
684
+ top_entity AS critical_entity,
685
+ importance,
686
+ size(authors) AS author_count,
687
+ authors,
688
+ risk_score
689
+ ORDER BY risk_score DESC
690
+ """, {
691
+ "importance_threshold": importance_threshold,
692
+ "max_authors": max_authors
693
+ })
694
+
695
+ silos = []
696
+ for record in results:
697
+ silos.append({
698
+ 'file_path': record['file_path'],
699
+ 'critical_entity': record['critical_entity'],
700
+ 'importance': record['importance'],
701
+ 'author_count': record['author_count'],
702
+ 'authors': record['authors'],
703
+ 'risk_score': record['risk_score']
704
+ })
705
+
706
+ log.info(f"Found {len(silos)} knowledge silos")
707
+ return silos
708
+ except Exception as e:
709
+ log.warning(f"Failed to detect knowledge silos: {e}")
710
+ return []
711
+
712
+ def get_file_ownership(self, file_path: str) -> Dict:
713
+ """Get detailed ownership information for a file.
714
+
715
+ Args:
716
+ file_path: Path to the file
717
+
718
+ Returns:
719
+ Dictionary with ownership statistics
720
+ """
721
+ results = self.connection.execute("""
722
+ MATCH (f:File {path: $file_path})
723
+ OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
724
+ OPTIONAL MATCH (c)-[:AUTHORED_BY]->(a:Author)
725
+
726
+ WITH f,
727
+ count(DISTINCT c) AS total_commits,
728
+ collect(DISTINCT a.email) AS authors,
729
+ collect(DISTINCT a.name) AS author_names
730
+
731
+ RETURN f.path AS file_path,
732
+ total_commits,
733
+ size(authors) AS author_count,
734
+ authors,
735
+ author_names
736
+ """, {"file_path": file_path})
737
+
738
+ if not results:
739
+ return {}
740
+
741
+ record = results[0]
742
+ return {
743
+ 'file_path': record['file_path'],
744
+ 'total_commits': record['total_commits'],
745
+ 'author_count': record['author_count'],
746
+ 'authors': record['authors'],
747
+ 'author_names': record['author_names']
748
+ }
749
+
+     def compute_commit_importance(
+         self,
+         write_back: bool = True
+     ) -> Dict[str, Dict]:
+         """Compute file importance based on commit activity.
+
+         Files with more commits and more authors are considered more
+         important as they are actively maintained and have broader ownership.
+
+         Args:
+             write_back: Whether to write scores back to database
+
+         Returns:
+             Dictionary mapping file path to importance metrics
+         """
+         log.info("Computing commit-based importance...")
+
+         results = self.connection.execute("""
+             MATCH (f:File)
+             OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
+             WITH f, count(DISTINCT c) AS commit_count
+             OPTIONAL MATCH (c2:GitCommit)-[:COMMIT_MODIFIES]->(f)
+             OPTIONAL MATCH (c2)-[:AUTHORED_BY]->(a:Author)
+             WITH f, commit_count,
+                  count(DISTINCT a) AS author_count,
+                  collect(DISTINCT a.email)[0:5] AS top_authors
+             WHERE commit_count > 0
+             RETURN f.path AS file_path,
+                    commit_count,
+                    author_count,
+                    top_authors,
+                    commit_count * (1.0 + log(author_count + 1)) AS importance_score
+             ORDER BY importance_score DESC
+         """)
+
+         importance = {}
+         for record in results:
+             importance[record['file_path']] = {
+                 'commit_count': record['commit_count'],
+                 'author_count': record['author_count'],
+                 'top_authors': record['top_authors'],
+                 'importance_score': record['importance_score']
+             }
+
+         log.info(f"Computed importance for {len(importance)} files")
+
+         if write_back:
+             self._write_commit_importance(importance)
+
+         return importance
+
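The score above is commit_count * (1.0 + log(author_count + 1)). Assuming Kuzu's log() is the natural logarithm, a quick check of how the weighting behaves:

import math

def commit_importance(commit_count: int, author_count: int) -> float:
    return commit_count * (1.0 + math.log(author_count + 1))

print(round(commit_importance(50, 1), 1))   # 84.7   many commits, single owner
print(round(commit_importance(50, 5), 1))   # 139.6  same commits, broader ownership
print(round(commit_importance(5, 5), 1))    # 14.0   little activity scores low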
801
+ def _write_commit_importance(self, importance: Dict[str, Dict]):
802
+ """Write commit-based importance scores to database.
803
+
804
+ Args:
805
+ importance: Dictionary mapping file path to importance metrics
806
+ """
807
+ log.info("Writing commit importance scores to database...")
808
+ write_conn = get_connection()
809
+
810
+ with write_lock_context("commit_importance"):
811
+ for path, data in importance.items():
812
+ write_conn.execute_write("""
813
+ MATCH (f:File {path: $path})
814
+ SET f.commit_importance = $importance_score,
815
+ f.commit_count = $commit_count,
816
+ f.author_count = $author_count
817
+ """, {
818
+ "path": path,
819
+ "importance_score": data['importance_score'],
820
+ "commit_count": data['commit_count'],
821
+ "author_count": data['author_count']
822
+ })
823
+
824
+ log.info(f"Wrote {len(importance)} commit importance scores")
825
+
826
+ def get_top_commit_importance(
827
+ self,
828
+ limit: int = 20
829
+ ) -> List[Tuple[str, int, int, float]]:
830
+ """Get top files by commit-based importance.
831
+
832
+ Args:
833
+ limit: Number of results to return
834
+
835
+ Returns:
836
+ List of (file_path, commit_count, author_count, importance_score) tuples
837
+ """
838
+ results = self.connection.execute("""
839
+ MATCH (f:File)
840
+ WHERE f.commit_importance IS NOT NULL
841
+ RETURN f.path AS file_path,
842
+ f.commit_count AS commits,
843
+ f.author_count AS authors,
844
+ f.commit_importance AS score
845
+ ORDER BY f.commit_importance DESC
846
+ LIMIT $limit
847
+ """, {"limit": limit})
848
+
849
+ return [(r["file_path"], r["commits"], r["authors"], r["score"])
850
+ for r in results]
851
+
852
+ def compute_file_importance(
853
+ self,
854
+ days: int = 30,
855
+ limit: int = 50
856
+ ) -> List[Dict]:
857
+ """Compute file importance with recency weighting.
858
+
859
+ Args:
860
+ days: Time window in days for recency scoring
861
+ limit: Maximum number of files to return
862
+
863
+ Returns:
864
+ List of dicts with file importance metrics
865
+ """
866
+ log.info(f"Computing file importance with {days}-day recency window...")
867
+
868
+ try:
869
+ results = self.connection.execute("""
870
+ MATCH (f:File)
871
+ OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
872
+ WITH f, collect(c) AS commits
873
+ WHERE size(commits) > 0
874
+
875
+ UNWIND commits AS commit
876
+ OPTIONAL MATCH (commit)-[:AUTHORED_BY]->(a:Author)
877
+ WITH f, commits,
878
+ count(DISTINCT commit) AS commit_count,
879
+ count(DISTINCT a) AS author_count
880
+
881
+ WITH f, commit_count, author_count,
882
+ commit_count * (1.0 + log(author_count + 1)) AS base_importance
883
+
884
+ RETURN f.path AS file_path,
885
+ commit_count AS commits,
886
+ author_count AS authors,
887
+ base_importance AS importance_score
888
+ ORDER BY importance_score DESC
889
+ LIMIT $limit
890
+ """, {"limit": limit})
891
+
892
+ files = []
893
+ for record in results:
894
+ files.append({
895
+ 'file_path': record['file_path'],
896
+ 'commits': record['commits'],
897
+ 'authors': record['authors'],
898
+ 'recent_commits': 0, # Simplified - recency calculation complex in Kuzu
899
+ 'importance_score': record['importance_score']
900
+ })
901
+
902
+ log.info(f"Computed importance for {len(files)} files")
903
+ return files
904
+ except Exception as e:
905
+ log.warning(f"Failed to compute file importance: {e}")
906
+ return []
907
+
908
+ def compute_area_importance(
909
+ self,
910
+ depth: int = 2,
911
+ days: int = 30,
912
+ limit: int = 20
913
+ ) -> List[Dict]:
914
+ """Aggregate file importance by directory.
915
+
916
+ Args:
917
+ depth: Directory depth for grouping
918
+ days: Time window for recency scoring
919
+ limit: Maximum number of areas to return
920
+
921
+ Returns:
922
+ List of dicts with area importance metrics
923
+ """
924
+ log.info(f"Computing area importance at depth {depth}...")
925
+
926
+ # Get file importance first
927
+ files = self.compute_file_importance(days=days, limit=500)
928
+ if not files:
929
+ return []
930
+
931
+ # Find common prefix (repo root) to make paths relative
932
+ all_paths = [f['file_path'] for f in files]
933
+ if all_paths:
934
+ common = os.path.commonpath(all_paths)
935
+ if not os.path.isdir(common):
936
+ common = os.path.dirname(common)
937
+ repo_root = common.rstrip('/') + '/'
938
+ else:
939
+ repo_root = '/'
940
+
941
+ # Group by directory
942
+ areas = defaultdict(lambda: {
943
+ 'files': [],
944
+ 'total_commits': 0,
945
+ 'importance_sum': 0.0,
946
+ 'max_authors': 0,
947
+ 'recent_commits': 0,
948
+ })
949
+
950
+ total_recent = sum(f.get('recent_commits', 0) for f in files)
951
+
952
+ for f in files:
953
+ abs_path = f['file_path']
954
+
955
+ if abs_path.startswith(repo_root):
956
+ rel_path = abs_path[len(repo_root):]
957
+ else:
958
+ rel_path = abs_path
959
+
960
+ parts = rel_path.split('/')
961
+
962
+ if len(parts) > depth:
963
+ rel_area = '/'.join(parts[:depth]) + '/'
964
+ else:
965
+ rel_area = '/'.join(parts[:-1]) + '/' if len(parts) > 1 else '/'
966
+
967
+ areas[rel_area]['files'].append(f)
968
+ areas[rel_area]['total_commits'] += f['commits']
969
+ areas[rel_area]['importance_sum'] += f['importance_score']
970
+ # Track max authors seen in any file (approximation for unique authors)
971
+ areas[rel_area]['max_authors'] = max(
972
+ areas[rel_area]['max_authors'],
973
+ f.get('authors', 0)
974
+ )
975
+ areas[rel_area]['recent_commits'] += f.get('recent_commits', 0)
976
+
977
+ # Build result list
978
+ result_list = []
979
+ for rel_area, data in areas.items():
980
+ recent = data['recent_commits']
981
+ focus_pct = round(100 * recent / total_recent, 1) if total_recent > 0 else 0.0
982
+ result_list.append({
983
+ 'path': rel_area,
984
+ 'total_commits': data['total_commits'],
985
+ 'file_count': len(data['files']),
986
+ 'importance': data['importance_sum'],
987
+ 'unique_authors': data['max_authors'],
988
+ 'recent_commits': recent,
989
+ 'focus_pct': focus_pct,
990
+ })
991
+
992
+ result_list.sort(key=lambda x: x['importance'], reverse=True)
993
+ log.info(f"Computed importance for {len(result_list)} areas")
994
+ return result_list[:limit]
995
+
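The directory grouping above keeps only the first depth segments of each repo-relative file path. A small stand-alone version of that rule:

def area_for(rel_path: str, depth: int = 2) -> str:
    parts = rel_path.split('/')
    if len(parts) > depth:
        return '/'.join(parts[:depth]) + '/'
    return '/'.join(parts[:-1]) + '/' if len(parts) > 1 else '/'

print(area_for("emdash_core/agent/tools/search.py"))  # emdash_core/agent/
print(area_for("emdash_core/server.py"))              # emdash_core/
print(area_for("setup.py"))                           # /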
996
+ def _build_code_graph(
997
+ self,
998
+ directed: bool = True,
999
+ use_ast_nodes: bool = False
1000
+ ) -> nx.Graph:
1001
+ """Build NetworkX graph from Kuzu code entities.
1002
+
1003
+ Args:
1004
+ directed: Whether to create a directed graph
1005
+ use_ast_nodes: If True, use ASTNodes with CALLS/USES relationships
1006
+
1007
+ Returns:
1008
+ NetworkX graph
1009
+ """
1010
+ log.info(f"Building {'directed' if directed else 'undirected'} graph from Kuzu...")
1011
+
1012
+ graph = nx.DiGraph() if directed else nx.Graph()
1013
+
1014
+ if use_ast_nodes:
1015
+ # Use ASTNodes
1016
+ nodes = self.connection.execute("""
1017
+ MATCH (n:ASTNode)
1018
+ WHERE n.id IS NOT NULL
1019
+ AND NOT n.file_path CONTAINS 'venv/'
1020
+ AND NOT n.file_path CONTAINS 'node_modules/'
1021
+ RETURN n.id AS name, n.ast_type AS type
1022
+ """)
1023
+
1024
+ for record in nodes:
1025
+ graph.add_node(record["name"], entity_type=record["type"])
1026
+
1027
+ # Get edges (this might need adjustment based on actual schema)
1028
+ edges = self.connection.execute("""
1029
+ MATCH (a:ASTNode)-[r:CALLS]->(b:ASTNode)
1030
+ WHERE a.id IS NOT NULL AND b.id IS NOT NULL
1031
+ RETURN a.id AS source, b.id AS target, 'CALLS' AS rel_type
1032
+ """)
1033
+
1034
+ for record in edges:
1035
+ if record["source"] != record["target"]:
1036
+ graph.add_edge(
1037
+ record["source"],
1038
+ record["target"],
1039
+ relationship=record["rel_type"]
1040
+ )
1041
+ else:
1042
+ # Use Function/Class nodes
1043
+ func_nodes = self.connection.execute("""
1044
+ MATCH (n:Function)
1045
+ RETURN n.qualified_name AS name, 'Function' AS type
1046
+ """)
1047
+
1048
+ class_nodes = self.connection.execute("""
1049
+ MATCH (n:Class)
1050
+ RETURN n.qualified_name AS name, 'Class' AS type
1051
+ """)
1052
+
1053
+ for record in func_nodes + class_nodes:
1054
+ graph.add_node(record["name"], entity_type=record["type"])
1055
+
1056
+ # Get CALLS relationships
1057
+ call_edges = self.connection.execute("""
1058
+ MATCH (a:Function)-[:CALLS]->(b:Function)
1059
+ RETURN a.qualified_name AS source,
1060
+ b.qualified_name AS target,
1061
+ 'CALLS' AS rel_type
1062
+ """)
1063
+
1064
+ for record in call_edges:
1065
+ graph.add_edge(
1066
+ record["source"],
1067
+ record["target"],
1068
+ relationship=record["rel_type"]
1069
+ )
1070
+
1071
+ # Get INHERITS_FROM relationships
1072
+ inherit_edges = self.connection.execute("""
1073
+ MATCH (a:Class)-[:INHERITS_FROM]->(b:Class)
1074
+ RETURN a.qualified_name AS source,
1075
+ b.qualified_name AS target,
1076
+ 'INHERITS_FROM' AS rel_type
1077
+ """)
1078
+
1079
+ for record in inherit_edges:
1080
+ graph.add_edge(
1081
+ record["source"],
1082
+ record["target"],
1083
+ relationship=record["rel_type"]
1084
+ )
1085
+
1086
+ log.info(f"Built graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
1087
+ return graph
1088
+
1089
+ def _write_pagerank_scores(
1090
+ self,
1091
+ scores: Dict[str, float],
1092
+ use_ast_nodes: bool = False
1093
+ ):
1094
+ """Write PageRank scores back to database.
1095
+
1096
+ Args:
1097
+ scores: Dictionary mapping qualified_name to PageRank score
1098
+ use_ast_nodes: Whether scores are for ASTNodes
1099
+ """
1100
+ log.info("Writing PageRank scores to database...")
1101
+ write_conn = get_connection()
1102
+
1103
+ with write_lock_context("pagerank"):
1104
+ for name, value in scores.items():
1105
+ if use_ast_nodes:
1106
+ write_conn.execute_write("""
1107
+ MATCH (n:ASTNode {id: $name})
1108
+ SET n.pagerank = $value
1109
+ """, {"name": name, "value": value})
1110
+ else:
1111
+ # Try Function first, then Class
1112
+ write_conn.execute_write("""
1113
+ MATCH (n:Function {qualified_name: $name})
1114
+ SET n.pagerank = $value
1115
+ """, {"name": name, "value": value})
1116
+ write_conn.execute_write("""
1117
+ MATCH (n:Class {qualified_name: $name})
1118
+ SET n.pagerank = $value
1119
+ """, {"name": name, "value": value})
1120
+
1121
+ log.info(f"Wrote {len(scores)} PageRank scores")
1122
+
1123
+ def _write_betweenness_scores(self, scores: Dict[str, float]):
1124
+ """Write Betweenness Centrality scores back to database.
1125
+
1126
+ Args:
1127
+ scores: Dictionary mapping qualified_name to betweenness score
1128
+ """
1129
+ log.info("Writing Betweenness scores to database...")
1130
+ write_conn = get_connection()
1131
+
1132
+ with write_lock_context("betweenness"):
1133
+ for name, value in scores.items():
1134
+ write_conn.execute_write("""
1135
+ MATCH (n:Function {qualified_name: $name})
1136
+ SET n.betweenness = $value
1137
+ """, {"name": name, "value": value})
1138
+ write_conn.execute_write("""
1139
+ MATCH (n:Class {qualified_name: $name})
1140
+ SET n.betweenness = $value
1141
+ """, {"name": name, "value": value})
1142
+
1143
+ log.info(f"Wrote {len(scores)} Betweenness scores")
1144
+
1145
+ def _write_community_assignments(self, communities: Dict[str, int]):
1146
+ """Write community assignments back to database.
1147
+
1148
+ Args:
1149
+ communities: Dictionary mapping qualified_name to community ID
1150
+ """
1151
+ log.info("Writing community assignments to database...")
1152
+
1153
+ rows = [{"name": name, "community_id": comm_id} for name, comm_id in communities.items()]
1154
+ if not rows:
1155
+ log.info("No community assignments to write")
1156
+ return
1157
+
1158
+ write_conn = get_connection()
1159
+ with write_lock_context("communities"):
1160
+ write_conn.execute_write("""
1161
+ UNWIND $rows AS row
1162
+ MATCH (n:Function {qualified_name: row.name})
1163
+ SET n.community = row.community_id
1164
+ """, {"rows": rows})
1165
+ write_conn.execute_write("""
1166
+ UNWIND $rows AS row
1167
+ MATCH (n:Class {qualified_name: row.name})
1168
+ SET n.community = row.community_id
1169
+ """, {"rows": rows})
1170
+
1171
+ log.info(f"Wrote {len(communities)} community assignments")
1172
+
1173
+ def _ensure_community_nodes(self, community_ids: Iterable[int]) -> None:
1174
+ """Ensure community nodes exist for detected community IDs."""
1175
+ rows = [{"community_id": community_id} for community_id in community_ids]
1176
+ if not rows:
1177
+ return
1178
+
1179
+ write_conn = get_connection()
1180
+ with write_lock_context("community_nodes"):
1181
+ write_conn.execute_write("""
1182
+ UNWIND $rows AS row
1183
+ MERGE (c:Community {community_id: row.community_id})
1184
+ """, {"rows": rows})
1185
+
1186
+ def _get_community_ids(self) -> List[int]:
1187
+ """Fetch distinct community IDs from the graph."""
1188
+ func_results = self.connection.execute("""
1189
+ MATCH (n:Function)
1190
+ WHERE n.community IS NOT NULL
1191
+ RETURN DISTINCT n.community AS community_id
1192
+ """)
1193
+ class_results = self.connection.execute("""
1194
+ MATCH (n:Class)
1195
+ WHERE n.community IS NOT NULL
1196
+ RETURN DISTINCT n.community AS community_id
1197
+ """)
1198
+ community_ids = {row["community_id"] for row in func_results + class_results}
1199
+ return sorted(community_ids)
1200
+
1201
+ def _build_community_description_prompt(self, community_id: int, max_members: int = 20) -> str:
1202
+ """Build the LLM prompt for a community description."""
1203
+ func_results = self.connection.execute("""
1204
+ MATCH (n:Function)
1205
+ WHERE n.community = $community_id
1206
+ RETURN 'Function' AS type,
1207
+ n.name AS name,
1208
+ n.qualified_name AS qualified_name,
1209
+ n.file_path AS file_path,
1210
+ n.docstring AS docstring,
1211
+ n.pagerank AS pagerank
1212
+ ORDER BY n.pagerank DESC, n.name
1213
+ LIMIT $limit
1214
+ """, {"community_id": community_id, "limit": max_members})
1215
+ class_results = self.connection.execute("""
1216
+ MATCH (n:Class)
1217
+ WHERE n.community = $community_id
1218
+ RETURN 'Class' AS type,
1219
+ n.name AS name,
1220
+ n.qualified_name AS qualified_name,
1221
+ n.file_path AS file_path,
1222
+ n.docstring AS docstring,
1223
+ n.pagerank AS pagerank
1224
+ ORDER BY n.pagerank DESC, n.name
1225
+ LIMIT $limit
1226
+ """, {"community_id": community_id, "limit": max_members})
1227
+ results = func_results + class_results
1228
+ results.sort(key=lambda row: (row.get("pagerank") is None, -(row.get("pagerank") or 0), row.get("name") or ""))
1229
+ results = results[:max_members]
1230
+
1231
+ if not results:
1232
+ return ""
1233
+
1234
+ members = []
1235
+ file_paths = []
1236
+ seen_paths = set()
1237
+
1238
+ for row in results:
1239
+ name = row.get("qualified_name") or row.get("name") or "unknown"
1240
+ mtype = row.get("type") or "Entity"
1241
+ docstring = (row.get("docstring") or "").strip()
1242
+ if docstring:
1243
+ docstring = " ".join(docstring.split())
1244
+ if len(docstring) > 160:
1245
+ docstring = docstring[:157].rstrip() + "..."
1246
+ members.append(f"- {mtype}: {name} — {docstring}")
1247
+ else:
1248
+ members.append(f"- {mtype}: {name}")
1249
+
1250
+ file_path = row.get("file_path")
1251
+ if file_path:
1252
+ short_path = self._shorten_path(file_path)
1253
+ if short_path not in seen_paths:
1254
+ seen_paths.add(short_path)
1255
+ file_paths.append(short_path)
1256
+
1257
+ prompt_sections = [
1258
+ f"Community ID: {community_id}",
1259
+ "",
1260
+ "Key members:",
1261
+ *members,
1262
+ ]
1263
+
1264
+ if file_paths:
1265
+ prompt_sections.extend([
1266
+ "",
1267
+ "Representative files:",
1268
+ *[f"- {p}" for p in file_paths[:15]],
1269
+ ])
1270
+
1271
+ return "\n".join(prompt_sections)
1272
+
1273
+ def _shorten_path(self, path: str, max_parts: int = 3) -> str:
1274
+ """Shorten a file path to the last few segments."""
1275
+ parts = path.replace("\\", "/").split("/")
1276
+ if len(parts) <= max_parts:
1277
+ return "/".join(parts)
1278
+ return "/".join(parts[-max_parts:])
1279
+
1280
+ def _clean_description(self, description: str) -> str:
1281
+ """Normalize LLM output for storage."""
1282
+ cleaned_lines = [line.strip(" -\t") for line in description.splitlines() if line.strip()]
1283
+ cleaned = " ".join(cleaned_lines).strip()
1284
+ if len(cleaned) >= 2 and cleaned[0] == cleaned[-1] and cleaned[0] in {"'", '"'}:
1285
+ cleaned = cleaned[1:-1].strip()
1286
+ return cleaned
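
Taken together, a typical offline analytics pass over an already-ingested graph might look like the sketch below. It assumes the Kuzu database has been populated by the ingestion pipeline and that a writable connection is available:

from emdash_core.analytics.engine import AnalyticsEngine

engine = AnalyticsEngine(read_only=False)

# Structural metrics over the code graph.
engine.compute_pagerank(write_back=True)
engine.compute_betweenness_centrality(write_back=True)
engine.detect_communities(write_back=True, describe=False)

# Activity-based metrics from git history.
engine.compute_commit_importance(write_back=True)

# Inspect the results.
for name, kind, score in engine.get_top_pagerank(limit=10):
    print(f"{kind:8} {score:.4f}  {name}")

for summary in engine.get_communities_summary(max_members=3):
    print(summary["community_id"], summary["member_count"], summary["sample_members"])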