emdash_core-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emdash_core/__init__.py +3 -0
- emdash_core/agent/__init__.py +37 -0
- emdash_core/agent/agents.py +225 -0
- emdash_core/agent/code_reviewer.py +476 -0
- emdash_core/agent/compaction.py +143 -0
- emdash_core/agent/context_manager.py +140 -0
- emdash_core/agent/events.py +338 -0
- emdash_core/agent/handlers.py +224 -0
- emdash_core/agent/inprocess_subagent.py +377 -0
- emdash_core/agent/mcp/__init__.py +50 -0
- emdash_core/agent/mcp/client.py +346 -0
- emdash_core/agent/mcp/config.py +302 -0
- emdash_core/agent/mcp/manager.py +496 -0
- emdash_core/agent/mcp/tool_factory.py +213 -0
- emdash_core/agent/prompts/__init__.py +38 -0
- emdash_core/agent/prompts/main_agent.py +104 -0
- emdash_core/agent/prompts/subagents.py +131 -0
- emdash_core/agent/prompts/workflow.py +136 -0
- emdash_core/agent/providers/__init__.py +34 -0
- emdash_core/agent/providers/base.py +143 -0
- emdash_core/agent/providers/factory.py +80 -0
- emdash_core/agent/providers/models.py +220 -0
- emdash_core/agent/providers/openai_provider.py +463 -0
- emdash_core/agent/providers/transformers_provider.py +217 -0
- emdash_core/agent/research/__init__.py +81 -0
- emdash_core/agent/research/agent.py +143 -0
- emdash_core/agent/research/controller.py +254 -0
- emdash_core/agent/research/critic.py +428 -0
- emdash_core/agent/research/macros.py +469 -0
- emdash_core/agent/research/planner.py +449 -0
- emdash_core/agent/research/researcher.py +436 -0
- emdash_core/agent/research/state.py +523 -0
- emdash_core/agent/research/synthesizer.py +594 -0
- emdash_core/agent/reviewer_profile.py +475 -0
- emdash_core/agent/rules.py +123 -0
- emdash_core/agent/runner.py +601 -0
- emdash_core/agent/session.py +262 -0
- emdash_core/agent/spec_schema.py +66 -0
- emdash_core/agent/specification.py +479 -0
- emdash_core/agent/subagent.py +397 -0
- emdash_core/agent/subagent_prompts.py +13 -0
- emdash_core/agent/toolkit.py +482 -0
- emdash_core/agent/toolkits/__init__.py +64 -0
- emdash_core/agent/toolkits/base.py +96 -0
- emdash_core/agent/toolkits/explore.py +47 -0
- emdash_core/agent/toolkits/plan.py +55 -0
- emdash_core/agent/tools/__init__.py +141 -0
- emdash_core/agent/tools/analytics.py +436 -0
- emdash_core/agent/tools/base.py +131 -0
- emdash_core/agent/tools/coding.py +484 -0
- emdash_core/agent/tools/github_mcp.py +592 -0
- emdash_core/agent/tools/history.py +13 -0
- emdash_core/agent/tools/modes.py +153 -0
- emdash_core/agent/tools/plan.py +206 -0
- emdash_core/agent/tools/plan_write.py +135 -0
- emdash_core/agent/tools/search.py +412 -0
- emdash_core/agent/tools/spec.py +341 -0
- emdash_core/agent/tools/task.py +262 -0
- emdash_core/agent/tools/task_output.py +204 -0
- emdash_core/agent/tools/tasks.py +454 -0
- emdash_core/agent/tools/traversal.py +588 -0
- emdash_core/agent/tools/web.py +179 -0
- emdash_core/analytics/__init__.py +5 -0
- emdash_core/analytics/engine.py +1286 -0
- emdash_core/api/__init__.py +5 -0
- emdash_core/api/agent.py +308 -0
- emdash_core/api/agents.py +154 -0
- emdash_core/api/analyze.py +264 -0
- emdash_core/api/auth.py +173 -0
- emdash_core/api/context.py +77 -0
- emdash_core/api/db.py +121 -0
- emdash_core/api/embed.py +131 -0
- emdash_core/api/feature.py +143 -0
- emdash_core/api/health.py +93 -0
- emdash_core/api/index.py +162 -0
- emdash_core/api/plan.py +110 -0
- emdash_core/api/projectmd.py +210 -0
- emdash_core/api/query.py +320 -0
- emdash_core/api/research.py +122 -0
- emdash_core/api/review.py +161 -0
- emdash_core/api/router.py +76 -0
- emdash_core/api/rules.py +116 -0
- emdash_core/api/search.py +119 -0
- emdash_core/api/spec.py +99 -0
- emdash_core/api/swarm.py +223 -0
- emdash_core/api/tasks.py +109 -0
- emdash_core/api/team.py +120 -0
- emdash_core/auth/__init__.py +17 -0
- emdash_core/auth/github.py +389 -0
- emdash_core/config.py +74 -0
- emdash_core/context/__init__.py +52 -0
- emdash_core/context/models.py +50 -0
- emdash_core/context/providers/__init__.py +11 -0
- emdash_core/context/providers/base.py +74 -0
- emdash_core/context/providers/explored_areas.py +183 -0
- emdash_core/context/providers/touched_areas.py +360 -0
- emdash_core/context/registry.py +73 -0
- emdash_core/context/reranker.py +199 -0
- emdash_core/context/service.py +260 -0
- emdash_core/context/session.py +352 -0
- emdash_core/core/__init__.py +104 -0
- emdash_core/core/config.py +454 -0
- emdash_core/core/exceptions.py +55 -0
- emdash_core/core/models.py +265 -0
- emdash_core/core/review_config.py +57 -0
- emdash_core/db/__init__.py +67 -0
- emdash_core/db/auth.py +134 -0
- emdash_core/db/models.py +91 -0
- emdash_core/db/provider.py +222 -0
- emdash_core/db/providers/__init__.py +5 -0
- emdash_core/db/providers/supabase.py +452 -0
- emdash_core/embeddings/__init__.py +24 -0
- emdash_core/embeddings/indexer.py +534 -0
- emdash_core/embeddings/models.py +192 -0
- emdash_core/embeddings/providers/__init__.py +7 -0
- emdash_core/embeddings/providers/base.py +112 -0
- emdash_core/embeddings/providers/fireworks.py +141 -0
- emdash_core/embeddings/providers/openai.py +104 -0
- emdash_core/embeddings/registry.py +146 -0
- emdash_core/embeddings/service.py +215 -0
- emdash_core/graph/__init__.py +26 -0
- emdash_core/graph/builder.py +134 -0
- emdash_core/graph/connection.py +692 -0
- emdash_core/graph/schema.py +416 -0
- emdash_core/graph/writer.py +667 -0
- emdash_core/ingestion/__init__.py +7 -0
- emdash_core/ingestion/change_detector.py +150 -0
- emdash_core/ingestion/git/__init__.py +5 -0
- emdash_core/ingestion/git/commit_analyzer.py +196 -0
- emdash_core/ingestion/github/__init__.py +6 -0
- emdash_core/ingestion/github/pr_fetcher.py +296 -0
- emdash_core/ingestion/github/task_extractor.py +100 -0
- emdash_core/ingestion/orchestrator.py +540 -0
- emdash_core/ingestion/parsers/__init__.py +10 -0
- emdash_core/ingestion/parsers/base_parser.py +66 -0
- emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
- emdash_core/ingestion/parsers/class_extractor.py +154 -0
- emdash_core/ingestion/parsers/function_extractor.py +202 -0
- emdash_core/ingestion/parsers/import_analyzer.py +119 -0
- emdash_core/ingestion/parsers/python_parser.py +123 -0
- emdash_core/ingestion/parsers/registry.py +72 -0
- emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
- emdash_core/ingestion/parsers/typescript_parser.py +278 -0
- emdash_core/ingestion/repository.py +346 -0
- emdash_core/models/__init__.py +38 -0
- emdash_core/models/agent.py +68 -0
- emdash_core/models/index.py +77 -0
- emdash_core/models/query.py +113 -0
- emdash_core/planning/__init__.py +7 -0
- emdash_core/planning/agent_api.py +413 -0
- emdash_core/planning/context_builder.py +265 -0
- emdash_core/planning/feature_context.py +232 -0
- emdash_core/planning/feature_expander.py +646 -0
- emdash_core/planning/llm_explainer.py +198 -0
- emdash_core/planning/similarity.py +509 -0
- emdash_core/planning/team_focus.py +821 -0
- emdash_core/server.py +153 -0
- emdash_core/sse/__init__.py +5 -0
- emdash_core/sse/stream.py +196 -0
- emdash_core/swarm/__init__.py +17 -0
- emdash_core/swarm/merge_agent.py +383 -0
- emdash_core/swarm/session_manager.py +274 -0
- emdash_core/swarm/swarm_runner.py +226 -0
- emdash_core/swarm/task_definition.py +137 -0
- emdash_core/swarm/worker_spawner.py +319 -0
- emdash_core/swarm/worktree_manager.py +278 -0
- emdash_core/templates/__init__.py +10 -0
- emdash_core/templates/defaults/agent-builder.md.template +82 -0
- emdash_core/templates/defaults/focus.md.template +115 -0
- emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
- emdash_core/templates/defaults/pr-review.md.template +80 -0
- emdash_core/templates/defaults/project.md.template +85 -0
- emdash_core/templates/defaults/research_critic.md.template +112 -0
- emdash_core/templates/defaults/research_planner.md.template +85 -0
- emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
- emdash_core/templates/defaults/reviewer.md.template +81 -0
- emdash_core/templates/defaults/spec.md.template +41 -0
- emdash_core/templates/defaults/tasks.md.template +78 -0
- emdash_core/templates/loader.py +296 -0
- emdash_core/utils/__init__.py +45 -0
- emdash_core/utils/git.py +84 -0
- emdash_core/utils/image.py +502 -0
- emdash_core/utils/logger.py +51 -0
- emdash_core-0.1.7.dist-info/METADATA +35 -0
- emdash_core-0.1.7.dist-info/RECORD +187 -0
- emdash_core-0.1.7.dist-info/WHEEL +4 -0
- emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
emdash_core/analytics/engine.py
@@ -0,0 +1,1286 @@
"""Analytics engine for computing graph metrics."""

import os
import networkx as nx
from typing import Dict, List, Tuple, Optional, Iterable
from datetime import datetime, timedelta
from collections import defaultdict
import math
import community as community_louvain  # python-louvain

from ..graph.connection import KuzuConnection, get_connection, get_read_connection, write_lock_context
from ..agent.providers import get_provider
from ..agent.providers.factory import DEFAULT_MODEL
from ..utils.logger import log


class AnalyticsEngine:
    """Computes graph analytics metrics on the knowledge graph."""

    def __init__(self, connection: KuzuConnection = None, read_only: bool = True):
        """Initialize analytics engine.

        Args:
            connection: Kuzu connection. If None, uses appropriate global connection.
            read_only: If True and no connection provided, use read-only connection.
        """
        if connection:
            self.connection = connection
        elif read_only:
            self.connection = get_read_connection()
        else:
            self.connection = get_connection()

    def compute_pagerank(
        self,
        damping: float = 0.85,
        max_iter: int = 100,
        write_back: bool = True,
        use_ast_nodes: bool = False
    ) -> Dict[str, float]:
        """Compute PageRank scores for all code entities.

        PageRank identifies the most "important" entities based on
        how many other entities reference/call them.

        Args:
            damping: Damping parameter (default 0.85)
            max_iter: Maximum iterations
            write_back: Whether to write scores back to database
            use_ast_nodes: If True, use ASTNodes (legacy). If False (default),
                use Function/Class nodes with CALLS relationships.

        Returns:
            Dictionary mapping entity qualified_name to PageRank score
        """
        log.info(f"Computing PageRank scores (use_ast_nodes={use_ast_nodes})...")

        # Build NetworkX graph from Kuzu
        graph = self._build_code_graph(use_ast_nodes=use_ast_nodes)

        if len(graph.nodes) == 0:
            log.warning("Graph is empty, cannot compute PageRank")
            return {}

        # Compute PageRank
        pagerank_scores = nx.pagerank(
            graph,
            alpha=damping,
            max_iter=max_iter
        )

        log.info(f"Computed PageRank for {len(pagerank_scores)} entities")

        # Write back to database
        if write_back:
            self._write_pagerank_scores(pagerank_scores, use_ast_nodes=use_ast_nodes)

        return pagerank_scores

    def compute_betweenness_centrality(
        self,
        normalized: bool = True,
        write_back: bool = True
    ) -> Dict[str, float]:
        """Compute Betweenness Centrality for all code entities.

        Betweenness identifies "bridge" entities that connect different
        parts of the codebase.

        Args:
            normalized: Whether to normalize scores
            write_back: Whether to write scores back to database

        Returns:
            Dictionary mapping entity qualified_name to betweenness score
        """
        log.info("Computing Betweenness Centrality...")

        # Build NetworkX graph from Kuzu
        graph = self._build_code_graph()

        if len(graph.nodes) == 0:
            log.warning("Graph is empty, cannot compute Betweenness")
            return {}

        # Compute Betweenness Centrality
        betweenness_scores = nx.betweenness_centrality(
            graph,
            normalized=normalized
        )

        log.info(f"Computed Betweenness for {len(betweenness_scores)} entities")

        # Write back to database
        if write_back:
            self._write_betweenness_scores(betweenness_scores)

        return betweenness_scores

    def detect_communities(
        self,
        resolution: float = 1.0,
        write_back: bool = True,
        describe: bool = False,
        describe_model: str = DEFAULT_MODEL,
        overwrite_descriptions: bool = False,
    ) -> Dict[str, int]:
        """Detect communities/clusters using Louvain algorithm.

        Identifies modules or clusters of tightly-coupled code entities.

        Args:
            resolution: Resolution parameter for Louvain algorithm
            write_back: Whether to write community IDs back to database
            describe: Whether to auto-generate community descriptions via LLM
            describe_model: LLM model to use for descriptions
            overwrite_descriptions: Whether to overwrite existing descriptions

        Returns:
            Dictionary mapping entity qualified_name to community ID
        """
        log.info("Detecting communities with Louvain algorithm...")

        # Build NetworkX graph from Kuzu (undirected for community detection)
        graph = self._build_code_graph(directed=False)

        if len(graph.nodes) == 0:
            log.warning("Graph is empty, cannot detect communities")
            return {}

        # Compute communities using Louvain
        communities = community_louvain.best_partition(
            graph,
            resolution=resolution
        )

        # Count communities
        num_communities = len(set(communities.values()))
        log.info(f"Detected {num_communities} communities")

        # Write back to database
        if write_back:
            self._write_community_assignments(communities)
            self._ensure_community_nodes(set(communities.values()))
            if describe:
                self.generate_community_descriptions(
                    community_ids=set(communities.values()),
                    model=describe_model,
                    overwrite=overwrite_descriptions,
                )

        return communities

    def get_top_pagerank(
        self,
        limit: int = 20,
        use_ast_nodes: bool = False
    ) -> List[Tuple[str, str, float]]:
        """Get top entities by PageRank score.

        Args:
            limit: Number of results to return
            use_ast_nodes: If True, query ASTNodes. If False, query Class/Function.

        Returns:
            List of (qualified_name, entity_type, score) tuples
        """
        if use_ast_nodes:
            results = self.connection.execute("""
                MATCH (n:ASTNode)
                WHERE n.pagerank IS NOT NULL
                RETURN n.id AS name,
                       n.ast_type AS type,
                       n.pagerank AS score
                ORDER BY n.pagerank DESC
                LIMIT $limit
            """, {"limit": limit})
        else:
            # Query functions and classes separately, then merge
            func_results = self.connection.execute("""
                MATCH (n:Function)
                WHERE n.pagerank IS NOT NULL
                RETURN n.qualified_name AS name,
                       'Function' AS type,
                       n.pagerank AS score
                ORDER BY n.pagerank DESC
                LIMIT $limit
            """, {"limit": limit})

            class_results = self.connection.execute("""
                MATCH (n:Class)
                WHERE n.pagerank IS NOT NULL
                RETURN n.qualified_name AS name,
                       'Class' AS type,
                       n.pagerank AS score
                ORDER BY n.pagerank DESC
                LIMIT $limit
            """, {"limit": limit})

            results = func_results + class_results
            results.sort(key=lambda x: x['score'], reverse=True)
            results = results[:limit]

        return [(r["name"], r["type"], r["score"]) for r in results]

    def get_top_betweenness(self, limit: int = 20) -> List[Tuple[str, str, float]]:
        """Get top entities by Betweenness Centrality.

        Args:
            limit: Number of results to return

        Returns:
            List of (qualified_name, entity_type, score) tuples
        """
        # Query functions and classes separately
        func_results = self.connection.execute("""
            MATCH (n:Function)
            WHERE n.betweenness IS NOT NULL
            RETURN n.qualified_name AS name,
                   'Function' AS type,
                   n.betweenness AS score
            ORDER BY n.betweenness DESC
            LIMIT $limit
        """, {"limit": limit})

        class_results = self.connection.execute("""
            MATCH (n:Class)
            WHERE n.betweenness IS NOT NULL
            RETURN n.qualified_name AS name,
                   'Class' AS type,
                   n.betweenness AS score
            ORDER BY n.betweenness DESC
            LIMIT $limit
        """, {"limit": limit})

        results = func_results + class_results
        results.sort(key=lambda x: x['score'], reverse=True)
        return [(r["name"], r["type"], r["score"]) for r in results[:limit]]

    def get_communities_summary(self, max_members: int = 5) -> List[Dict]:
        """Get summary of detected communities.

        Args:
            max_members: Maximum number of member names to return per community

        Returns:
            List of community summaries with member counts
        """
        descriptions = self._fetch_community_descriptions()

        # Query functions and classes with community assignments
        func_results = self.connection.execute("""
            MATCH (n:Function)
            WHERE n.community IS NOT NULL
            RETURN n.community AS community_id,
                   n.qualified_name AS qualified_name
        """)

        class_results = self.connection.execute("""
            MATCH (n:Class)
            WHERE n.community IS NOT NULL
            RETURN n.community AS community_id,
                   n.qualified_name AS qualified_name
        """)

        # Group by community
        communities = defaultdict(list)
        for r in func_results + class_results:
            communities[r['community_id']].append(r['qualified_name'])

        # Build summary
        summaries = []
        for community_id, members in communities.items():
            summaries.append({
                'community_id': community_id,
                'member_count': len(members),
                'sample_members': members[:max_members],
                'description': descriptions.get(community_id)
            })

        summaries.sort(key=lambda x: x['member_count'], reverse=True)
        return summaries

    def get_community_members(self, community_id: int) -> List[Dict]:
        """Get all members of a specific community.

        Args:
            community_id: The community ID to query

        Returns:
            List of member details (name, type, qualified_name)
        """
        func_results = self.connection.execute("""
            MATCH (n:Function)
            WHERE n.community = $community_id
            RETURN n.name AS name,
                   'Function' AS type,
                   n.qualified_name AS qualified_name,
                   n.file_path AS file_path
            ORDER BY n.name
        """, {"community_id": community_id})

        class_results = self.connection.execute("""
            MATCH (n:Class)
            WHERE n.community = $community_id
            RETURN n.name AS name,
                   'Class' AS type,
                   n.qualified_name AS qualified_name,
                   n.file_path AS file_path
            ORDER BY n.name
        """, {"community_id": community_id})

        return func_results + class_results

    def search_communities(
        self,
        query: Optional[str] = None,
        qualified_names: Optional[List[str]] = None,
        limit: int = 10,
        max_members: int = 5,
        min_score: float = 0.3,
    ) -> List[Dict]:
        """Search for communities by semantic similarity or member matching.

        Two search modes:
        1. Query mode: Search community descriptions by embedding similarity
        2. Member mode: Find communities containing specific code entities

        Args:
            query: Semantic search query for community descriptions
            qualified_names: List of code entity qualified names to find communities for
            limit: Maximum number of communities to return
            max_members: Maximum sample members per community
            min_score: Minimum similarity score for semantic search (0-1)

        Returns:
            List of community dicts with community_id, description, member_count, sample_members
        """
        if query:
            # Semantic search mode - search by community description embedding
            return self._search_communities_by_embedding(
                query=query,
                limit=limit,
                max_members=max_members,
                min_score=min_score,
            )
        elif qualified_names:
            # Member matching mode - find communities containing these entities
            return self._search_communities_by_members(
                qualified_names=qualified_names,
                limit=limit,
                max_members=max_members,
            )
        else:
            # No search criteria - return top communities by size
            return self.get_communities_summary(max_members=max_members)[:limit]

    def _search_communities_by_embedding(
        self,
        query: str,
        limit: int = 10,
        max_members: int = 5,
        min_score: float = 0.3,
    ) -> List[Dict]:
        """Search communities by embedding similarity on descriptions."""
        from ..embeddings.service import EmbeddingService
        from ..planning.similarity import cosine_similarity

        embedding_service = EmbeddingService()
        if not embedding_service.is_available:
            log.warning("Embedding service not available. Falling back to text search.")
            return self._fallback_community_text_search(query, limit, max_members)

        # Generate query embedding
        query_embedding = embedding_service.embed_query(query)
        if not query_embedding:
            log.error("Failed to generate query embedding")
            return []

        # Fetch all communities with embeddings
        communities = self.connection.execute("""
            MATCH (c:Community)
            WHERE c.embedding IS NOT NULL
            RETURN c.community_id AS community_id,
                   c.description AS description,
                   c.embedding AS embedding
        """)

        if not communities:
            log.warning("No communities with embeddings found. Run `emdash embed index` first.")
            return self._fallback_community_text_search(query, limit, max_members)

        # Compute similarity scores
        scored_communities = []
        for comm in communities:
            if comm.get('embedding'):
                similarity = cosine_similarity(query_embedding, comm['embedding'])
                if similarity >= min_score:
                    scored_communities.append({
                        'community_id': comm['community_id'],
                        'description': comm['description'],
                        'score': similarity,
                    })

        # Sort by score
        scored_communities.sort(key=lambda x: x['score'], reverse=True)
        scored_communities = scored_communities[:limit]

        # Enrich with member info
        for comm in scored_communities:
            members = self.get_community_members(comm['community_id'])
            comm['member_count'] = len(members)
            comm['sample_members'] = [m['qualified_name'] for m in members[:max_members]]

        return scored_communities

    def _search_communities_by_members(
        self,
        qualified_names: List[str],
        limit: int = 10,
        max_members: int = 5,
    ) -> List[Dict]:
        """Find communities containing the given code entities."""
        if not qualified_names:
            return []

        # Get community IDs for the given entities
        func_results = self.connection.execute("""
            MATCH (f:Function)
            WHERE f.qualified_name IN $names AND f.community IS NOT NULL
            RETURN f.community AS community_id, count(*) AS match_count
        """, {"names": qualified_names})

        class_results = self.connection.execute("""
            MATCH (c:Class)
            WHERE c.qualified_name IN $names AND c.community IS NOT NULL
            RETURN c.community AS community_id, count(*) AS match_count
        """, {"names": qualified_names})

        # Aggregate by community
        community_matches = {}
        for r in func_results + class_results:
            cid = r['community_id']
            community_matches[cid] = community_matches.get(cid, 0) + r['match_count']

        if not community_matches:
            return []

        # Sort by match count and fetch details
        sorted_ids = sorted(
            community_matches.keys(),
            key=lambda x: community_matches[x],
            reverse=True
        )[:limit]

        descriptions = self._fetch_community_descriptions()
        results = []
        for cid in sorted_ids:
            members = self.get_community_members(cid)
            results.append({
                'community_id': cid,
                'description': descriptions.get(cid),
                'member_count': len(members),
                'sample_members': [m['qualified_name'] for m in members[:max_members]],
                'match_count': community_matches[cid],
            })

        return results

    def _fallback_community_text_search(
        self,
        query: str,
        limit: int = 10,
        max_members: int = 5,
    ) -> List[Dict]:
        """Fallback to text-based community search."""
        results = self.connection.execute("""
            MATCH (c:Community)
            WHERE c.description IS NOT NULL
              AND lower(c.description) CONTAINS lower($query)
            RETURN c.community_id AS community_id,
                   c.description AS description
            LIMIT $limit
        """, {"query": query, "limit": limit})

        # Enrich with member info
        for comm in results:
            members = self.get_community_members(comm['community_id'])
            comm['member_count'] = len(members)
            comm['sample_members'] = [m['qualified_name'] for m in members[:max_members]]

        return results

    def get_community_description(self, community_id: int) -> Optional[str]:
        """Get a stored description for a community."""
        results = self.connection.execute("""
            MATCH (c:Community {community_id: $community_id})
            RETURN c.description AS description
        """, {"community_id": community_id})
        if not results:
            return None
        return results[0].get("description")

    def set_community_description(self, community_id: int, description: str, source: str = "manual") -> None:
        """Set or update the description for a community."""
        if description is None:
            return

        cleaned = description.strip()
        if not cleaned:
            return

        write_conn = get_connection()
        with write_lock_context("community_description"):
            write_conn.execute_write("""
                MERGE (c:Community {community_id: $community_id})
                SET c.description = $description,
                    c.source = $source
            """, {"community_id": community_id, "description": cleaned, "source": source})

    def generate_community_descriptions(
        self,
        community_ids: Optional[Iterable[int]] = None,
        model: str = DEFAULT_MODEL,
        overwrite: bool = False,
        max_members: int = 20,
        progress_callback: Optional[callable] = None,
        max_workers: int = 10,
    ) -> Dict[int, str]:
        """Generate and store LLM descriptions for communities.

        Args:
            community_ids: Specific community IDs to describe. If None, uses all.
            model: LLM model to use for generation.
            overwrite: Whether to overwrite existing descriptions.
            max_members: Max members to include in the prompt.
            progress_callback: Optional callback(current, total, community_id, description)
                called after each community is processed.
            max_workers: Maximum parallel LLM requests (default 10).
        """
        import threading
        from concurrent.futures import ThreadPoolExecutor, as_completed

        if community_ids is None:
            community_ids = self._get_community_ids()

        community_ids = list(community_ids)
        if not community_ids:
            return {}

        existing = self._fetch_community_descriptions()
        if not overwrite:
            community_ids = [cid for cid in community_ids if not existing.get(cid)]

        if not community_ids:
            return {}

        try:
            provider = get_provider(model)
        except Exception as e:
            log.warning(f"LLM provider not available for community descriptions: {e}")
            return {}

        generated = {}
        total = len(community_ids)
        completed = [0]  # Use list for mutable counter in closure
        lock = threading.Lock()

        def process_community(community_id: int) -> tuple[int, Optional[str]]:
            """Process a single community - returns (community_id, description or None)."""
            prompt = self._build_community_description_prompt(community_id, max_members=max_members)
            if not prompt:
                return (community_id, None)

            try:
                response = provider.chat(
                    messages=[{"role": "user", "content": prompt}],
                    system=(
                        "You are a senior engineer summarizing a code community. "
                        "Write 1-2 concise sentences describing what the community does. "
                        "Use plain text only. No bullets, no prefixes, no quotes."
                    ),
                )
            except Exception as e:
                log.warning(f"Failed to generate description for community {community_id}: {e}")
                return (community_id, None)

            description = self._clean_description(response.content or "")
            if not description:
                return (community_id, None)

            if len(description) > 400:
                description = description[:397].rstrip() + "..."

            return (community_id, description)

        log.info(f"Generating descriptions for {total} communities with {max_workers} workers...")

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(process_community, cid): cid for cid in community_ids}

            for future in as_completed(futures):
                community_id, description = future.result()

                with lock:
                    completed[0] += 1
                    current = completed[0]

                if description:
                    self.set_community_description(community_id, description, source="llm")
                    generated[community_id] = description

                if progress_callback:
                    progress_callback(current, total, community_id, description)

        log.info(f"Generated {len(generated)} community descriptions")
        return generated

    def _fetch_community_descriptions(self) -> Dict[int, Optional[str]]:
        """Fetch all stored community descriptions."""
        results = self.connection.execute("""
            MATCH (c:Community)
            RETURN c.community_id AS community_id,
                   c.description AS description
        """)
        return {row["community_id"]: row.get("description") for row in results}

    def detect_knowledge_silos(
        self,
        importance_threshold: float = 0.0001,
        max_authors: int = 2
    ) -> List[Dict]:
        """Detect knowledge silos - critical code with few maintainers.

        A knowledge silo is important code (high PageRank) that only
        1-2 people have worked on, creating a "bus factor" risk.

        Args:
            importance_threshold: Minimum PageRank score to consider
            max_authors: Maximum author count to flag as silo

        Returns:
            List of knowledge silos with risk scores
        """
        log.info("Detecting knowledge silos...")

        try:
            results = self.connection.execute("""
                MATCH (f:File)-[:CONTAINS_FUNCTION|CONTAINS_CLASS]->(entity)
                WHERE entity.pagerank IS NOT NULL
                  AND entity.pagerank >= $importance_threshold
                WITH f, max(entity.pagerank) AS importance,
                     collect(entity.qualified_name)[0] AS top_entity

                MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
                WITH f, importance, top_entity,
                     collect(DISTINCT c.author_email) AS authors
                WHERE size(authors) <= $max_authors

                WITH f, importance, top_entity, authors,
                     importance / size(authors) AS risk_score

                RETURN f.path AS file_path,
                       top_entity AS critical_entity,
                       importance,
                       size(authors) AS author_count,
                       authors,
                       risk_score
                ORDER BY risk_score DESC
            """, {
                "importance_threshold": importance_threshold,
                "max_authors": max_authors
            })

            silos = []
            for record in results:
                silos.append({
                    'file_path': record['file_path'],
                    'critical_entity': record['critical_entity'],
                    'importance': record['importance'],
                    'author_count': record['author_count'],
                    'authors': record['authors'],
                    'risk_score': record['risk_score']
                })

            log.info(f"Found {len(silos)} knowledge silos")
            return silos
        except Exception as e:
            log.warning(f"Failed to detect knowledge silos: {e}")
            return []

    def get_file_ownership(self, file_path: str) -> Dict:
        """Get detailed ownership information for a file.

        Args:
            file_path: Path to the file

        Returns:
            Dictionary with ownership statistics
        """
        results = self.connection.execute("""
            MATCH (f:File {path: $file_path})
            OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
            OPTIONAL MATCH (c)-[:AUTHORED_BY]->(a:Author)

            WITH f,
                 count(DISTINCT c) AS total_commits,
                 collect(DISTINCT a.email) AS authors,
                 collect(DISTINCT a.name) AS author_names

            RETURN f.path AS file_path,
                   total_commits,
                   size(authors) AS author_count,
                   authors,
                   author_names
        """, {"file_path": file_path})

        if not results:
            return {}

        record = results[0]
        return {
            'file_path': record['file_path'],
            'total_commits': record['total_commits'],
            'author_count': record['author_count'],
            'authors': record['authors'],
            'author_names': record['author_names']
        }

    def compute_commit_importance(
        self,
        write_back: bool = True
    ) -> Dict[str, Dict]:
        """Compute file importance based on commit activity.

        Files with more commits and more authors are considered more
        important as they are actively maintained and have broader ownership.

        Args:
            write_back: Whether to write scores back to database

        Returns:
            Dictionary mapping file path to importance metrics
        """
        log.info("Computing commit-based importance...")

        results = self.connection.execute("""
            MATCH (f:File)
            OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
            WITH f, count(DISTINCT c) AS commit_count
            OPTIONAL MATCH (c2:GitCommit)-[:COMMIT_MODIFIES]->(f)
            OPTIONAL MATCH (c2)-[:AUTHORED_BY]->(a:Author)
            WITH f, commit_count,
                 count(DISTINCT a) AS author_count,
                 collect(DISTINCT a.email)[0:5] AS top_authors
            WHERE commit_count > 0
            RETURN f.path AS file_path,
                   commit_count,
                   author_count,
                   top_authors,
                   commit_count * (1.0 + log(author_count + 1)) AS importance_score
            ORDER BY importance_score DESC
        """)

        importance = {}
        for record in results:
            importance[record['file_path']] = {
                'commit_count': record['commit_count'],
                'author_count': record['author_count'],
                'top_authors': record['top_authors'],
                'importance_score': record['importance_score']
            }

        log.info(f"Computed importance for {len(importance)} files")

        if write_back:
            self._write_commit_importance(importance)

        return importance

    def _write_commit_importance(self, importance: Dict[str, Dict]):
        """Write commit-based importance scores to database.

        Args:
            importance: Dictionary mapping file path to importance metrics
        """
        log.info("Writing commit importance scores to database...")
        write_conn = get_connection()

        with write_lock_context("commit_importance"):
            for path, data in importance.items():
                write_conn.execute_write("""
                    MATCH (f:File {path: $path})
                    SET f.commit_importance = $importance_score,
                        f.commit_count = $commit_count,
                        f.author_count = $author_count
                """, {
                    "path": path,
                    "importance_score": data['importance_score'],
                    "commit_count": data['commit_count'],
                    "author_count": data['author_count']
                })

        log.info(f"Wrote {len(importance)} commit importance scores")

    def get_top_commit_importance(
        self,
        limit: int = 20
    ) -> List[Tuple[str, int, int, float]]:
        """Get top files by commit-based importance.

        Args:
            limit: Number of results to return

        Returns:
            List of (file_path, commit_count, author_count, importance_score) tuples
        """
        results = self.connection.execute("""
            MATCH (f:File)
            WHERE f.commit_importance IS NOT NULL
            RETURN f.path AS file_path,
                   f.commit_count AS commits,
                   f.author_count AS authors,
                   f.commit_importance AS score
            ORDER BY f.commit_importance DESC
            LIMIT $limit
        """, {"limit": limit})

        return [(r["file_path"], r["commits"], r["authors"], r["score"])
                for r in results]

    def compute_file_importance(
        self,
        days: int = 30,
        limit: int = 50
    ) -> List[Dict]:
        """Compute file importance with recency weighting.

        Args:
            days: Time window in days for recency scoring
            limit: Maximum number of files to return

        Returns:
            List of dicts with file importance metrics
        """
        log.info(f"Computing file importance with {days}-day recency window...")

        try:
            results = self.connection.execute("""
                MATCH (f:File)
                OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
                WITH f, collect(c) AS commits
                WHERE size(commits) > 0

                UNWIND commits AS commit
                OPTIONAL MATCH (commit)-[:AUTHORED_BY]->(a:Author)
                WITH f, commits,
                     count(DISTINCT commit) AS commit_count,
                     count(DISTINCT a) AS author_count

                WITH f, commit_count, author_count,
                     commit_count * (1.0 + log(author_count + 1)) AS base_importance

                RETURN f.path AS file_path,
                       commit_count AS commits,
                       author_count AS authors,
                       base_importance AS importance_score
                ORDER BY importance_score DESC
                LIMIT $limit
            """, {"limit": limit})

            files = []
            for record in results:
                files.append({
                    'file_path': record['file_path'],
                    'commits': record['commits'],
                    'authors': record['authors'],
                    'recent_commits': 0,  # Simplified - recency calculation complex in Kuzu
                    'importance_score': record['importance_score']
                })

            log.info(f"Computed importance for {len(files)} files")
            return files
        except Exception as e:
            log.warning(f"Failed to compute file importance: {e}")
            return []

    def compute_area_importance(
        self,
        depth: int = 2,
        days: int = 30,
        limit: int = 20
    ) -> List[Dict]:
        """Aggregate file importance by directory.

        Args:
            depth: Directory depth for grouping
            days: Time window for recency scoring
            limit: Maximum number of areas to return

        Returns:
            List of dicts with area importance metrics
        """
        log.info(f"Computing area importance at depth {depth}...")

        # Get file importance first
        files = self.compute_file_importance(days=days, limit=500)
        if not files:
            return []

        # Find common prefix (repo root) to make paths relative
        all_paths = [f['file_path'] for f in files]
        if all_paths:
            common = os.path.commonpath(all_paths)
            if not os.path.isdir(common):
                common = os.path.dirname(common)
            repo_root = common.rstrip('/') + '/'
        else:
            repo_root = '/'

        # Group by directory
        areas = defaultdict(lambda: {
            'files': [],
            'total_commits': 0,
            'importance_sum': 0.0,
            'max_authors': 0,
            'recent_commits': 0,
        })

        total_recent = sum(f.get('recent_commits', 0) for f in files)

        for f in files:
            abs_path = f['file_path']

            if abs_path.startswith(repo_root):
                rel_path = abs_path[len(repo_root):]
            else:
                rel_path = abs_path

            parts = rel_path.split('/')

            if len(parts) > depth:
                rel_area = '/'.join(parts[:depth]) + '/'
            else:
                rel_area = '/'.join(parts[:-1]) + '/' if len(parts) > 1 else '/'

            areas[rel_area]['files'].append(f)
            areas[rel_area]['total_commits'] += f['commits']
            areas[rel_area]['importance_sum'] += f['importance_score']
            # Track max authors seen in any file (approximation for unique authors)
            areas[rel_area]['max_authors'] = max(
                areas[rel_area]['max_authors'],
                f.get('authors', 0)
            )
            areas[rel_area]['recent_commits'] += f.get('recent_commits', 0)

        # Build result list
        result_list = []
        for rel_area, data in areas.items():
            recent = data['recent_commits']
            focus_pct = round(100 * recent / total_recent, 1) if total_recent > 0 else 0.0
            result_list.append({
                'path': rel_area,
                'total_commits': data['total_commits'],
                'file_count': len(data['files']),
                'importance': data['importance_sum'],
                'unique_authors': data['max_authors'],
                'recent_commits': recent,
                'focus_pct': focus_pct,
            })

        result_list.sort(key=lambda x: x['importance'], reverse=True)
        log.info(f"Computed importance for {len(result_list)} areas")
        return result_list[:limit]

    def _build_code_graph(
        self,
        directed: bool = True,
        use_ast_nodes: bool = False
    ) -> nx.Graph:
        """Build NetworkX graph from Kuzu code entities.

        Args:
            directed: Whether to create a directed graph
            use_ast_nodes: If True, use ASTNodes with CALLS/USES relationships

        Returns:
            NetworkX graph
        """
        log.info(f"Building {'directed' if directed else 'undirected'} graph from Kuzu...")

        graph = nx.DiGraph() if directed else nx.Graph()

        if use_ast_nodes:
            # Use ASTNodes
            nodes = self.connection.execute("""
                MATCH (n:ASTNode)
                WHERE n.id IS NOT NULL
                  AND NOT n.file_path CONTAINS 'venv/'
                  AND NOT n.file_path CONTAINS 'node_modules/'
                RETURN n.id AS name, n.ast_type AS type
            """)

            for record in nodes:
                graph.add_node(record["name"], entity_type=record["type"])

            # Get edges (this might need adjustment based on actual schema)
            edges = self.connection.execute("""
                MATCH (a:ASTNode)-[r:CALLS]->(b:ASTNode)
                WHERE a.id IS NOT NULL AND b.id IS NOT NULL
                RETURN a.id AS source, b.id AS target, 'CALLS' AS rel_type
            """)

            for record in edges:
                if record["source"] != record["target"]:
                    graph.add_edge(
                        record["source"],
                        record["target"],
                        relationship=record["rel_type"]
                    )
        else:
            # Use Function/Class nodes
            func_nodes = self.connection.execute("""
                MATCH (n:Function)
                RETURN n.qualified_name AS name, 'Function' AS type
            """)

            class_nodes = self.connection.execute("""
                MATCH (n:Class)
                RETURN n.qualified_name AS name, 'Class' AS type
            """)

            for record in func_nodes + class_nodes:
                graph.add_node(record["name"], entity_type=record["type"])

            # Get CALLS relationships
            call_edges = self.connection.execute("""
                MATCH (a:Function)-[:CALLS]->(b:Function)
                RETURN a.qualified_name AS source,
                       b.qualified_name AS target,
                       'CALLS' AS rel_type
            """)

            for record in call_edges:
                graph.add_edge(
                    record["source"],
                    record["target"],
                    relationship=record["rel_type"]
                )

            # Get INHERITS_FROM relationships
            inherit_edges = self.connection.execute("""
                MATCH (a:Class)-[:INHERITS_FROM]->(b:Class)
                RETURN a.qualified_name AS source,
                       b.qualified_name AS target,
                       'INHERITS_FROM' AS rel_type
            """)

            for record in inherit_edges:
                graph.add_edge(
                    record["source"],
                    record["target"],
                    relationship=record["rel_type"]
                )

        log.info(f"Built graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
        return graph

    def _write_pagerank_scores(
        self,
        scores: Dict[str, float],
        use_ast_nodes: bool = False
    ):
        """Write PageRank scores back to database.

        Args:
            scores: Dictionary mapping qualified_name to PageRank score
            use_ast_nodes: Whether scores are for ASTNodes
        """
        log.info("Writing PageRank scores to database...")
        write_conn = get_connection()

        with write_lock_context("pagerank"):
            for name, value in scores.items():
                if use_ast_nodes:
                    write_conn.execute_write("""
                        MATCH (n:ASTNode {id: $name})
                        SET n.pagerank = $value
                    """, {"name": name, "value": value})
                else:
                    # Try Function first, then Class
                    write_conn.execute_write("""
                        MATCH (n:Function {qualified_name: $name})
                        SET n.pagerank = $value
                    """, {"name": name, "value": value})
                    write_conn.execute_write("""
                        MATCH (n:Class {qualified_name: $name})
                        SET n.pagerank = $value
                    """, {"name": name, "value": value})

        log.info(f"Wrote {len(scores)} PageRank scores")

    def _write_betweenness_scores(self, scores: Dict[str, float]):
        """Write Betweenness Centrality scores back to database.

        Args:
            scores: Dictionary mapping qualified_name to betweenness score
        """
        log.info("Writing Betweenness scores to database...")
        write_conn = get_connection()

        with write_lock_context("betweenness"):
            for name, value in scores.items():
                write_conn.execute_write("""
                    MATCH (n:Function {qualified_name: $name})
                    SET n.betweenness = $value
                """, {"name": name, "value": value})
                write_conn.execute_write("""
                    MATCH (n:Class {qualified_name: $name})
                    SET n.betweenness = $value
                """, {"name": name, "value": value})

        log.info(f"Wrote {len(scores)} Betweenness scores")

    def _write_community_assignments(self, communities: Dict[str, int]):
        """Write community assignments back to database.

        Args:
            communities: Dictionary mapping qualified_name to community ID
        """
        log.info("Writing community assignments to database...")

        rows = [{"name": name, "community_id": comm_id} for name, comm_id in communities.items()]
        if not rows:
            log.info("No community assignments to write")
            return

        write_conn = get_connection()
        with write_lock_context("communities"):
            write_conn.execute_write("""
                UNWIND $rows AS row
                MATCH (n:Function {qualified_name: row.name})
                SET n.community = row.community_id
            """, {"rows": rows})
            write_conn.execute_write("""
                UNWIND $rows AS row
                MATCH (n:Class {qualified_name: row.name})
                SET n.community = row.community_id
            """, {"rows": rows})

        log.info(f"Wrote {len(communities)} community assignments")

    def _ensure_community_nodes(self, community_ids: Iterable[int]) -> None:
        """Ensure community nodes exist for detected community IDs."""
        rows = [{"community_id": community_id} for community_id in community_ids]
        if not rows:
            return

        write_conn = get_connection()
        with write_lock_context("community_nodes"):
            write_conn.execute_write("""
                UNWIND $rows AS row
                MERGE (c:Community {community_id: row.community_id})
            """, {"rows": rows})

    def _get_community_ids(self) -> List[int]:
        """Fetch distinct community IDs from the graph."""
        func_results = self.connection.execute("""
            MATCH (n:Function)
            WHERE n.community IS NOT NULL
            RETURN DISTINCT n.community AS community_id
        """)
        class_results = self.connection.execute("""
            MATCH (n:Class)
            WHERE n.community IS NOT NULL
            RETURN DISTINCT n.community AS community_id
        """)
        community_ids = {row["community_id"] for row in func_results + class_results}
        return sorted(community_ids)

    def _build_community_description_prompt(self, community_id: int, max_members: int = 20) -> str:
        """Build the LLM prompt for a community description."""
        func_results = self.connection.execute("""
            MATCH (n:Function)
            WHERE n.community = $community_id
            RETURN 'Function' AS type,
                   n.name AS name,
                   n.qualified_name AS qualified_name,
                   n.file_path AS file_path,
                   n.docstring AS docstring,
                   n.pagerank AS pagerank
            ORDER BY n.pagerank DESC, n.name
            LIMIT $limit
        """, {"community_id": community_id, "limit": max_members})
        class_results = self.connection.execute("""
            MATCH (n:Class)
            WHERE n.community = $community_id
            RETURN 'Class' AS type,
                   n.name AS name,
                   n.qualified_name AS qualified_name,
                   n.file_path AS file_path,
                   n.docstring AS docstring,
                   n.pagerank AS pagerank
            ORDER BY n.pagerank DESC, n.name
            LIMIT $limit
        """, {"community_id": community_id, "limit": max_members})
        results = func_results + class_results
        results.sort(key=lambda row: (row.get("pagerank") is None, -(row.get("pagerank") or 0), row.get("name") or ""))
        results = results[:max_members]

        if not results:
            return ""

        members = []
        file_paths = []
        seen_paths = set()

        for row in results:
            name = row.get("qualified_name") or row.get("name") or "unknown"
            mtype = row.get("type") or "Entity"
            docstring = (row.get("docstring") or "").strip()
            if docstring:
                docstring = " ".join(docstring.split())
                if len(docstring) > 160:
                    docstring = docstring[:157].rstrip() + "..."
                members.append(f"- {mtype}: {name} — {docstring}")
            else:
                members.append(f"- {mtype}: {name}")

            file_path = row.get("file_path")
            if file_path:
                short_path = self._shorten_path(file_path)
                if short_path not in seen_paths:
                    seen_paths.add(short_path)
                    file_paths.append(short_path)

        prompt_sections = [
            f"Community ID: {community_id}",
            "",
            "Key members:",
            *members,
        ]

        if file_paths:
            prompt_sections.extend([
                "",
                "Representative files:",
                *[f"- {p}" for p in file_paths[:15]],
            ])

        return "\n".join(prompt_sections)

    def _shorten_path(self, path: str, max_parts: int = 3) -> str:
        """Shorten a file path to the last few segments."""
        parts = path.replace("\\", "/").split("/")
        if len(parts) <= max_parts:
            return "/".join(parts)
        return "/".join(parts[-max_parts:])

    def _clean_description(self, description: str) -> str:
        """Normalize LLM output for storage."""
        cleaned_lines = [line.strip(" -\t") for line in description.splitlines() if line.strip()]
        cleaned = " ".join(cleaned_lines).strip()
        if len(cleaned) >= 2 and cleaned[0] == cleaned[-1] and cleaned[0] in {"'", '"'}:
            cleaned = cleaned[1:-1].strip()
        return cleaned
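
Usage note (illustrative only, not part of the packaged files): the sketch below shows one way the AnalyticsEngine API defined above might be driven end to end. It assumes a Kuzu knowledge graph has already been populated by the emdash_core ingestion pipeline and that the class is importable as emdash_core.analytics.engine.AnalyticsEngine; method names, parameters, and return shapes follow the code in this diff.

from emdash_core.analytics.engine import AnalyticsEngine

# A read-only connection is enough for queries; the compute_* methods open
# their own write connection when write_back=True.
engine = AnalyticsEngine(read_only=True)

# Centrality over Function/Class nodes linked by CALLS / INHERITS_FROM edges.
pagerank_scores = engine.compute_pagerank(damping=0.85, max_iter=100, write_back=True)
betweenness_scores = engine.compute_betweenness_centrality(write_back=True)

# Louvain clustering on the undirected projection of the same graph.
communities = engine.detect_communities(resolution=1.0, write_back=True, describe=False)

# Top entities by PageRank: a list of (qualified_name, entity_type, score) tuples.
for name, entity_type, score in engine.get_top_pagerank(limit=10):
    print(f"{score:.5f}  {entity_type:<8}  {name}")

# Bus-factor risks: important files touched by at most two authors.
for silo in engine.detect_knowledge_silos(max_authors=2):
    print(silo["file_path"], silo["author_count"], round(silo["risk_score"], 5))

# Community lookup, either by semantic/text query or by member entities.
matches = engine.search_communities(query="embedding indexing and retrieval", limit=5)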