emdash-core 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. emdash_core/__init__.py +3 -0
  2. emdash_core/agent/__init__.py +37 -0
  3. emdash_core/agent/agents.py +225 -0
  4. emdash_core/agent/code_reviewer.py +476 -0
  5. emdash_core/agent/compaction.py +143 -0
  6. emdash_core/agent/context_manager.py +140 -0
  7. emdash_core/agent/events.py +338 -0
  8. emdash_core/agent/handlers.py +224 -0
  9. emdash_core/agent/inprocess_subagent.py +377 -0
  10. emdash_core/agent/mcp/__init__.py +50 -0
  11. emdash_core/agent/mcp/client.py +346 -0
  12. emdash_core/agent/mcp/config.py +302 -0
  13. emdash_core/agent/mcp/manager.py +496 -0
  14. emdash_core/agent/mcp/tool_factory.py +213 -0
  15. emdash_core/agent/prompts/__init__.py +38 -0
  16. emdash_core/agent/prompts/main_agent.py +104 -0
  17. emdash_core/agent/prompts/subagents.py +131 -0
  18. emdash_core/agent/prompts/workflow.py +136 -0
  19. emdash_core/agent/providers/__init__.py +34 -0
  20. emdash_core/agent/providers/base.py +143 -0
  21. emdash_core/agent/providers/factory.py +80 -0
  22. emdash_core/agent/providers/models.py +220 -0
  23. emdash_core/agent/providers/openai_provider.py +463 -0
  24. emdash_core/agent/providers/transformers_provider.py +217 -0
  25. emdash_core/agent/research/__init__.py +81 -0
  26. emdash_core/agent/research/agent.py +143 -0
  27. emdash_core/agent/research/controller.py +254 -0
  28. emdash_core/agent/research/critic.py +428 -0
  29. emdash_core/agent/research/macros.py +469 -0
  30. emdash_core/agent/research/planner.py +449 -0
  31. emdash_core/agent/research/researcher.py +436 -0
  32. emdash_core/agent/research/state.py +523 -0
  33. emdash_core/agent/research/synthesizer.py +594 -0
  34. emdash_core/agent/reviewer_profile.py +475 -0
  35. emdash_core/agent/rules.py +123 -0
  36. emdash_core/agent/runner.py +601 -0
  37. emdash_core/agent/session.py +262 -0
  38. emdash_core/agent/spec_schema.py +66 -0
  39. emdash_core/agent/specification.py +479 -0
  40. emdash_core/agent/subagent.py +397 -0
  41. emdash_core/agent/subagent_prompts.py +13 -0
  42. emdash_core/agent/toolkit.py +482 -0
  43. emdash_core/agent/toolkits/__init__.py +64 -0
  44. emdash_core/agent/toolkits/base.py +96 -0
  45. emdash_core/agent/toolkits/explore.py +47 -0
  46. emdash_core/agent/toolkits/plan.py +55 -0
  47. emdash_core/agent/tools/__init__.py +141 -0
  48. emdash_core/agent/tools/analytics.py +436 -0
  49. emdash_core/agent/tools/base.py +131 -0
  50. emdash_core/agent/tools/coding.py +484 -0
  51. emdash_core/agent/tools/github_mcp.py +592 -0
  52. emdash_core/agent/tools/history.py +13 -0
  53. emdash_core/agent/tools/modes.py +153 -0
  54. emdash_core/agent/tools/plan.py +206 -0
  55. emdash_core/agent/tools/plan_write.py +135 -0
  56. emdash_core/agent/tools/search.py +412 -0
  57. emdash_core/agent/tools/spec.py +341 -0
  58. emdash_core/agent/tools/task.py +262 -0
  59. emdash_core/agent/tools/task_output.py +204 -0
  60. emdash_core/agent/tools/tasks.py +454 -0
  61. emdash_core/agent/tools/traversal.py +588 -0
  62. emdash_core/agent/tools/web.py +179 -0
  63. emdash_core/analytics/__init__.py +5 -0
  64. emdash_core/analytics/engine.py +1286 -0
  65. emdash_core/api/__init__.py +5 -0
  66. emdash_core/api/agent.py +308 -0
  67. emdash_core/api/agents.py +154 -0
  68. emdash_core/api/analyze.py +264 -0
  69. emdash_core/api/auth.py +173 -0
  70. emdash_core/api/context.py +77 -0
  71. emdash_core/api/db.py +121 -0
  72. emdash_core/api/embed.py +131 -0
  73. emdash_core/api/feature.py +143 -0
  74. emdash_core/api/health.py +93 -0
  75. emdash_core/api/index.py +162 -0
  76. emdash_core/api/plan.py +110 -0
  77. emdash_core/api/projectmd.py +210 -0
  78. emdash_core/api/query.py +320 -0
  79. emdash_core/api/research.py +122 -0
  80. emdash_core/api/review.py +161 -0
  81. emdash_core/api/router.py +76 -0
  82. emdash_core/api/rules.py +116 -0
  83. emdash_core/api/search.py +119 -0
  84. emdash_core/api/spec.py +99 -0
  85. emdash_core/api/swarm.py +223 -0
  86. emdash_core/api/tasks.py +109 -0
  87. emdash_core/api/team.py +120 -0
  88. emdash_core/auth/__init__.py +17 -0
  89. emdash_core/auth/github.py +389 -0
  90. emdash_core/config.py +74 -0
  91. emdash_core/context/__init__.py +52 -0
  92. emdash_core/context/models.py +50 -0
  93. emdash_core/context/providers/__init__.py +11 -0
  94. emdash_core/context/providers/base.py +74 -0
  95. emdash_core/context/providers/explored_areas.py +183 -0
  96. emdash_core/context/providers/touched_areas.py +360 -0
  97. emdash_core/context/registry.py +73 -0
  98. emdash_core/context/reranker.py +199 -0
  99. emdash_core/context/service.py +260 -0
  100. emdash_core/context/session.py +352 -0
  101. emdash_core/core/__init__.py +104 -0
  102. emdash_core/core/config.py +454 -0
  103. emdash_core/core/exceptions.py +55 -0
  104. emdash_core/core/models.py +265 -0
  105. emdash_core/core/review_config.py +57 -0
  106. emdash_core/db/__init__.py +67 -0
  107. emdash_core/db/auth.py +134 -0
  108. emdash_core/db/models.py +91 -0
  109. emdash_core/db/provider.py +222 -0
  110. emdash_core/db/providers/__init__.py +5 -0
  111. emdash_core/db/providers/supabase.py +452 -0
  112. emdash_core/embeddings/__init__.py +24 -0
  113. emdash_core/embeddings/indexer.py +534 -0
  114. emdash_core/embeddings/models.py +192 -0
  115. emdash_core/embeddings/providers/__init__.py +7 -0
  116. emdash_core/embeddings/providers/base.py +112 -0
  117. emdash_core/embeddings/providers/fireworks.py +141 -0
  118. emdash_core/embeddings/providers/openai.py +104 -0
  119. emdash_core/embeddings/registry.py +146 -0
  120. emdash_core/embeddings/service.py +215 -0
  121. emdash_core/graph/__init__.py +26 -0
  122. emdash_core/graph/builder.py +134 -0
  123. emdash_core/graph/connection.py +692 -0
  124. emdash_core/graph/schema.py +416 -0
  125. emdash_core/graph/writer.py +667 -0
  126. emdash_core/ingestion/__init__.py +7 -0
  127. emdash_core/ingestion/change_detector.py +150 -0
  128. emdash_core/ingestion/git/__init__.py +5 -0
  129. emdash_core/ingestion/git/commit_analyzer.py +196 -0
  130. emdash_core/ingestion/github/__init__.py +6 -0
  131. emdash_core/ingestion/github/pr_fetcher.py +296 -0
  132. emdash_core/ingestion/github/task_extractor.py +100 -0
  133. emdash_core/ingestion/orchestrator.py +540 -0
  134. emdash_core/ingestion/parsers/__init__.py +10 -0
  135. emdash_core/ingestion/parsers/base_parser.py +66 -0
  136. emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
  137. emdash_core/ingestion/parsers/class_extractor.py +154 -0
  138. emdash_core/ingestion/parsers/function_extractor.py +202 -0
  139. emdash_core/ingestion/parsers/import_analyzer.py +119 -0
  140. emdash_core/ingestion/parsers/python_parser.py +123 -0
  141. emdash_core/ingestion/parsers/registry.py +72 -0
  142. emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
  143. emdash_core/ingestion/parsers/typescript_parser.py +278 -0
  144. emdash_core/ingestion/repository.py +346 -0
  145. emdash_core/models/__init__.py +38 -0
  146. emdash_core/models/agent.py +68 -0
  147. emdash_core/models/index.py +77 -0
  148. emdash_core/models/query.py +113 -0
  149. emdash_core/planning/__init__.py +7 -0
  150. emdash_core/planning/agent_api.py +413 -0
  151. emdash_core/planning/context_builder.py +265 -0
  152. emdash_core/planning/feature_context.py +232 -0
  153. emdash_core/planning/feature_expander.py +646 -0
  154. emdash_core/planning/llm_explainer.py +198 -0
  155. emdash_core/planning/similarity.py +509 -0
  156. emdash_core/planning/team_focus.py +821 -0
  157. emdash_core/server.py +153 -0
  158. emdash_core/sse/__init__.py +5 -0
  159. emdash_core/sse/stream.py +196 -0
  160. emdash_core/swarm/__init__.py +17 -0
  161. emdash_core/swarm/merge_agent.py +383 -0
  162. emdash_core/swarm/session_manager.py +274 -0
  163. emdash_core/swarm/swarm_runner.py +226 -0
  164. emdash_core/swarm/task_definition.py +137 -0
  165. emdash_core/swarm/worker_spawner.py +319 -0
  166. emdash_core/swarm/worktree_manager.py +278 -0
  167. emdash_core/templates/__init__.py +10 -0
  168. emdash_core/templates/defaults/agent-builder.md.template +82 -0
  169. emdash_core/templates/defaults/focus.md.template +115 -0
  170. emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
  171. emdash_core/templates/defaults/pr-review.md.template +80 -0
  172. emdash_core/templates/defaults/project.md.template +85 -0
  173. emdash_core/templates/defaults/research_critic.md.template +112 -0
  174. emdash_core/templates/defaults/research_planner.md.template +85 -0
  175. emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
  176. emdash_core/templates/defaults/reviewer.md.template +81 -0
  177. emdash_core/templates/defaults/spec.md.template +41 -0
  178. emdash_core/templates/defaults/tasks.md.template +78 -0
  179. emdash_core/templates/loader.py +296 -0
  180. emdash_core/utils/__init__.py +45 -0
  181. emdash_core/utils/git.py +84 -0
  182. emdash_core/utils/image.py +502 -0
  183. emdash_core/utils/logger.py +51 -0
  184. emdash_core-0.1.7.dist-info/METADATA +35 -0
  185. emdash_core-0.1.7.dist-info/RECORD +187 -0
  186. emdash_core-0.1.7.dist-info/WHEEL +4 -0
  187. emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,509 @@
1
+ """Semantic similarity search using Python-based vector operations."""
2
+
3
+ from typing import Optional
4
+ from datetime import datetime, timedelta
5
+
6
+ import numpy as np
7
+ from numpy.linalg import norm
8
+
9
+ from ..graph.connection import KuzuConnection, get_connection
10
+ from ..embeddings.service import EmbeddingService
11
+ from ..utils.logger import log
12
+
13
+
14
def cosine_similarity(vec1: list, vec2: list) -> float:
    """Compute cosine similarity between two vectors.

    Args:
        vec1: First vector (list or NumPy array of numbers).
        vec2: Second vector (list or NumPy array of numbers).

    Returns:
        Cosine similarity in the range -1 to 1. Returns 0.0 when either
        vector is None, empty, or has zero magnitude.

    Raises:
        ValueError: If the vectors have different lengths (raised by
            ``np.dot``).
    """
    # Explicit None check: truth-testing a multi-element NumPy array raises
    # ValueError, so ``not vec1`` is only safe for plain lists.
    if vec1 is None or vec2 is None:
        return 0.0

    # Convert to float so integer inputs cannot overflow in the dot product.
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    if a.size == 0 or b.size == 0:
        return 0.0

    # Compute each norm once. A zero product covers both a zero vector and
    # the underflow case, either of which would divide by zero.
    denominator = norm(a) * norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
31
+
32
+
33
class SimilaritySearch:
    """Vector similarity search using Python-based cosine similarity.

    Stored embeddings are fetched from the graph database and compared with
    the query embedding in Python. Each public search degrades to a
    case-insensitive substring search when embeddings cannot be used.
    """

    # Entity labels searched when the caller does not restrict the search.
    _DEFAULT_ENTITY_TYPES = ("Function", "Class")

    # Fetches every pull request that has a stored embedding.
    _PR_VECTOR_QUERY = """
        MATCH (pr:PullRequest)
        WHERE pr.embedding IS NOT NULL
        RETURN pr.number AS number,
               pr.title AS title,
               pr.description AS description,
               pr.author AS author,
               pr.state AS state,
               pr.labels AS labels,
               pr.files_changed AS files_changed,
               pr.created_at AS created_at,
               pr.embedding AS embedding
    """

    # Substring match on PR title/description, newest first.
    _PR_TEXT_QUERY = """
        MATCH (pr:PullRequest)
        WHERE lower(pr.title) CONTAINS lower($search_term)
           OR lower(pr.description) CONTAINS lower($search_term)
        RETURN pr.number AS number,
               pr.title AS title,
               pr.description AS description,
               pr.author AS author,
               pr.state AS state,
               pr.labels AS labels,
               pr.files_changed AS files_changed,
               pr.created_at AS created_at
        ORDER BY pr.created_at DESC
        LIMIT $limit
    """

    # Per-label queries returning code entities together with their
    # embeddings. Insertion order (Function, then Class) is the search order.
    _CODE_VECTOR_QUERIES = {
        "Function": """
            MATCH (f:Function)
            WHERE f.embedding IS NOT NULL
            RETURN 'Function' AS type,
                   f.name AS name,
                   f.qualified_name AS qualified_name,
                   f.docstring AS docstring,
                   f.file_path AS file_path,
                   f.embedding AS embedding
        """,
        "Class": """
            MATCH (c:Class)
            WHERE c.embedding IS NOT NULL
            RETURN 'Class' AS type,
                   c.name AS name,
                   c.qualified_name AS qualified_name,
                   c.docstring AS docstring,
                   c.file_path AS file_path,
                   c.embedding AS embedding
        """,
    }

    # Per-label substring queries used by the text-search fallback.
    _CODE_TEXT_QUERIES = {
        "Function": """
            MATCH (f:Function)
            WHERE lower(f.name) CONTAINS lower($search_term)
               OR lower(f.docstring) CONTAINS lower($search_term)
            RETURN 'Function' AS type,
                   f.name AS name,
                   f.qualified_name AS qualified_name,
                   f.docstring AS docstring,
                   f.file_path AS file_path
            LIMIT $limit
        """,
        "Class": """
            MATCH (c:Class)
            WHERE lower(c.name) CONTAINS lower($search_term)
               OR lower(c.docstring) CONTAINS lower($search_term)
            RETURN 'Class' AS type,
                   c.name AS name,
                   c.qualified_name AS qualified_name,
                   c.docstring AS docstring,
                   c.file_path AS file_path
            LIMIT $limit
        """,
    }

    # Commit/author based importance per file.
    _FILE_IMPORTANCE_QUERY = """
        MATCH (f:File)
        WHERE f.path IN $paths
        OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
        WITH f, collect(c) AS commits
        WHERE size(commits) > 0

        UNWIND commits AS commit
        OPTIONAL MATCH (commit)-[:AUTHORED_BY]->(a:Author)
        WITH f, commits,
             count(DISTINCT commit) AS commit_count,
             count(DISTINCT a) AS author_count

        WITH f, commit_count, author_count,
             commit_count * (1.0 + log(author_count + 1)) AS base_importance

        RETURN f.path AS path,
               base_importance AS importance_score,
               commit_count AS commits,
               author_count AS authors
    """

    # PageRank lookups by qualified name, functions first and then classes.
    _PAGERANK_QUERIES = (
        """
            MATCH (f:Function)
            WHERE f.qualified_name IN $names
            RETURN f.qualified_name AS name, f.pagerank AS pagerank
        """,
        """
            MATCH (c:Class)
            WHERE c.qualified_name IN $names
            RETURN c.qualified_name AS name, c.pagerank AS pagerank
        """,
    )

    def __init__(
        self,
        connection: Optional["KuzuConnection"] = None,
        embedding_service: Optional["EmbeddingService"] = None,
    ):
        """Initialize similarity search.

        Args:
            connection: Kuzu connection. If None, uses global connection.
            embedding_service: Embedding service. If None, creates new one.
        """
        self.connection = connection if connection is not None else get_connection()
        self.embedding_service = (
            embedding_service if embedding_service is not None else EmbeddingService()
        )

    @staticmethod
    def _has_vector(vector) -> bool:
        """Return True when *vector* is a non-empty list or array.

        A length check is used instead of truthiness because truth-testing a
        multi-element NumPy array raises ValueError.
        """
        return vector is not None and len(vector) > 0

    def _score_rows(
        self,
        rows: list[dict],
        query_embedding: list,
        min_score: float,
    ) -> list[dict]:
        """Score rows against the query embedding.

        Args:
            rows: Query rows, each carrying an ``embedding`` entry.
            query_embedding: Embedding of the search query.
            min_score: Rows scoring below this value are dropped.

        Returns:
            Copies of the qualifying rows without the ``embedding`` entry and
            with a ``score`` entry added. Order follows *rows*.
        """
        scored = []
        for row in rows:
            embedding = row.get("embedding")
            if not self._has_vector(embedding):
                continue
            similarity = cosine_similarity(query_embedding, embedding)
            if similarity >= min_score:
                # Drop the raw vector so callers get metadata only.
                result = {k: v for k, v in row.items() if k != "embedding"}
                result["score"] = similarity
                scored.append(result)
        return scored

    def find_similar_prs(
        self,
        query: str,
        limit: int = 5,
        min_score: float = 0.5,
    ) -> list[dict]:
        """Find PRs similar to a feature description.

        Falls back to text search when the embedding service is unavailable,
        when no query embedding can be produced, or when the vector search
        raises.

        Args:
            query: Feature description or search query
            limit: Maximum number of results
            min_score: Minimum similarity score a PR must reach

        Returns:
            List of similar PRs with metadata and scores, best match first
        """
        if not self.embedding_service.is_available:
            log.warning("OpenAI API not available. Falling back to text search.")
            return self._fallback_pr_search(query, limit)

        query_embedding = self.embedding_service.embed_query(query)
        if not self._has_vector(query_embedding):
            log.error("Failed to generate query embedding")
            # Same recovery as find_similar_code: degrade to text search
            # rather than returning nothing.
            return self._fallback_pr_search(query, limit)

        try:
            rows = self.connection.execute(self._PR_VECTOR_QUERY)
            scored = self._score_rows(rows, query_embedding, min_score)
        except Exception as e:
            log.warning(f"Vector search failed: {e}")
            return self._fallback_pr_search(query, limit)

        scored.sort(key=lambda item: item["score"], reverse=True)
        return scored[:limit]

    def find_similar_code(
        self,
        query: str,
        entity_types: Optional[list[str]] = None,
        limit: int = 10,
        min_score: float = 0.5,
    ) -> list[dict]:
        """Find code entities similar to a description.

        Falls back to text search when the embedding service is unavailable,
        when no query embedding can be produced, when the vector search
        raises, or when the vector search yields no result.

        Args:
            query: Feature description or search query
            entity_types: List of entity types to search (Function, Class).
                Defaults to both.
            limit: Maximum number of results
            min_score: Minimum similarity score an entity must reach

        Returns:
            List of similar code entities with metadata and scores, best
            match first
        """
        if entity_types is None:
            entity_types = list(self._DEFAULT_ENTITY_TYPES)

        if not self.embedding_service.is_available:
            log.warning("OpenAI API not available. Falling back to text search.")
            return self._fallback_code_search(query, entity_types, limit)

        query_embedding = self.embedding_service.embed_query(query)
        if not self._has_vector(query_embedding):
            log.error("Failed to generate query embedding")
            return self._fallback_code_search(query, entity_types, limit)

        results = []
        try:
            for label, cypher in self._CODE_VECTOR_QUERIES.items():
                if label not in entity_types:
                    continue
                rows = self.connection.execute(cypher)
                results.extend(self._score_rows(rows, query_embedding, min_score))
        except Exception as e:
            log.warning(f"Vector search failed: {e}")
            return self._fallback_code_search(query, entity_types, limit)

        if not results:
            log.info("No vector search results. Falling back to text search.")
            return self._fallback_code_search(query, entity_types, limit)

        results.sort(key=lambda item: item.get("score", 0), reverse=True)
        return results[:limit]

    def _fallback_pr_search(self, query: str, limit: int) -> list[dict]:
        """Fallback to text search when vector search is unavailable.

        Args:
            query: Search query
            limit: Maximum number of results

        Returns:
            List of matching PRs, each given a fixed score of 1.0
        """
        results = self.connection.execute(
            self._PR_TEXT_QUERY,
            {"search_term": query, "limit": limit},
        )

        # Text matches carry no similarity value; give them a default score
        # so the result shape matches the vector search.
        for row in results:
            row["score"] = 1.0

        return results

    def _fallback_code_search(
        self,
        query: str,
        entity_types: list[str],
        limit: int,
    ) -> list[dict]:
        """Fallback to text search when vector search is unavailable.

        Args:
            query: Search query
            entity_types: Entity types to search
            limit: Maximum number of results

        Returns:
            List of matching code entities, each given a fixed score of 1.0.
            Functions are listed before classes and the combined list is cut
            to *limit*.
        """
        results = []
        for label, cypher in self._CODE_TEXT_QUERIES.items():
            if label not in entity_types:
                continue
            matches = self.connection.execute(
                cypher,
                {"search_term": query, "limit": limit},
            )
            for row in matches:
                row["score"] = 1.0
            results.extend(matches)

        return results[:limit]

    def importance_weighted_search(
        self,
        query: str,
        entity_types: Optional[list[str]] = None,
        limit: int = 10,
        min_score: float = 0.3,
        days: int = 30,
        semantic_weight: float = 0.4,
        importance_weight: float = 0.35,
        pagerank_weight: float = 0.25,
    ) -> list[dict]:
        """Find code entities with importance-weighted ranking.

        Combines semantic similarity with file importance and PageRank
        to find code that is both relevant AND important to the team.

        Args:
            query: Feature description or search query
            entity_types: List of entity types to search (Function, Class).
                Defaults to both.
            limit: Maximum number of results
            min_score: Minimum semantic similarity score
            days: Time window for importance calculation; forwarded to
                ``_enrich_with_importance`` (see the note there)
            semantic_weight: Weight for semantic similarity (0-1)
            importance_weight: Weight for file importance (0-1)
            pagerank_weight: Weight for PageRank centrality (0-1)

        Returns:
            List of code entities with combined scores, sorted by relevance
        """
        if entity_types is None:
            entity_types = list(self._DEFAULT_ENTITY_TYPES)

        # Over-fetch so re-ranking can promote entities that are important
        # but not among the top semantic matches.
        candidates = self.find_similar_code(
            query=query,
            entity_types=entity_types,
            limit=limit * 3,
            min_score=min_score,
        )

        if not candidates:
            log.info("No semantic search results found")
            return []

        log.info(f"Found {len(candidates)} semantic candidates, fetching importance scores...")

        enriched = self._enrich_with_importance(candidates, days=days)
        ranked = self._compute_combined_scores(
            enriched,
            semantic_weight=semantic_weight,
            importance_weight=importance_weight,
            pagerank_weight=pagerank_weight,
        )
        ranked.sort(key=lambda item: item.get("combined_score", 0), reverse=True)

        log.info(f"Re-ranked results by combined score (semantic={semantic_weight}, importance={importance_weight}, pagerank={pagerank_weight})")

        return ranked[:limit]

    def _enrich_with_importance(
        self,
        candidates: list[dict],
        days: int = 30,
    ) -> list[dict]:
        """Enrich candidates with importance and PageRank scores.

        Both lookups are best-effort: a failing query is logged at debug
        level and the affected scores default to 0.

        NOTE(review): ``days`` is accepted but not applied. The importance
        query has no date filter, and the cutoff timestamp previously
        computed here was never passed to it. Honouring the window needs the
        GitCommit timestamp property name - confirm against the graph schema.

        Args:
            candidates: List of semantic search results
            days: Time window for importance calculation (currently unused)

        Returns:
            The same candidate dicts, updated in place with
            ``importance_score``, ``file_commits``, ``file_authors`` and
            ``pagerank_score``
        """
        # Collect distinct keys so each lookup is a single query.
        file_paths = {c["file_path"] for c in candidates if c.get("file_path")}
        qualified_names = {
            c["qualified_name"] for c in candidates if c.get("qualified_name")
        }

        file_importance = {}
        if file_paths:
            try:
                records = self.connection.execute(
                    self._FILE_IMPORTANCE_QUERY,
                    {"paths": list(file_paths)},
                )
                for record in records:
                    file_importance[record["path"]] = {
                        "importance_score": record["importance_score"] or 0,
                        "commits": record["commits"] or 0,
                        "authors": record["authors"] or 0,
                    }
            except Exception as e:
                log.debug(f"Failed to fetch file importance: {e}")

        pagerank_scores = {}
        if qualified_names:
            try:
                for cypher in self._PAGERANK_QUERIES:
                    records = self.connection.execute(
                        cypher,
                        {"names": list(qualified_names)},
                    )
                    for record in records:
                        if record.get("pagerank"):
                            pagerank_scores[record["name"]] = record["pagerank"]
            except Exception as e:
                log.debug(f"Failed to fetch PageRank scores: {e}")

        # Files without recorded commits (or with a failed lookup) score 0.
        no_history = {"importance_score": 0, "commits": 0, "authors": 0}
        for candidate in candidates:
            stats = file_importance.get(candidate.get("file_path", ""), no_history)
            candidate["importance_score"] = stats["importance_score"]
            candidate["file_commits"] = stats["commits"]
            candidate["file_authors"] = stats["authors"]
            candidate["pagerank_score"] = pagerank_scores.get(
                candidate.get("qualified_name", ""), 0
            )

        return candidates

    @staticmethod
    def _normalizer(candidates: list[dict], key: str) -> float:
        """Return the divisor used to scale *key* across candidates.

        This is the largest value of *key*, or 1 when that value is not
        positive. Dividing by a zero maximum would fail, and dividing by a
        negative one would invert the ranking.
        """
        peak = max((c.get(key) or 0) for c in candidates)
        return peak if peak > 0 else 1

    def _compute_combined_scores(
        self,
        candidates: list[dict],
        semantic_weight: float,
        importance_weight: float,
        pagerank_weight: float,
    ) -> list[dict]:
        """Compute combined scores for candidates.

        Each score type is divided by its maximum across the candidates and
        the scaled values are combined with the given weights.

        Args:
            candidates: Enriched candidates
            semantic_weight: Weight for semantic score
            importance_weight: Weight for importance score
            pagerank_weight: Weight for PageRank score

        Returns:
            The same candidate dicts, updated in place with
            ``combined_score`` plus the rounded ``norm_semantic``,
            ``norm_importance`` and ``norm_pagerank`` values
        """
        if not candidates:
            return candidates

        max_semantic = self._normalizer(candidates, "score")
        max_importance = self._normalizer(candidates, "importance_score")
        max_pagerank = self._normalizer(candidates, "pagerank_score")

        for candidate in candidates:
            # Missing or None scores count as 0.
            norm_semantic = (candidate.get("score") or 0) / max_semantic
            norm_importance = (candidate.get("importance_score") or 0) / max_importance
            norm_pagerank = (candidate.get("pagerank_score") or 0) / max_pagerank

            candidate["combined_score"] = (
                norm_semantic * semantic_weight
                + norm_importance * importance_weight
                + norm_pagerank * pagerank_weight
            )

            # Store normalized scores for debugging.
            candidate["norm_semantic"] = round(norm_semantic, 3)
            candidate["norm_importance"] = round(norm_importance, 3)
            candidate["norm_pagerank"] = round(norm_pagerank, 3)

        return candidates