emdash_core-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emdash_core/__init__.py +3 -0
- emdash_core/agent/__init__.py +37 -0
- emdash_core/agent/agents.py +225 -0
- emdash_core/agent/code_reviewer.py +476 -0
- emdash_core/agent/compaction.py +143 -0
- emdash_core/agent/context_manager.py +140 -0
- emdash_core/agent/events.py +338 -0
- emdash_core/agent/handlers.py +224 -0
- emdash_core/agent/inprocess_subagent.py +377 -0
- emdash_core/agent/mcp/__init__.py +50 -0
- emdash_core/agent/mcp/client.py +346 -0
- emdash_core/agent/mcp/config.py +302 -0
- emdash_core/agent/mcp/manager.py +496 -0
- emdash_core/agent/mcp/tool_factory.py +213 -0
- emdash_core/agent/prompts/__init__.py +38 -0
- emdash_core/agent/prompts/main_agent.py +104 -0
- emdash_core/agent/prompts/subagents.py +131 -0
- emdash_core/agent/prompts/workflow.py +136 -0
- emdash_core/agent/providers/__init__.py +34 -0
- emdash_core/agent/providers/base.py +143 -0
- emdash_core/agent/providers/factory.py +80 -0
- emdash_core/agent/providers/models.py +220 -0
- emdash_core/agent/providers/openai_provider.py +463 -0
- emdash_core/agent/providers/transformers_provider.py +217 -0
- emdash_core/agent/research/__init__.py +81 -0
- emdash_core/agent/research/agent.py +143 -0
- emdash_core/agent/research/controller.py +254 -0
- emdash_core/agent/research/critic.py +428 -0
- emdash_core/agent/research/macros.py +469 -0
- emdash_core/agent/research/planner.py +449 -0
- emdash_core/agent/research/researcher.py +436 -0
- emdash_core/agent/research/state.py +523 -0
- emdash_core/agent/research/synthesizer.py +594 -0
- emdash_core/agent/reviewer_profile.py +475 -0
- emdash_core/agent/rules.py +123 -0
- emdash_core/agent/runner.py +601 -0
- emdash_core/agent/session.py +262 -0
- emdash_core/agent/spec_schema.py +66 -0
- emdash_core/agent/specification.py +479 -0
- emdash_core/agent/subagent.py +397 -0
- emdash_core/agent/subagent_prompts.py +13 -0
- emdash_core/agent/toolkit.py +482 -0
- emdash_core/agent/toolkits/__init__.py +64 -0
- emdash_core/agent/toolkits/base.py +96 -0
- emdash_core/agent/toolkits/explore.py +47 -0
- emdash_core/agent/toolkits/plan.py +55 -0
- emdash_core/agent/tools/__init__.py +141 -0
- emdash_core/agent/tools/analytics.py +436 -0
- emdash_core/agent/tools/base.py +131 -0
- emdash_core/agent/tools/coding.py +484 -0
- emdash_core/agent/tools/github_mcp.py +592 -0
- emdash_core/agent/tools/history.py +13 -0
- emdash_core/agent/tools/modes.py +153 -0
- emdash_core/agent/tools/plan.py +206 -0
- emdash_core/agent/tools/plan_write.py +135 -0
- emdash_core/agent/tools/search.py +412 -0
- emdash_core/agent/tools/spec.py +341 -0
- emdash_core/agent/tools/task.py +262 -0
- emdash_core/agent/tools/task_output.py +204 -0
- emdash_core/agent/tools/tasks.py +454 -0
- emdash_core/agent/tools/traversal.py +588 -0
- emdash_core/agent/tools/web.py +179 -0
- emdash_core/analytics/__init__.py +5 -0
- emdash_core/analytics/engine.py +1286 -0
- emdash_core/api/__init__.py +5 -0
- emdash_core/api/agent.py +308 -0
- emdash_core/api/agents.py +154 -0
- emdash_core/api/analyze.py +264 -0
- emdash_core/api/auth.py +173 -0
- emdash_core/api/context.py +77 -0
- emdash_core/api/db.py +121 -0
- emdash_core/api/embed.py +131 -0
- emdash_core/api/feature.py +143 -0
- emdash_core/api/health.py +93 -0
- emdash_core/api/index.py +162 -0
- emdash_core/api/plan.py +110 -0
- emdash_core/api/projectmd.py +210 -0
- emdash_core/api/query.py +320 -0
- emdash_core/api/research.py +122 -0
- emdash_core/api/review.py +161 -0
- emdash_core/api/router.py +76 -0
- emdash_core/api/rules.py +116 -0
- emdash_core/api/search.py +119 -0
- emdash_core/api/spec.py +99 -0
- emdash_core/api/swarm.py +223 -0
- emdash_core/api/tasks.py +109 -0
- emdash_core/api/team.py +120 -0
- emdash_core/auth/__init__.py +17 -0
- emdash_core/auth/github.py +389 -0
- emdash_core/config.py +74 -0
- emdash_core/context/__init__.py +52 -0
- emdash_core/context/models.py +50 -0
- emdash_core/context/providers/__init__.py +11 -0
- emdash_core/context/providers/base.py +74 -0
- emdash_core/context/providers/explored_areas.py +183 -0
- emdash_core/context/providers/touched_areas.py +360 -0
- emdash_core/context/registry.py +73 -0
- emdash_core/context/reranker.py +199 -0
- emdash_core/context/service.py +260 -0
- emdash_core/context/session.py +352 -0
- emdash_core/core/__init__.py +104 -0
- emdash_core/core/config.py +454 -0
- emdash_core/core/exceptions.py +55 -0
- emdash_core/core/models.py +265 -0
- emdash_core/core/review_config.py +57 -0
- emdash_core/db/__init__.py +67 -0
- emdash_core/db/auth.py +134 -0
- emdash_core/db/models.py +91 -0
- emdash_core/db/provider.py +222 -0
- emdash_core/db/providers/__init__.py +5 -0
- emdash_core/db/providers/supabase.py +452 -0
- emdash_core/embeddings/__init__.py +24 -0
- emdash_core/embeddings/indexer.py +534 -0
- emdash_core/embeddings/models.py +192 -0
- emdash_core/embeddings/providers/__init__.py +7 -0
- emdash_core/embeddings/providers/base.py +112 -0
- emdash_core/embeddings/providers/fireworks.py +141 -0
- emdash_core/embeddings/providers/openai.py +104 -0
- emdash_core/embeddings/registry.py +146 -0
- emdash_core/embeddings/service.py +215 -0
- emdash_core/graph/__init__.py +26 -0
- emdash_core/graph/builder.py +134 -0
- emdash_core/graph/connection.py +692 -0
- emdash_core/graph/schema.py +416 -0
- emdash_core/graph/writer.py +667 -0
- emdash_core/ingestion/__init__.py +7 -0
- emdash_core/ingestion/change_detector.py +150 -0
- emdash_core/ingestion/git/__init__.py +5 -0
- emdash_core/ingestion/git/commit_analyzer.py +196 -0
- emdash_core/ingestion/github/__init__.py +6 -0
- emdash_core/ingestion/github/pr_fetcher.py +296 -0
- emdash_core/ingestion/github/task_extractor.py +100 -0
- emdash_core/ingestion/orchestrator.py +540 -0
- emdash_core/ingestion/parsers/__init__.py +10 -0
- emdash_core/ingestion/parsers/base_parser.py +66 -0
- emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
- emdash_core/ingestion/parsers/class_extractor.py +154 -0
- emdash_core/ingestion/parsers/function_extractor.py +202 -0
- emdash_core/ingestion/parsers/import_analyzer.py +119 -0
- emdash_core/ingestion/parsers/python_parser.py +123 -0
- emdash_core/ingestion/parsers/registry.py +72 -0
- emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
- emdash_core/ingestion/parsers/typescript_parser.py +278 -0
- emdash_core/ingestion/repository.py +346 -0
- emdash_core/models/__init__.py +38 -0
- emdash_core/models/agent.py +68 -0
- emdash_core/models/index.py +77 -0
- emdash_core/models/query.py +113 -0
- emdash_core/planning/__init__.py +7 -0
- emdash_core/planning/agent_api.py +413 -0
- emdash_core/planning/context_builder.py +265 -0
- emdash_core/planning/feature_context.py +232 -0
- emdash_core/planning/feature_expander.py +646 -0
- emdash_core/planning/llm_explainer.py +198 -0
- emdash_core/planning/similarity.py +509 -0
- emdash_core/planning/team_focus.py +821 -0
- emdash_core/server.py +153 -0
- emdash_core/sse/__init__.py +5 -0
- emdash_core/sse/stream.py +196 -0
- emdash_core/swarm/__init__.py +17 -0
- emdash_core/swarm/merge_agent.py +383 -0
- emdash_core/swarm/session_manager.py +274 -0
- emdash_core/swarm/swarm_runner.py +226 -0
- emdash_core/swarm/task_definition.py +137 -0
- emdash_core/swarm/worker_spawner.py +319 -0
- emdash_core/swarm/worktree_manager.py +278 -0
- emdash_core/templates/__init__.py +10 -0
- emdash_core/templates/defaults/agent-builder.md.template +82 -0
- emdash_core/templates/defaults/focus.md.template +115 -0
- emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
- emdash_core/templates/defaults/pr-review.md.template +80 -0
- emdash_core/templates/defaults/project.md.template +85 -0
- emdash_core/templates/defaults/research_critic.md.template +112 -0
- emdash_core/templates/defaults/research_planner.md.template +85 -0
- emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
- emdash_core/templates/defaults/reviewer.md.template +81 -0
- emdash_core/templates/defaults/spec.md.template +41 -0
- emdash_core/templates/defaults/tasks.md.template +78 -0
- emdash_core/templates/loader.py +296 -0
- emdash_core/utils/__init__.py +45 -0
- emdash_core/utils/git.py +84 -0
- emdash_core/utils/image.py +502 -0
- emdash_core/utils/logger.py +51 -0
- emdash_core-0.1.7.dist-info/METADATA +35 -0
- emdash_core-0.1.7.dist-info/RECORD +187 -0
- emdash_core-0.1.7.dist-info/WHEEL +4 -0
- emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
emdash_core/planning/similarity.py
@@ -0,0 +1,509 @@
+"""Semantic similarity search using Python-based vector operations."""
+
+from typing import Optional
+from datetime import datetime, timedelta
+
+import numpy as np
+from numpy.linalg import norm
+
+from ..graph.connection import KuzuConnection, get_connection
+from ..embeddings.service import EmbeddingService
+from ..utils.logger import log
+
+
+def cosine_similarity(vec1: list, vec2: list) -> float:
+    """Compute cosine similarity between two vectors.
+
+    Args:
+        vec1: First vector
+        vec2: Second vector
+
+    Returns:
+        Cosine similarity score (0-1)
+    """
+    if not vec1 or not vec2:
+        return 0.0
+    a = np.array(vec1)
+    b = np.array(vec2)
+    if norm(a) == 0 or norm(b) == 0:
+        return 0.0
+    return float(np.dot(a, b) / (norm(a) * norm(b)))
+
+
+class SimilaritySearch:
+    """Vector similarity search using Python-based cosine similarity."""
+
+    def __init__(
+        self,
+        connection: Optional[KuzuConnection] = None,
+        embedding_service: Optional[EmbeddingService] = None,
+    ):
+        """Initialize similarity search.
+
+        Args:
+            connection: Kuzu connection. If None, uses global connection.
+            embedding_service: Embedding service. If None, creates new one.
+        """
+        self.connection = connection or get_connection()
+        self.embedding_service = embedding_service or EmbeddingService()
+
+    def find_similar_prs(
+        self,
+        query: str,
+        limit: int = 5,
+        min_score: float = 0.5,
+    ) -> list[dict]:
+        """Find PRs similar to a feature description.
+
+        Args:
+            query: Feature description or search query
+            limit: Maximum number of results
+            min_score: Minimum similarity score (0-1)
+
+        Returns:
+            List of similar PRs with metadata and scores
+        """
+        if not self.embedding_service.is_available:
+            log.warning("OpenAI API not available. Falling back to text search.")
+            return self._fallback_pr_search(query, limit)
+
+        # Generate query embedding
+        query_embedding = self.embedding_service.embed_query(query)
+        if not query_embedding:
+            log.error("Failed to generate query embedding")
+            return []
+
+        try:
+            # Fetch all PRs with embeddings from Kuzu
+            results = self.connection.execute("""
+                MATCH (pr:PullRequest)
+                WHERE pr.embedding IS NOT NULL
+                RETURN pr.number AS number,
+                       pr.title AS title,
+                       pr.description AS description,
+                       pr.author AS author,
+                       pr.state AS state,
+                       pr.labels AS labels,
+                       pr.files_changed AS files_changed,
+                       pr.created_at AS created_at,
+                       pr.embedding AS embedding
+            """)
+
+            # Compute cosine similarity in Python
+            scored_results = []
+            for row in results:
+                pr_embedding = row.get('embedding')
+                if pr_embedding:
+                    similarity = cosine_similarity(query_embedding, pr_embedding)
+                    if similarity >= min_score:
+                        result = {k: v for k, v in row.items() if k != 'embedding'}
+                        result['score'] = similarity
+                        scored_results.append(result)
+
+            # Sort by score and return top results
+            scored_results.sort(key=lambda x: x['score'], reverse=True)
+            return scored_results[:limit]
+
+        except Exception as e:
+            log.warning(f"Vector search failed: {e}")
+            return self._fallback_pr_search(query, limit)
+
+    def find_similar_code(
+        self,
+        query: str,
+        entity_types: list[str] = None,
+        limit: int = 10,
+        min_score: float = 0.5,
+    ) -> list[dict]:
+        """Find code entities similar to a description.
+
+        Args:
+            query: Feature description or search query
+            entity_types: List of entity types to search (Function, Class)
+            limit: Maximum number of results
+            min_score: Minimum similarity score (0-1)
+
+        Returns:
+            List of similar code entities with metadata and scores
+        """
+        if entity_types is None:
+            entity_types = ["Function", "Class"]
+
+        if not self.embedding_service.is_available:
+            log.warning("OpenAI API not available. Falling back to text search.")
+            return self._fallback_code_search(query, entity_types, limit)
+
+        # Generate query embedding
+        query_embedding = self.embedding_service.embed_query(query)
+        if not query_embedding:
+            log.error("Failed to generate query embedding")
+            return self._fallback_code_search(query, entity_types, limit)
+
+        results = []
+
+        try:
+            # Search functions
+            if "Function" in entity_types:
+                func_results = self.connection.execute("""
+                    MATCH (f:Function)
+                    WHERE f.embedding IS NOT NULL
+                    RETURN 'Function' AS type,
+                           f.name AS name,
+                           f.qualified_name AS qualified_name,
+                           f.docstring AS docstring,
+                           f.file_path AS file_path,
+                           f.embedding AS embedding
+                """)
+
+                for row in func_results:
+                    func_embedding = row.get('embedding')
+                    if func_embedding:
+                        similarity = cosine_similarity(query_embedding, func_embedding)
+                        if similarity >= min_score:
+                            result = {k: v for k, v in row.items() if k != 'embedding'}
+                            result['score'] = similarity
+                            results.append(result)
+
+            # Search classes
+            if "Class" in entity_types:
+                class_results = self.connection.execute("""
+                    MATCH (c:Class)
+                    WHERE c.embedding IS NOT NULL
+                    RETURN 'Class' AS type,
+                           c.name AS name,
+                           c.qualified_name AS qualified_name,
+                           c.docstring AS docstring,
+                           c.file_path AS file_path,
+                           c.embedding AS embedding
+                """)
+
+                for row in class_results:
+                    class_embedding = row.get('embedding')
+                    if class_embedding:
+                        similarity = cosine_similarity(query_embedding, class_embedding)
+                        if similarity >= min_score:
+                            result = {k: v for k, v in row.items() if k != 'embedding'}
+                            result['score'] = similarity
+                            results.append(result)
+
+        except Exception as e:
+            log.warning(f"Vector search failed: {e}")
+            return self._fallback_code_search(query, entity_types, limit)
+
+        # If no results from vector search, fall back to text search
+        if not results:
+            log.info("No vector search results. Falling back to text search.")
+            return self._fallback_code_search(query, entity_types, limit)
+
+        # Sort by score and limit
+        results.sort(key=lambda x: x.get("score", 0), reverse=True)
+        return results[:limit]
+
+    def _fallback_pr_search(self, query: str, limit: int) -> list[dict]:
+        """Fallback to text search when vector search is unavailable.
+
+        Args:
+            query: Search query
+            limit: Maximum number of results
+
+        Returns:
+            List of matching PRs
+        """
+        # Use CONTAINS for simple text matching
+        results = self.connection.execute("""
+            MATCH (pr:PullRequest)
+            WHERE lower(pr.title) CONTAINS lower($search_term)
+               OR lower(pr.description) CONTAINS lower($search_term)
+            RETURN pr.number AS number,
+                   pr.title AS title,
+                   pr.description AS description,
+                   pr.author AS author,
+                   pr.state AS state,
+                   pr.labels AS labels,
+                   pr.files_changed AS files_changed,
+                   pr.created_at AS created_at
+            ORDER BY pr.created_at DESC
+            LIMIT $limit
+        """, {"search_term": query, "limit": limit})
+
+        # Add default score
+        for r in results:
+            r['score'] = 1.0
+
+        return results
+
+    def _fallback_code_search(
+        self,
+        query: str,
+        entity_types: list[str],
+        limit: int,
+    ) -> list[dict]:
+        """Fallback to text search when vector search is unavailable.
+
+        Args:
+            query: Search query
+            entity_types: Entity types to search
+            limit: Maximum number of results
+
+        Returns:
+            List of matching code entities
+        """
+        results = []
+
+        if "Function" in entity_types:
+            func_results = self.connection.execute("""
+                MATCH (f:Function)
+                WHERE lower(f.name) CONTAINS lower($search_term)
+                   OR lower(f.docstring) CONTAINS lower($search_term)
+                RETURN 'Function' AS type,
+                       f.name AS name,
+                       f.qualified_name AS qualified_name,
+                       f.docstring AS docstring,
+                       f.file_path AS file_path
+                LIMIT $limit
+            """, {"search_term": query, "limit": limit})
+
+            for r in func_results:
+                r['score'] = 1.0
+            results.extend(func_results)
+
+        if "Class" in entity_types:
+            class_results = self.connection.execute("""
+                MATCH (c:Class)
+                WHERE lower(c.name) CONTAINS lower($search_term)
+                   OR lower(c.docstring) CONTAINS lower($search_term)
+                RETURN 'Class' AS type,
+                       c.name AS name,
+                       c.qualified_name AS qualified_name,
+                       c.docstring AS docstring,
+                       c.file_path AS file_path
+                LIMIT $limit
+            """, {"search_term": query, "limit": limit})
+
+            for r in class_results:
+                r['score'] = 1.0
+            results.extend(class_results)
+
+        return results[:limit]
+
+    def importance_weighted_search(
+        self,
+        query: str,
+        entity_types: list[str] = None,
+        limit: int = 10,
+        min_score: float = 0.3,
+        days: int = 30,
+        semantic_weight: float = 0.4,
+        importance_weight: float = 0.35,
+        pagerank_weight: float = 0.25,
+    ) -> list[dict]:
+        """Find code entities with importance-weighted ranking.
+
+        Combines semantic similarity with file importance and PageRank
+        to find code that is both relevant AND important to the team.
+
+        Args:
+            query: Feature description or search query
+            entity_types: List of entity types to search (Function, Class)
+            limit: Maximum number of results
+            min_score: Minimum semantic similarity score (0-1)
+            days: Time window for importance calculation
+            semantic_weight: Weight for semantic similarity (0-1)
+            importance_weight: Weight for file importance (0-1)
+            pagerank_weight: Weight for PageRank centrality (0-1)
+
+        Returns:
+            List of code entities with combined scores, sorted by relevance
+        """
+        if entity_types is None:
+            entity_types = ["Function", "Class"]
+
+        # Get more candidates than needed for re-ranking
+        candidates = self.find_similar_code(
+            query=query,
+            entity_types=entity_types,
+            limit=limit * 3,  # Get extra for re-ranking
+            min_score=min_score,
+        )
+
+        if not candidates:
+            log.info("No semantic search results found")
+            return []
+
+        log.info(f"Found {len(candidates)} semantic candidates, fetching importance scores...")
+
+        # Fetch importance and PageRank scores for candidates
+        enriched = self._enrich_with_importance(candidates, days=days)
+
+        # Normalize and combine scores
+        ranked = self._compute_combined_scores(
+            enriched,
+            semantic_weight=semantic_weight,
+            importance_weight=importance_weight,
+            pagerank_weight=pagerank_weight,
+        )
+
+        # Sort by combined score
+        ranked.sort(key=lambda x: x.get("combined_score", 0), reverse=True)
+
+        log.info(f"Re-ranked results by combined score (semantic={semantic_weight}, importance={importance_weight}, pagerank={pagerank_weight})")
+
+        return ranked[:limit]
+
+    def _enrich_with_importance(
+        self,
+        candidates: list[dict],
+        days: int = 30,
+    ) -> list[dict]:
+        """Enrich candidates with importance and PageRank scores.
+
+        Args:
+            candidates: List of semantic search results
+            days: Time window for importance calculation
+
+        Returns:
+            Candidates enriched with importance_score and pagerank_score
+        """
+        cutoff_date = datetime.now() - timedelta(days=days)
+        cutoff_str = cutoff_date.strftime("%Y-%m-%dT%H:%M:%S")
+
+        # Group by file path for efficient querying
+        file_paths = set()
+        qualified_names = set()
+        for c in candidates:
+            if c.get("file_path"):
+                file_paths.add(c["file_path"])
+            if c.get("qualified_name"):
+                qualified_names.add(c["qualified_name"])
+
+        # Fetch file importance scores
+        file_importance = {}
+        if file_paths:
+            try:
+                results = self.connection.execute("""
+                    MATCH (f:File)
+                    WHERE f.path IN $paths
+                    OPTIONAL MATCH (c:GitCommit)-[:COMMIT_MODIFIES]->(f)
+                    WITH f, collect(c) AS commits
+                    WHERE size(commits) > 0
+
+                    UNWIND commits AS commit
+                    OPTIONAL MATCH (commit)-[:AUTHORED_BY]->(a:Author)
+                    WITH f, commits,
+                         count(DISTINCT commit) AS commit_count,
+                         count(DISTINCT a) AS author_count
+
+                    WITH f, commit_count, author_count,
+                         commit_count * (1.0 + log(author_count + 1)) AS base_importance
+
+                    RETURN f.path AS path,
+                           base_importance AS importance_score,
+                           commit_count AS commits,
+                           author_count AS authors
+                """, {"paths": list(file_paths)})
+
+                for record in results:
+                    file_importance[record["path"]] = {
+                        "importance_score": record["importance_score"] or 0,
+                        "commits": record["commits"] or 0,
+                        "authors": record["authors"] or 0,
+                    }
+            except Exception as e:
+                log.debug(f"Failed to fetch file importance: {e}")
+
+        # Fetch PageRank scores for entities
+        pagerank_scores = {}
+        if qualified_names:
+            try:
+                # Query functions
+                func_results = self.connection.execute("""
+                    MATCH (f:Function)
+                    WHERE f.qualified_name IN $names
+                    RETURN f.qualified_name AS name, f.pagerank AS pagerank
+                """, {"names": list(qualified_names)})
+
+                for record in func_results:
+                    if record.get("pagerank"):
+                        pagerank_scores[record["name"]] = record["pagerank"]
+
+                # Query classes
+                class_results = self.connection.execute("""
+                    MATCH (c:Class)
+                    WHERE c.qualified_name IN $names
+                    RETURN c.qualified_name AS name, c.pagerank AS pagerank
+                """, {"names": list(qualified_names)})
+
+                for record in class_results:
+                    if record.get("pagerank"):
+                        pagerank_scores[record["name"]] = record["pagerank"]
+            except Exception as e:
+                log.debug(f"Failed to fetch PageRank scores: {e}")
+
+        # Enrich candidates
+        for c in candidates:
+            file_path = c.get("file_path", "")
+            qualified_name = c.get("qualified_name", "")
+
+            # Add file importance
+            if file_path in file_importance:
+                fi = file_importance[file_path]
+                c["importance_score"] = fi["importance_score"]
+                c["file_commits"] = fi["commits"]
+                c["file_authors"] = fi["authors"]
+            else:
+                c["importance_score"] = 0
+                c["file_commits"] = 0
+                c["file_authors"] = 0
+
+            # Add PageRank
+            c["pagerank_score"] = pagerank_scores.get(qualified_name, 0)
+
+        return candidates
+
+    def _compute_combined_scores(
+        self,
+        candidates: list[dict],
+        semantic_weight: float,
+        importance_weight: float,
+        pagerank_weight: float,
+    ) -> list[dict]:
+        """Compute combined scores for candidates.
+
+        Normalizes each score type and combines them with weights.
+
+        Args:
+            candidates: Enriched candidates
+            semantic_weight: Weight for semantic score
+            importance_weight: Weight for importance score
+            pagerank_weight: Weight for PageRank score
+
+        Returns:
+            Candidates with combined_score added
+        """
+        if not candidates:
+            return candidates
+
+        # Find max values for normalization
+        max_semantic = max(c.get("score", 0) for c in candidates) or 1
+        max_importance = max(c.get("importance_score", 0) for c in candidates) or 1
+        max_pagerank = max(c.get("pagerank_score", 0) for c in candidates) or 1
+
+        for c in candidates:
+            # Normalize to 0-1
+            norm_semantic = c.get("score", 0) / max_semantic
+            norm_importance = c.get("importance_score", 0) / max_importance
+            norm_pagerank = c.get("pagerank_score", 0) / max_pagerank
+
+            # Compute combined score
+            c["combined_score"] = (
+                norm_semantic * semantic_weight +
+                norm_importance * importance_weight +
+                norm_pagerank * pagerank_weight
+            )
+
+            # Store normalized scores for debugging
+            c["norm_semantic"] = round(norm_semantic, 3)
+            c["norm_importance"] = round(norm_importance, 3)
+            c["norm_pagerank"] = round(norm_pagerank, 3)
+
+        return candidates