emdash_core-0.1.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emdash_core/__init__.py +3 -0
- emdash_core/agent/__init__.py +37 -0
- emdash_core/agent/agents.py +225 -0
- emdash_core/agent/code_reviewer.py +476 -0
- emdash_core/agent/compaction.py +143 -0
- emdash_core/agent/context_manager.py +140 -0
- emdash_core/agent/events.py +338 -0
- emdash_core/agent/handlers.py +224 -0
- emdash_core/agent/inprocess_subagent.py +377 -0
- emdash_core/agent/mcp/__init__.py +50 -0
- emdash_core/agent/mcp/client.py +346 -0
- emdash_core/agent/mcp/config.py +302 -0
- emdash_core/agent/mcp/manager.py +496 -0
- emdash_core/agent/mcp/tool_factory.py +213 -0
- emdash_core/agent/prompts/__init__.py +38 -0
- emdash_core/agent/prompts/main_agent.py +104 -0
- emdash_core/agent/prompts/subagents.py +131 -0
- emdash_core/agent/prompts/workflow.py +136 -0
- emdash_core/agent/providers/__init__.py +34 -0
- emdash_core/agent/providers/base.py +143 -0
- emdash_core/agent/providers/factory.py +80 -0
- emdash_core/agent/providers/models.py +220 -0
- emdash_core/agent/providers/openai_provider.py +463 -0
- emdash_core/agent/providers/transformers_provider.py +217 -0
- emdash_core/agent/research/__init__.py +81 -0
- emdash_core/agent/research/agent.py +143 -0
- emdash_core/agent/research/controller.py +254 -0
- emdash_core/agent/research/critic.py +428 -0
- emdash_core/agent/research/macros.py +469 -0
- emdash_core/agent/research/planner.py +449 -0
- emdash_core/agent/research/researcher.py +436 -0
- emdash_core/agent/research/state.py +523 -0
- emdash_core/agent/research/synthesizer.py +594 -0
- emdash_core/agent/reviewer_profile.py +475 -0
- emdash_core/agent/rules.py +123 -0
- emdash_core/agent/runner.py +601 -0
- emdash_core/agent/session.py +262 -0
- emdash_core/agent/spec_schema.py +66 -0
- emdash_core/agent/specification.py +479 -0
- emdash_core/agent/subagent.py +397 -0
- emdash_core/agent/subagent_prompts.py +13 -0
- emdash_core/agent/toolkit.py +482 -0
- emdash_core/agent/toolkits/__init__.py +64 -0
- emdash_core/agent/toolkits/base.py +96 -0
- emdash_core/agent/toolkits/explore.py +47 -0
- emdash_core/agent/toolkits/plan.py +55 -0
- emdash_core/agent/tools/__init__.py +141 -0
- emdash_core/agent/tools/analytics.py +436 -0
- emdash_core/agent/tools/base.py +131 -0
- emdash_core/agent/tools/coding.py +484 -0
- emdash_core/agent/tools/github_mcp.py +592 -0
- emdash_core/agent/tools/history.py +13 -0
- emdash_core/agent/tools/modes.py +153 -0
- emdash_core/agent/tools/plan.py +206 -0
- emdash_core/agent/tools/plan_write.py +135 -0
- emdash_core/agent/tools/search.py +412 -0
- emdash_core/agent/tools/spec.py +341 -0
- emdash_core/agent/tools/task.py +262 -0
- emdash_core/agent/tools/task_output.py +204 -0
- emdash_core/agent/tools/tasks.py +454 -0
- emdash_core/agent/tools/traversal.py +588 -0
- emdash_core/agent/tools/web.py +179 -0
- emdash_core/analytics/__init__.py +5 -0
- emdash_core/analytics/engine.py +1286 -0
- emdash_core/api/__init__.py +5 -0
- emdash_core/api/agent.py +308 -0
- emdash_core/api/agents.py +154 -0
- emdash_core/api/analyze.py +264 -0
- emdash_core/api/auth.py +173 -0
- emdash_core/api/context.py +77 -0
- emdash_core/api/db.py +121 -0
- emdash_core/api/embed.py +131 -0
- emdash_core/api/feature.py +143 -0
- emdash_core/api/health.py +93 -0
- emdash_core/api/index.py +162 -0
- emdash_core/api/plan.py +110 -0
- emdash_core/api/projectmd.py +210 -0
- emdash_core/api/query.py +320 -0
- emdash_core/api/research.py +122 -0
- emdash_core/api/review.py +161 -0
- emdash_core/api/router.py +76 -0
- emdash_core/api/rules.py +116 -0
- emdash_core/api/search.py +119 -0
- emdash_core/api/spec.py +99 -0
- emdash_core/api/swarm.py +223 -0
- emdash_core/api/tasks.py +109 -0
- emdash_core/api/team.py +120 -0
- emdash_core/auth/__init__.py +17 -0
- emdash_core/auth/github.py +389 -0
- emdash_core/config.py +74 -0
- emdash_core/context/__init__.py +52 -0
- emdash_core/context/models.py +50 -0
- emdash_core/context/providers/__init__.py +11 -0
- emdash_core/context/providers/base.py +74 -0
- emdash_core/context/providers/explored_areas.py +183 -0
- emdash_core/context/providers/touched_areas.py +360 -0
- emdash_core/context/registry.py +73 -0
- emdash_core/context/reranker.py +199 -0
- emdash_core/context/service.py +260 -0
- emdash_core/context/session.py +352 -0
- emdash_core/core/__init__.py +104 -0
- emdash_core/core/config.py +454 -0
- emdash_core/core/exceptions.py +55 -0
- emdash_core/core/models.py +265 -0
- emdash_core/core/review_config.py +57 -0
- emdash_core/db/__init__.py +67 -0
- emdash_core/db/auth.py +134 -0
- emdash_core/db/models.py +91 -0
- emdash_core/db/provider.py +222 -0
- emdash_core/db/providers/__init__.py +5 -0
- emdash_core/db/providers/supabase.py +452 -0
- emdash_core/embeddings/__init__.py +24 -0
- emdash_core/embeddings/indexer.py +534 -0
- emdash_core/embeddings/models.py +192 -0
- emdash_core/embeddings/providers/__init__.py +7 -0
- emdash_core/embeddings/providers/base.py +112 -0
- emdash_core/embeddings/providers/fireworks.py +141 -0
- emdash_core/embeddings/providers/openai.py +104 -0
- emdash_core/embeddings/registry.py +146 -0
- emdash_core/embeddings/service.py +215 -0
- emdash_core/graph/__init__.py +26 -0
- emdash_core/graph/builder.py +134 -0
- emdash_core/graph/connection.py +692 -0
- emdash_core/graph/schema.py +416 -0
- emdash_core/graph/writer.py +667 -0
- emdash_core/ingestion/__init__.py +7 -0
- emdash_core/ingestion/change_detector.py +150 -0
- emdash_core/ingestion/git/__init__.py +5 -0
- emdash_core/ingestion/git/commit_analyzer.py +196 -0
- emdash_core/ingestion/github/__init__.py +6 -0
- emdash_core/ingestion/github/pr_fetcher.py +296 -0
- emdash_core/ingestion/github/task_extractor.py +100 -0
- emdash_core/ingestion/orchestrator.py +540 -0
- emdash_core/ingestion/parsers/__init__.py +10 -0
- emdash_core/ingestion/parsers/base_parser.py +66 -0
- emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
- emdash_core/ingestion/parsers/class_extractor.py +154 -0
- emdash_core/ingestion/parsers/function_extractor.py +202 -0
- emdash_core/ingestion/parsers/import_analyzer.py +119 -0
- emdash_core/ingestion/parsers/python_parser.py +123 -0
- emdash_core/ingestion/parsers/registry.py +72 -0
- emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
- emdash_core/ingestion/parsers/typescript_parser.py +278 -0
- emdash_core/ingestion/repository.py +346 -0
- emdash_core/models/__init__.py +38 -0
- emdash_core/models/agent.py +68 -0
- emdash_core/models/index.py +77 -0
- emdash_core/models/query.py +113 -0
- emdash_core/planning/__init__.py +7 -0
- emdash_core/planning/agent_api.py +413 -0
- emdash_core/planning/context_builder.py +265 -0
- emdash_core/planning/feature_context.py +232 -0
- emdash_core/planning/feature_expander.py +646 -0
- emdash_core/planning/llm_explainer.py +198 -0
- emdash_core/planning/similarity.py +509 -0
- emdash_core/planning/team_focus.py +821 -0
- emdash_core/server.py +153 -0
- emdash_core/sse/__init__.py +5 -0
- emdash_core/sse/stream.py +196 -0
- emdash_core/swarm/__init__.py +17 -0
- emdash_core/swarm/merge_agent.py +383 -0
- emdash_core/swarm/session_manager.py +274 -0
- emdash_core/swarm/swarm_runner.py +226 -0
- emdash_core/swarm/task_definition.py +137 -0
- emdash_core/swarm/worker_spawner.py +319 -0
- emdash_core/swarm/worktree_manager.py +278 -0
- emdash_core/templates/__init__.py +10 -0
- emdash_core/templates/defaults/agent-builder.md.template +82 -0
- emdash_core/templates/defaults/focus.md.template +115 -0
- emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
- emdash_core/templates/defaults/pr-review.md.template +80 -0
- emdash_core/templates/defaults/project.md.template +85 -0
- emdash_core/templates/defaults/research_critic.md.template +112 -0
- emdash_core/templates/defaults/research_planner.md.template +85 -0
- emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
- emdash_core/templates/defaults/reviewer.md.template +81 -0
- emdash_core/templates/defaults/spec.md.template +41 -0
- emdash_core/templates/defaults/tasks.md.template +78 -0
- emdash_core/templates/loader.py +296 -0
- emdash_core/utils/__init__.py +45 -0
- emdash_core/utils/git.py +84 -0
- emdash_core/utils/image.py +502 -0
- emdash_core/utils/logger.py +51 -0
- emdash_core-0.1.7.dist-info/METADATA +35 -0
- emdash_core-0.1.7.dist-info/RECORD +187 -0
- emdash_core-0.1.7.dist-info/WHEEL +4 -0
- emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
emdash_core/embeddings/indexer.py
@@ -0,0 +1,534 @@
"""Batch embedding indexer for Kuzu entities."""

from typing import Optional

from ..graph.connection import KuzuConnection, get_connection
from .service import EmbeddingService
from ..utils.logger import log


class EmbeddingIndexer:
    """Generates and stores embeddings for graph entities."""

    def __init__(
        self,
        connection: Optional[KuzuConnection] = None,
        embedding_service: Optional[EmbeddingService] = None,
    ):
        """Initialize embedding indexer.

        Args:
            connection: Kuzu connection. If None, uses global connection.
            embedding_service: Embedding service. If None, creates new one.
        """
        self.connection = connection or get_connection()
        self.embedding_service = embedding_service or EmbeddingService()

    def index_pull_requests(self, batch_size: int = 50) -> int:
        """Generate embeddings for PRs without embeddings.

        Args:
            batch_size: Number of PRs to process per batch

        Returns:
            Number of PRs indexed
        """
        if not self.embedding_service.is_available:
            log.warning("OpenAI API key not configured. Skipping PR embedding.")
            return 0

        log.info("Indexing PR embeddings...")
        indexed_count = 0

        # Get PRs without embeddings
        prs = self.connection.execute("""
            MATCH (pr:PullRequest)
            WHERE pr.embedding IS NULL
            RETURN pr.number AS number,
                   pr.title AS title,
                   pr.description AS description
            LIMIT $limit
        """, {"limit": batch_size})

        while prs:
            # Generate embeddings
            texts = [
                f"{pr['title']}\n\n{pr['description'] or ''}"
                for pr in prs
            ]
            embeddings = self.embedding_service.embed_texts(texts)

            # Update PRs with embeddings
            for pr, embedding in zip(prs, embeddings):
                if embedding:
                    self.connection.execute_write("""
                        MATCH (pr:PullRequest {number: $number})
                        SET pr.embedding = $embedding
                    """, {"number": pr['number'], "embedding": list(embedding)})
                    indexed_count += 1

            log.info(f"Indexed {indexed_count} PR embeddings...")

            # Get next batch
            prs = self.connection.execute("""
                MATCH (pr:PullRequest)
                WHERE pr.embedding IS NULL
                RETURN pr.number AS number,
                       pr.title AS title,
                       pr.description AS description
                LIMIT $limit
            """, {"limit": batch_size})

        log.info(f"Completed indexing {indexed_count} PR embeddings")
        return indexed_count

    def _build_function_text(self, func: dict) -> str:
        """Build rich text for function embedding.

        Includes file path, name, signature, and docstring for better semantic matching.
        """
        parts = []

        # File path provides directory/module context (e.g., "components/Button.tsx")
        if func.get('file_path'):
            # Use just the relative path portion
            path = func['file_path']
            if '/' in path:
                # Take last 3 parts of path for context
                path_parts = path.split('/')
                path = '/'.join(path_parts[-3:]) if len(path_parts) > 3 else path
            parts.append(f"File: {path}")

        # Function name (often descriptive: handleClick, fetchUserData, etc.)
        parts.append(f"Function: {func['name']}")

        # Signature provides parameter context
        if func.get('signature'):
            parts.append(f"Signature: {func['signature']}")

        # Docstring is the most semantic-rich when available
        if func.get('docstring'):
            parts.append(f"Description: {func['docstring']}")

        return "\n".join(parts)

    def index_functions(self, batch_size: int = 100, reindex: bool = False) -> int:
        """Generate embeddings for all functions.

        Args:
            batch_size: Number of functions to process per batch
            reindex: If True, re-generate embeddings even for functions that have them

        Returns:
            Number of functions indexed
        """
        if not self.embedding_service.is_available:
            log.warning("OpenAI API key not configured. Skipping function embedding.")
            return 0

        log.info("Indexing function embeddings (all functions)...")
        indexed_count = 0

        # Get functions without embeddings (or all if reindexing)
        where_clause = "" if reindex else "WHERE f.embedding IS NULL"
        functions = self.connection.execute(f"""
            MATCH (f:Function)
            {where_clause}
            RETURN f.qualified_name AS qualified_name,
                   f.name AS name,
                   f.docstring AS docstring,
                   f.file_path AS file_path
            LIMIT $limit
        """, {"limit": batch_size})

        while functions:
            # Generate embeddings with rich context
            texts = [self._build_function_text(func) for func in functions]
            embeddings = self.embedding_service.embed_texts(texts)

            # Update functions with embeddings
            for func, embedding in zip(functions, embeddings):
                if embedding:
                    self.connection.execute_write("""
                        MATCH (f:Function {qualified_name: $qualified_name})
                        SET f.embedding = $embedding
                    """, {"qualified_name": func['qualified_name'], "embedding": list(embedding)})
                    indexed_count += 1

            log.info(f"Indexed {indexed_count} function embeddings...")

            # Get next batch
            functions = self.connection.execute(f"""
                MATCH (f:Function)
                {where_clause}
                RETURN f.qualified_name AS qualified_name,
                       f.name AS name,
                       f.docstring AS docstring,
                       f.file_path AS file_path
                LIMIT $limit
            """, {"limit": batch_size})

        log.info(f"Completed indexing {indexed_count} function embeddings")
        return indexed_count

    def _build_class_text(self, cls: dict) -> str:
        """Build rich text for class embedding.

        Includes file path, name, and docstring for better semantic matching.
        """
        parts = []

        # File path provides directory/module context
        if cls.get('file_path'):
            path = cls['file_path']
            if '/' in path:
                path_parts = path.split('/')
                path = '/'.join(path_parts[-3:]) if len(path_parts) > 3 else path
            parts.append(f"File: {path}")

        # Class name
        parts.append(f"Class: {cls['name']}")

        # Docstring when available
        if cls.get('docstring'):
            parts.append(f"Description: {cls['docstring']}")

        return "\n".join(parts)

    def index_classes(self, batch_size: int = 100, reindex: bool = False) -> int:
        """Generate embeddings for all classes.

        Args:
            batch_size: Number of classes to process per batch
            reindex: If True, re-generate embeddings even for classes that have them

        Returns:
            Number of classes indexed
        """
        if not self.embedding_service.is_available:
            log.warning("OpenAI API key not configured. Skipping class embedding.")
            return 0

        log.info("Indexing class embeddings (all classes)...")
        indexed_count = 0

        # Get classes without embeddings (or all if reindexing)
        where_clause = "" if reindex else "WHERE c.embedding IS NULL"
        classes = self.connection.execute(f"""
            MATCH (c:Class)
            {where_clause}
            RETURN c.qualified_name AS qualified_name,
                   c.name AS name,
                   c.docstring AS docstring,
                   c.file_path AS file_path
            LIMIT $limit
        """, {"limit": batch_size})

        while classes:
            # Generate embeddings with rich context
            texts = [self._build_class_text(cls) for cls in classes]
            embeddings = self.embedding_service.embed_texts(texts)

            # Update classes with embeddings
            for cls, embedding in zip(classes, embeddings):
                if embedding:
                    self.connection.execute_write("""
                        MATCH (c:Class {qualified_name: $qualified_name})
                        SET c.embedding = $embedding
                    """, {"qualified_name": cls['qualified_name'], "embedding": list(embedding)})
                    indexed_count += 1

            log.info(f"Indexed {indexed_count} class embeddings...")

            # Get next batch
            classes = self.connection.execute(f"""
                MATCH (c:Class)
                {where_clause}
                RETURN c.qualified_name AS qualified_name,
                       c.name AS name,
                       c.docstring AS docstring,
                       c.file_path AS file_path
                LIMIT $limit
            """, {"limit": batch_size})

        log.info(f"Completed indexing {indexed_count} class embeddings")
        return indexed_count

    def _build_community_text(self, community: dict) -> str:
        """Build rich text for community embedding.

        Includes community ID and description for semantic matching.
        """
        parts = []
        parts.append(f"Community {community['community_id']}")
        if community.get('description'):
            parts.append(f"Description: {community['description']}")
        return "\n".join(parts)

    def index_communities(self, batch_size: int = 50, reindex: bool = False) -> int:
        """Generate embeddings for community descriptions.

        Args:
            batch_size: Number of communities to process per batch
            reindex: If True, re-generate embeddings even for communities that have them

        Returns:
            Number of communities indexed
        """
        if not self.embedding_service.is_available:
            log.warning("Embedding service not available. Skipping community embedding.")
            return 0

        log.info("Indexing community embeddings...")
        indexed_count = 0

        # Get communities with descriptions but without embeddings (or all if reindexing)
        where_clause = "WHERE c.description IS NOT NULL" + ("" if reindex else " AND c.embedding IS NULL")
        communities = self.connection.execute(f"""
            MATCH (c:Community)
            {where_clause}
            RETURN c.community_id AS community_id,
                   c.description AS description
            LIMIT $limit
        """, {"limit": batch_size})

        while communities:
            # Generate embeddings
            texts = [self._build_community_text(c) for c in communities]
            embeddings = self.embedding_service.embed_texts(texts)

            # Update communities with embeddings
            for community, embedding in zip(communities, embeddings):
                if embedding:
                    self.connection.execute_write("""
                        MATCH (c:Community {community_id: $community_id})
                        SET c.embedding = $embedding
                    """, {"community_id": community['community_id'], "embedding": list(embedding)})
                    indexed_count += 1

            log.info(f"Indexed {indexed_count} community embeddings...")

            # Get next batch
            communities = self.connection.execute(f"""
                MATCH (c:Community)
                {where_clause}
                RETURN c.community_id AS community_id,
                       c.description AS description
                LIMIT $limit
            """, {"limit": batch_size})

        log.info(f"Completed indexing {indexed_count} community embeddings")
        return indexed_count

    def index_all(self, reindex: bool = False) -> dict:
        """Index embeddings for all entity types.

        Args:
            reindex: If True, re-generate all embeddings (useful after improving
                embedding quality or changing the text format)

        Returns:
            Dictionary with counts per entity type
        """
        return {
            "pull_requests": self.index_pull_requests(),
            "functions": self.index_functions(reindex=reindex),
            "classes": self.index_classes(reindex=reindex),
            "communities": self.index_communities(reindex=reindex),
        }

    def search(
        self,
        query: str,
        entity_types: list[str] | None = None,
        limit: int = 10,
        min_score: float = 0.5,
    ) -> list[dict]:
        """Search for entities using semantic similarity.

        Args:
            query: Natural language search query
            entity_types: Types to search (Function, Class, File). If None, searches all.
            limit: Maximum results
            min_score: Minimum similarity score (0-1)

        Returns:
            List of matching entities with scores
        """
        if not self.embedding_service.is_available:
            return []

        # Generate query embedding
        query_embedding = self.embedding_service.embed_query(query)
        if query_embedding is None:
            return []

        results = []
        types_to_search = entity_types or ["Function", "Class"]

        for entity_type in types_to_search:
            if entity_type == "Function":
                matches = self._search_functions(query_embedding, limit, min_score)
            elif entity_type == "Class":
                matches = self._search_classes(query_embedding, limit, min_score)
            else:
                continue
            results.extend(matches)

        # Sort by score and limit
        results.sort(key=lambda x: x.get("score", 0), reverse=True)
        return results[:limit]

    def _cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float:
        """Calculate cosine similarity between two vectors."""
        import math
        dot_product = sum(a * b for a, b in zip(vec1, vec2))
        norm1 = math.sqrt(sum(a * a for a in vec1))
        norm2 = math.sqrt(sum(b * b for b in vec2))
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)

    def _search_functions(
        self, query_embedding: list[float], limit: int, min_score: float
    ) -> list[dict]:
        """Search functions by embedding similarity."""
        results = []

        # Get all functions with embeddings
        try:
            functions = self.connection.execute("""
                MATCH (f:Function)
                WHERE f.embedding IS NOT NULL
                RETURN f.qualified_name AS qualified_name,
                       f.name AS name,
                       f.file_path AS file_path,
                       f.docstring AS docstring,
                       f.embedding AS embedding
            """)
        except Exception:
            # Table doesn't exist or other error - return empty results
            return []

        for func in functions:
            if func.get("embedding"):
                score = self._cosine_similarity(query_embedding, func["embedding"])
                if score >= min_score:
                    results.append({
                        "qualified_name": func["qualified_name"],
                        "name": func["name"],
                        "file_path": func["file_path"],
                        "type": "Function",
                        "node_type": "Function",
                        "score": score,
                        "docstring": func.get("docstring"),
                    })

        results.sort(key=lambda x: x["score"], reverse=True)
        return results[:limit]

    def _search_classes(
        self, query_embedding: list[float], limit: int, min_score: float
    ) -> list[dict]:
        """Search classes by embedding similarity."""
        results = []

        # Get all classes with embeddings
        try:
            classes = self.connection.execute("""
                MATCH (c:Class)
                WHERE c.embedding IS NOT NULL
                RETURN c.qualified_name AS qualified_name,
                       c.name AS name,
                       c.file_path AS file_path,
                       c.docstring AS docstring,
                       c.embedding AS embedding
            """)
        except Exception:
            # Table doesn't exist or other error - return empty results
            return []

        for cls in classes:
            if cls.get("embedding"):
                score = self._cosine_similarity(query_embedding, cls["embedding"])
                if score >= min_score:
                    results.append({
                        "qualified_name": cls["qualified_name"],
                        "name": cls["name"],
                        "file_path": cls["file_path"],
                        "type": "Class",
                        "node_type": "Class",
                        "score": score,
                        "docstring": cls.get("docstring"),
                    })

        results.sort(key=lambda x: x["score"], reverse=True)
        return results[:limit]

    def get_embedding_stats(self) -> dict:
        """Get embedding coverage statistics.

        Returns:
            Dictionary with embedding stats per entity type
        """
        stats = {}

        # PR stats
        pr_results = self.connection.execute("""
            MATCH (pr:PullRequest)
            RETURN count(pr) AS total,
                   count(pr.embedding) AS with_embedding
        """)
        if pr_results:
            record = pr_results[0]
            stats["pull_requests"] = {
                "total": record["total"],
                "with_embedding": record["with_embedding"],
            }

        # Function stats
        func_results = self.connection.execute("""
            MATCH (f:Function)
            RETURN count(f) AS total,
                   count(f.embedding) AS with_embedding,
                   count(CASE WHEN f.docstring IS NOT NULL THEN 1 END) AS with_docstring
        """)
        if func_results:
            record = func_results[0]
            stats["functions"] = {
                "total": record["total"],
                "with_embedding": record["with_embedding"],
                "with_docstring": record["with_docstring"],
            }

        # Class stats
        class_results = self.connection.execute("""
            MATCH (c:Class)
            RETURN count(c) AS total,
                   count(c.embedding) AS with_embedding,
                   count(CASE WHEN c.docstring IS NOT NULL THEN 1 END) AS with_docstring
        """)
        if class_results:
            record = class_results[0]
            stats["classes"] = {
                "total": record["total"],
                "with_embedding": record["with_embedding"],
                "with_docstring": record["with_docstring"],
            }

        # Community stats
        community_results = self.connection.execute("""
            MATCH (c:Community)
            RETURN count(c) AS total,
                   count(c.embedding) AS with_embedding,
                   count(CASE WHEN c.description IS NOT NULL THEN 1 END) AS with_description
        """)
        if community_results:
            record = community_results[0]
            stats["communities"] = {
                "total": record["total"],
                "with_embedding": record["with_embedding"],
                "with_description": record["with_description"],
            }

        return stats
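
For reference, a minimal usage sketch of the EmbeddingIndexer introduced by this file (illustrative only, not part of the published wheel; it assumes the package's global Kuzu connection and an embedding provider such as an OpenAI API key are already configured):

# Illustrative sketch; assumes a configured Kuzu graph and embedding provider.
from emdash_core.embeddings.indexer import EmbeddingIndexer

indexer = EmbeddingIndexer()            # falls back to the global connection and default EmbeddingService
counts = indexer.index_all()            # embeds PRs, functions, classes, and communities in batches
print(counts)
print(indexer.get_embedding_stats())    # per-entity embedding coverage

# Semantic search over the indexed graph entities
for hit in indexer.search("batch embedding indexer", entity_types=["Function", "Class"], limit=5):
    print(f"{hit['score']:.2f}  {hit['qualified_name']}  ({hit['file_path']})")

Note that search computes cosine similarity in Python over every stored vector, so it performs a full scan of the matching nodes rather than an index lookup.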