emdash-core 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. emdash_core/__init__.py +3 -0
  2. emdash_core/agent/__init__.py +37 -0
  3. emdash_core/agent/agents.py +225 -0
  4. emdash_core/agent/code_reviewer.py +476 -0
  5. emdash_core/agent/compaction.py +143 -0
  6. emdash_core/agent/context_manager.py +140 -0
  7. emdash_core/agent/events.py +338 -0
  8. emdash_core/agent/handlers.py +224 -0
  9. emdash_core/agent/inprocess_subagent.py +377 -0
  10. emdash_core/agent/mcp/__init__.py +50 -0
  11. emdash_core/agent/mcp/client.py +346 -0
  12. emdash_core/agent/mcp/config.py +302 -0
  13. emdash_core/agent/mcp/manager.py +496 -0
  14. emdash_core/agent/mcp/tool_factory.py +213 -0
  15. emdash_core/agent/prompts/__init__.py +38 -0
  16. emdash_core/agent/prompts/main_agent.py +104 -0
  17. emdash_core/agent/prompts/subagents.py +131 -0
  18. emdash_core/agent/prompts/workflow.py +136 -0
  19. emdash_core/agent/providers/__init__.py +34 -0
  20. emdash_core/agent/providers/base.py +143 -0
  21. emdash_core/agent/providers/factory.py +80 -0
  22. emdash_core/agent/providers/models.py +220 -0
  23. emdash_core/agent/providers/openai_provider.py +463 -0
  24. emdash_core/agent/providers/transformers_provider.py +217 -0
  25. emdash_core/agent/research/__init__.py +81 -0
  26. emdash_core/agent/research/agent.py +143 -0
  27. emdash_core/agent/research/controller.py +254 -0
  28. emdash_core/agent/research/critic.py +428 -0
  29. emdash_core/agent/research/macros.py +469 -0
  30. emdash_core/agent/research/planner.py +449 -0
  31. emdash_core/agent/research/researcher.py +436 -0
  32. emdash_core/agent/research/state.py +523 -0
  33. emdash_core/agent/research/synthesizer.py +594 -0
  34. emdash_core/agent/reviewer_profile.py +475 -0
  35. emdash_core/agent/rules.py +123 -0
  36. emdash_core/agent/runner.py +601 -0
  37. emdash_core/agent/session.py +262 -0
  38. emdash_core/agent/spec_schema.py +66 -0
  39. emdash_core/agent/specification.py +479 -0
  40. emdash_core/agent/subagent.py +397 -0
  41. emdash_core/agent/subagent_prompts.py +13 -0
  42. emdash_core/agent/toolkit.py +482 -0
  43. emdash_core/agent/toolkits/__init__.py +64 -0
  44. emdash_core/agent/toolkits/base.py +96 -0
  45. emdash_core/agent/toolkits/explore.py +47 -0
  46. emdash_core/agent/toolkits/plan.py +55 -0
  47. emdash_core/agent/tools/__init__.py +141 -0
  48. emdash_core/agent/tools/analytics.py +436 -0
  49. emdash_core/agent/tools/base.py +131 -0
  50. emdash_core/agent/tools/coding.py +484 -0
  51. emdash_core/agent/tools/github_mcp.py +592 -0
  52. emdash_core/agent/tools/history.py +13 -0
  53. emdash_core/agent/tools/modes.py +153 -0
  54. emdash_core/agent/tools/plan.py +206 -0
  55. emdash_core/agent/tools/plan_write.py +135 -0
  56. emdash_core/agent/tools/search.py +412 -0
  57. emdash_core/agent/tools/spec.py +341 -0
  58. emdash_core/agent/tools/task.py +262 -0
  59. emdash_core/agent/tools/task_output.py +204 -0
  60. emdash_core/agent/tools/tasks.py +454 -0
  61. emdash_core/agent/tools/traversal.py +588 -0
  62. emdash_core/agent/tools/web.py +179 -0
  63. emdash_core/analytics/__init__.py +5 -0
  64. emdash_core/analytics/engine.py +1286 -0
  65. emdash_core/api/__init__.py +5 -0
  66. emdash_core/api/agent.py +308 -0
  67. emdash_core/api/agents.py +154 -0
  68. emdash_core/api/analyze.py +264 -0
  69. emdash_core/api/auth.py +173 -0
  70. emdash_core/api/context.py +77 -0
  71. emdash_core/api/db.py +121 -0
  72. emdash_core/api/embed.py +131 -0
  73. emdash_core/api/feature.py +143 -0
  74. emdash_core/api/health.py +93 -0
  75. emdash_core/api/index.py +162 -0
  76. emdash_core/api/plan.py +110 -0
  77. emdash_core/api/projectmd.py +210 -0
  78. emdash_core/api/query.py +320 -0
  79. emdash_core/api/research.py +122 -0
  80. emdash_core/api/review.py +161 -0
  81. emdash_core/api/router.py +76 -0
  82. emdash_core/api/rules.py +116 -0
  83. emdash_core/api/search.py +119 -0
  84. emdash_core/api/spec.py +99 -0
  85. emdash_core/api/swarm.py +223 -0
  86. emdash_core/api/tasks.py +109 -0
  87. emdash_core/api/team.py +120 -0
  88. emdash_core/auth/__init__.py +17 -0
  89. emdash_core/auth/github.py +389 -0
  90. emdash_core/config.py +74 -0
  91. emdash_core/context/__init__.py +52 -0
  92. emdash_core/context/models.py +50 -0
  93. emdash_core/context/providers/__init__.py +11 -0
  94. emdash_core/context/providers/base.py +74 -0
  95. emdash_core/context/providers/explored_areas.py +183 -0
  96. emdash_core/context/providers/touched_areas.py +360 -0
  97. emdash_core/context/registry.py +73 -0
  98. emdash_core/context/reranker.py +199 -0
  99. emdash_core/context/service.py +260 -0
  100. emdash_core/context/session.py +352 -0
  101. emdash_core/core/__init__.py +104 -0
  102. emdash_core/core/config.py +454 -0
  103. emdash_core/core/exceptions.py +55 -0
  104. emdash_core/core/models.py +265 -0
  105. emdash_core/core/review_config.py +57 -0
  106. emdash_core/db/__init__.py +67 -0
  107. emdash_core/db/auth.py +134 -0
  108. emdash_core/db/models.py +91 -0
  109. emdash_core/db/provider.py +222 -0
  110. emdash_core/db/providers/__init__.py +5 -0
  111. emdash_core/db/providers/supabase.py +452 -0
  112. emdash_core/embeddings/__init__.py +24 -0
  113. emdash_core/embeddings/indexer.py +534 -0
  114. emdash_core/embeddings/models.py +192 -0
  115. emdash_core/embeddings/providers/__init__.py +7 -0
  116. emdash_core/embeddings/providers/base.py +112 -0
  117. emdash_core/embeddings/providers/fireworks.py +141 -0
  118. emdash_core/embeddings/providers/openai.py +104 -0
  119. emdash_core/embeddings/registry.py +146 -0
  120. emdash_core/embeddings/service.py +215 -0
  121. emdash_core/graph/__init__.py +26 -0
  122. emdash_core/graph/builder.py +134 -0
  123. emdash_core/graph/connection.py +692 -0
  124. emdash_core/graph/schema.py +416 -0
  125. emdash_core/graph/writer.py +667 -0
  126. emdash_core/ingestion/__init__.py +7 -0
  127. emdash_core/ingestion/change_detector.py +150 -0
  128. emdash_core/ingestion/git/__init__.py +5 -0
  129. emdash_core/ingestion/git/commit_analyzer.py +196 -0
  130. emdash_core/ingestion/github/__init__.py +6 -0
  131. emdash_core/ingestion/github/pr_fetcher.py +296 -0
  132. emdash_core/ingestion/github/task_extractor.py +100 -0
  133. emdash_core/ingestion/orchestrator.py +540 -0
  134. emdash_core/ingestion/parsers/__init__.py +10 -0
  135. emdash_core/ingestion/parsers/base_parser.py +66 -0
  136. emdash_core/ingestion/parsers/call_graph_builder.py +121 -0
  137. emdash_core/ingestion/parsers/class_extractor.py +154 -0
  138. emdash_core/ingestion/parsers/function_extractor.py +202 -0
  139. emdash_core/ingestion/parsers/import_analyzer.py +119 -0
  140. emdash_core/ingestion/parsers/python_parser.py +123 -0
  141. emdash_core/ingestion/parsers/registry.py +72 -0
  142. emdash_core/ingestion/parsers/ts_ast_parser.js +313 -0
  143. emdash_core/ingestion/parsers/typescript_parser.py +278 -0
  144. emdash_core/ingestion/repository.py +346 -0
  145. emdash_core/models/__init__.py +38 -0
  146. emdash_core/models/agent.py +68 -0
  147. emdash_core/models/index.py +77 -0
  148. emdash_core/models/query.py +113 -0
  149. emdash_core/planning/__init__.py +7 -0
  150. emdash_core/planning/agent_api.py +413 -0
  151. emdash_core/planning/context_builder.py +265 -0
  152. emdash_core/planning/feature_context.py +232 -0
  153. emdash_core/planning/feature_expander.py +646 -0
  154. emdash_core/planning/llm_explainer.py +198 -0
  155. emdash_core/planning/similarity.py +509 -0
  156. emdash_core/planning/team_focus.py +821 -0
  157. emdash_core/server.py +153 -0
  158. emdash_core/sse/__init__.py +5 -0
  159. emdash_core/sse/stream.py +196 -0
  160. emdash_core/swarm/__init__.py +17 -0
  161. emdash_core/swarm/merge_agent.py +383 -0
  162. emdash_core/swarm/session_manager.py +274 -0
  163. emdash_core/swarm/swarm_runner.py +226 -0
  164. emdash_core/swarm/task_definition.py +137 -0
  165. emdash_core/swarm/worker_spawner.py +319 -0
  166. emdash_core/swarm/worktree_manager.py +278 -0
  167. emdash_core/templates/__init__.py +10 -0
  168. emdash_core/templates/defaults/agent-builder.md.template +82 -0
  169. emdash_core/templates/defaults/focus.md.template +115 -0
  170. emdash_core/templates/defaults/pr-review-enhanced.md.template +309 -0
  171. emdash_core/templates/defaults/pr-review.md.template +80 -0
  172. emdash_core/templates/defaults/project.md.template +85 -0
  173. emdash_core/templates/defaults/research_critic.md.template +112 -0
  174. emdash_core/templates/defaults/research_planner.md.template +85 -0
  175. emdash_core/templates/defaults/research_synthesizer.md.template +128 -0
  176. emdash_core/templates/defaults/reviewer.md.template +81 -0
  177. emdash_core/templates/defaults/spec.md.template +41 -0
  178. emdash_core/templates/defaults/tasks.md.template +78 -0
  179. emdash_core/templates/loader.py +296 -0
  180. emdash_core/utils/__init__.py +45 -0
  181. emdash_core/utils/git.py +84 -0
  182. emdash_core/utils/image.py +502 -0
  183. emdash_core/utils/logger.py +51 -0
  184. emdash_core-0.1.7.dist-info/METADATA +35 -0
  185. emdash_core-0.1.7.dist-info/RECORD +187 -0
  186. emdash_core-0.1.7.dist-info/WHEEL +4 -0
  187. emdash_core-0.1.7.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,534 @@
1
+ """Batch embedding indexer for Kuzu entities."""
2
+
3
+ from typing import Optional
4
+
5
+ from ..graph.connection import KuzuConnection, get_connection
6
+ from .service import EmbeddingService
7
+ from ..utils.logger import log
8
+
9
+
10
+ class EmbeddingIndexer:
11
+ """Generates and stores embeddings for graph entities."""
12
+
13
+ def __init__(
14
+ self,
15
+ connection: Optional[KuzuConnection] = None,
16
+ embedding_service: Optional[EmbeddingService] = None,
17
+ ):
18
+ """Initialize embedding indexer.
19
+
20
+ Args:
21
+ connection: Kuzu connection. If None, uses global connection.
22
+ embedding_service: Embedding service. If None, creates new one.
23
+ """
24
+ self.connection = connection or get_connection()
25
+ self.embedding_service = embedding_service or EmbeddingService()
26
+
27
+ def index_pull_requests(self, batch_size: int = 50) -> int:
28
+ """Generate embeddings for PRs without embeddings.
29
+
30
+ Args:
31
+ batch_size: Number of PRs to process per batch
32
+
33
+ Returns:
34
+ Number of PRs indexed
35
+ """
36
+ if not self.embedding_service.is_available:
37
+ log.warning("OpenAI API key not configured. Skipping PR embedding.")
38
+ return 0
39
+
40
+ log.info("Indexing PR embeddings...")
41
+ indexed_count = 0
42
+
43
+ # Get PRs without embeddings
44
+ prs = self.connection.execute("""
45
+ MATCH (pr:PullRequest)
46
+ WHERE pr.embedding IS NULL
47
+ RETURN pr.number AS number,
48
+ pr.title AS title,
49
+ pr.description AS description
50
+ LIMIT $limit
51
+ """, {"limit": batch_size})
52
+
53
+ while prs:
54
+ # Generate embeddings
55
+ texts = [
56
+ f"{pr['title']}\n\n{pr['description'] or ''}"
57
+ for pr in prs
58
+ ]
59
+ embeddings = self.embedding_service.embed_texts(texts)
60
+
61
+ # Update PRs with embeddings
62
+ for pr, embedding in zip(prs, embeddings):
63
+ if embedding:
64
+ self.connection.execute_write("""
65
+ MATCH (pr:PullRequest {number: $number})
66
+ SET pr.embedding = $embedding
67
+ """, {"number": pr['number'], "embedding": list(embedding)})
68
+ indexed_count += 1
69
+
70
+ log.info(f"Indexed {indexed_count} PR embeddings...")
71
+
72
+ # Get next batch
73
+ prs = self.connection.execute("""
74
+ MATCH (pr:PullRequest)
75
+ WHERE pr.embedding IS NULL
76
+ RETURN pr.number AS number,
77
+ pr.title AS title,
78
+ pr.description AS description
79
+ LIMIT $limit
80
+ """, {"limit": batch_size})
81
+
82
+ log.info(f"Completed indexing {indexed_count} PR embeddings")
83
+ return indexed_count
84
+
85
+ def _build_function_text(self, func: dict) -> str:
86
+ """Build rich text for function embedding.
87
+
88
+ Includes file path, name, signature, and docstring for better semantic matching.
89
+ """
90
+ parts = []
91
+
92
+ # File path provides directory/module context (e.g., "components/Button.tsx")
93
+ if func.get('file_path'):
94
+ # Use just the relative path portion
95
+ path = func['file_path']
96
+ if '/' in path:
97
+ # Take last 3 parts of path for context
98
+ path_parts = path.split('/')
99
+ path = '/'.join(path_parts[-3:]) if len(path_parts) > 3 else path
100
+ parts.append(f"File: {path}")
101
+
102
+ # Function name (often descriptive: handleClick, fetchUserData, etc.)
103
+ parts.append(f"Function: {func['name']}")
104
+
105
+ # Signature provides parameter context
106
+ if func.get('signature'):
107
+ parts.append(f"Signature: {func['signature']}")
108
+
109
+ # Docstring is the most semantic-rich when available
110
+ if func.get('docstring'):
111
+ parts.append(f"Description: {func['docstring']}")
112
+
113
+ return "\n".join(parts)
114
+
115
+ def index_functions(self, batch_size: int = 100, reindex: bool = False) -> int:
116
+ """Generate embeddings for all functions.
117
+
118
+ Args:
119
+ batch_size: Number of functions to process per batch
120
+ reindex: If True, re-generate embeddings even for functions that have them
121
+
122
+ Returns:
123
+ Number of functions indexed
124
+ """
125
+ if not self.embedding_service.is_available:
126
+ log.warning("OpenAI API key not configured. Skipping function embedding.")
127
+ return 0
128
+
129
+ log.info("Indexing function embeddings (all functions)...")
130
+ indexed_count = 0
131
+
132
+ # Get functions without embeddings (or all if reindexing)
133
+ where_clause = "" if reindex else "WHERE f.embedding IS NULL"
134
+ functions = self.connection.execute(f"""
135
+ MATCH (f:Function)
136
+ {where_clause}
137
+ RETURN f.qualified_name AS qualified_name,
138
+ f.name AS name,
139
+ f.docstring AS docstring,
140
+ f.file_path AS file_path
141
+ LIMIT $limit
142
+ """, {"limit": batch_size})
143
+
144
+ while functions:
145
+ # Generate embeddings with rich context
146
+ texts = [self._build_function_text(func) for func in functions]
147
+ embeddings = self.embedding_service.embed_texts(texts)
148
+
149
+ # Update functions with embeddings
150
+ for func, embedding in zip(functions, embeddings):
151
+ if embedding:
152
+ self.connection.execute_write("""
153
+ MATCH (f:Function {qualified_name: $qualified_name})
154
+ SET f.embedding = $embedding
155
+ """, {"qualified_name": func['qualified_name'], "embedding": list(embedding)})
156
+ indexed_count += 1
157
+
158
+ log.info(f"Indexed {indexed_count} function embeddings...")
159
+
160
+ # Get next batch
161
+ functions = self.connection.execute(f"""
162
+ MATCH (f:Function)
163
+ {where_clause}
164
+ RETURN f.qualified_name AS qualified_name,
165
+ f.name AS name,
166
+ f.docstring AS docstring,
167
+ f.file_path AS file_path
168
+ LIMIT $limit
169
+ """, {"limit": batch_size})
170
+
171
+ log.info(f"Completed indexing {indexed_count} function embeddings")
172
+ return indexed_count
173
+
174
+ def _build_class_text(self, cls: dict) -> str:
175
+ """Build rich text for class embedding.
176
+
177
+ Includes file path, name, and docstring for better semantic matching.
178
+ """
179
+ parts = []
180
+
181
+ # File path provides directory/module context
182
+ if cls.get('file_path'):
183
+ path = cls['file_path']
184
+ if '/' in path:
185
+ path_parts = path.split('/')
186
+ path = '/'.join(path_parts[-3:]) if len(path_parts) > 3 else path
187
+ parts.append(f"File: {path}")
188
+
189
+ # Class name
190
+ parts.append(f"Class: {cls['name']}")
191
+
192
+ # Docstring when available
193
+ if cls.get('docstring'):
194
+ parts.append(f"Description: {cls['docstring']}")
195
+
196
+ return "\n".join(parts)
197
+
198
+ def index_classes(self, batch_size: int = 100, reindex: bool = False) -> int:
199
+ """Generate embeddings for all classes.
200
+
201
+ Args:
202
+ batch_size: Number of classes to process per batch
203
+ reindex: If True, re-generate embeddings even for classes that have them
204
+
205
+ Returns:
206
+ Number of classes indexed
207
+ """
208
+ if not self.embedding_service.is_available:
209
+ log.warning("OpenAI API key not configured. Skipping class embedding.")
210
+ return 0
211
+
212
+ log.info("Indexing class embeddings (all classes)...")
213
+ indexed_count = 0
214
+
215
+ # Get classes without embeddings (or all if reindexing)
216
+ where_clause = "" if reindex else "WHERE c.embedding IS NULL"
217
+ classes = self.connection.execute(f"""
218
+ MATCH (c:Class)
219
+ {where_clause}
220
+ RETURN c.qualified_name AS qualified_name,
221
+ c.name AS name,
222
+ c.docstring AS docstring,
223
+ c.file_path AS file_path
224
+ LIMIT $limit
225
+ """, {"limit": batch_size})
226
+
227
+ while classes:
228
+ # Generate embeddings with rich context
229
+ texts = [self._build_class_text(cls) for cls in classes]
230
+ embeddings = self.embedding_service.embed_texts(texts)
231
+
232
+ # Update classes with embeddings
233
+ for cls, embedding in zip(classes, embeddings):
234
+ if embedding:
235
+ self.connection.execute_write("""
236
+ MATCH (c:Class {qualified_name: $qualified_name})
237
+ SET c.embedding = $embedding
238
+ """, {"qualified_name": cls['qualified_name'], "embedding": list(embedding)})
239
+ indexed_count += 1
240
+
241
+ log.info(f"Indexed {indexed_count} class embeddings...")
242
+
243
+ # Get next batch
244
+ classes = self.connection.execute(f"""
245
+ MATCH (c:Class)
246
+ {where_clause}
247
+ RETURN c.qualified_name AS qualified_name,
248
+ c.name AS name,
249
+ c.docstring AS docstring,
250
+ c.file_path AS file_path
251
+ LIMIT $limit
252
+ """, {"limit": batch_size})
253
+
254
+ log.info(f"Completed indexing {indexed_count} class embeddings")
255
+ return indexed_count
256
+
257
+ def _build_community_text(self, community: dict) -> str:
258
+ """Build rich text for community embedding.
259
+
260
+ Includes community ID and description for semantic matching.
261
+ """
262
+ parts = []
263
+ parts.append(f"Community {community['community_id']}")
264
+ if community.get('description'):
265
+ parts.append(f"Description: {community['description']}")
266
+ return "\n".join(parts)
267
+
268
+ def index_communities(self, batch_size: int = 50, reindex: bool = False) -> int:
269
+ """Generate embeddings for community descriptions.
270
+
271
+ Args:
272
+ batch_size: Number of communities to process per batch
273
+ reindex: If True, re-generate embeddings even for communities that have them
274
+
275
+ Returns:
276
+ Number of communities indexed
277
+ """
278
+ if not self.embedding_service.is_available:
279
+ log.warning("Embedding service not available. Skipping community embedding.")
280
+ return 0
281
+
282
+ log.info("Indexing community embeddings...")
283
+ indexed_count = 0
284
+
285
+ # Get communities with descriptions but without embeddings (or all if reindexing)
286
+ where_clause = "WHERE c.description IS NOT NULL" + ("" if reindex else " AND c.embedding IS NULL")
287
+ communities = self.connection.execute(f"""
288
+ MATCH (c:Community)
289
+ {where_clause}
290
+ RETURN c.community_id AS community_id,
291
+ c.description AS description
292
+ LIMIT $limit
293
+ """, {"limit": batch_size})
294
+
295
+ while communities:
296
+ # Generate embeddings
297
+ texts = [self._build_community_text(c) for c in communities]
298
+ embeddings = self.embedding_service.embed_texts(texts)
299
+
300
+ # Update communities with embeddings
301
+ for community, embedding in zip(communities, embeddings):
302
+ if embedding:
303
+ self.connection.execute_write("""
304
+ MATCH (c:Community {community_id: $community_id})
305
+ SET c.embedding = $embedding
306
+ """, {"community_id": community['community_id'], "embedding": list(embedding)})
307
+ indexed_count += 1
308
+
309
+ log.info(f"Indexed {indexed_count} community embeddings...")
310
+
311
+ # Get next batch
312
+ communities = self.connection.execute(f"""
313
+ MATCH (c:Community)
314
+ {where_clause}
315
+ RETURN c.community_id AS community_id,
316
+ c.description AS description
317
+ LIMIT $limit
318
+ """, {"limit": batch_size})
319
+
320
+ log.info(f"Completed indexing {indexed_count} community embeddings")
321
+ return indexed_count
322
+
323
+ def index_all(self, reindex: bool = False) -> dict:
324
+ """Index embeddings for all entity types.
325
+
326
+ Args:
327
+ reindex: If True, re-generate all embeddings (useful after improving
328
+ embedding quality or changing the text format)
329
+
330
+ Returns:
331
+ Dictionary with counts per entity type
332
+ """
333
+ return {
334
+ "pull_requests": self.index_pull_requests(),
335
+ "functions": self.index_functions(reindex=reindex),
336
+ "classes": self.index_classes(reindex=reindex),
337
+ "communities": self.index_communities(reindex=reindex),
338
+ }
339
+
340
+ def search(
341
+ self,
342
+ query: str,
343
+ entity_types: list[str] | None = None,
344
+ limit: int = 10,
345
+ min_score: float = 0.5,
346
+ ) -> list[dict]:
347
+ """Search for entities using semantic similarity.
348
+
349
+ Args:
350
+ query: Natural language search query
351
+ entity_types: Types to search (Function, Class, File). If None, searches all.
352
+ limit: Maximum results
353
+ min_score: Minimum similarity score (0-1)
354
+
355
+ Returns:
356
+ List of matching entities with scores
357
+ """
358
+ if not self.embedding_service.is_available:
359
+ return []
360
+
361
+ # Generate query embedding
362
+ query_embedding = self.embedding_service.embed_query(query)
363
+ if query_embedding is None:
364
+ return []
365
+
366
+ results = []
367
+ types_to_search = entity_types or ["Function", "Class"]
368
+
369
+ for entity_type in types_to_search:
370
+ if entity_type == "Function":
371
+ matches = self._search_functions(query_embedding, limit, min_score)
372
+ elif entity_type == "Class":
373
+ matches = self._search_classes(query_embedding, limit, min_score)
374
+ else:
375
+ continue
376
+ results.extend(matches)
377
+
378
+ # Sort by score and limit
379
+ results.sort(key=lambda x: x.get("score", 0), reverse=True)
380
+ return results[:limit]
381
+
382
+ def _cosine_similarity(self, vec1: list[float], vec2: list[float]) -> float:
383
+ """Calculate cosine similarity between two vectors."""
384
+ import math
385
+ dot_product = sum(a * b for a, b in zip(vec1, vec2))
386
+ norm1 = math.sqrt(sum(a * a for a in vec1))
387
+ norm2 = math.sqrt(sum(b * b for b in vec2))
388
+ if norm1 == 0 or norm2 == 0:
389
+ return 0.0
390
+ return dot_product / (norm1 * norm2)
391
+
392
+ def _search_functions(
393
+ self, query_embedding: list[float], limit: int, min_score: float
394
+ ) -> list[dict]:
395
+ """Search functions by embedding similarity."""
396
+ results = []
397
+
398
+ # Get all functions with embeddings
399
+ try:
400
+ functions = self.connection.execute("""
401
+ MATCH (f:Function)
402
+ WHERE f.embedding IS NOT NULL
403
+ RETURN f.qualified_name AS qualified_name,
404
+ f.name AS name,
405
+ f.file_path AS file_path,
406
+ f.docstring AS docstring,
407
+ f.embedding AS embedding
408
+ """)
409
+ except Exception:
410
+ # Table doesn't exist or other error - return empty results
411
+ return []
412
+
413
+ for func in functions:
414
+ if func.get("embedding"):
415
+ score = self._cosine_similarity(query_embedding, func["embedding"])
416
+ if score >= min_score:
417
+ results.append({
418
+ "qualified_name": func["qualified_name"],
419
+ "name": func["name"],
420
+ "file_path": func["file_path"],
421
+ "type": "Function",
422
+ "node_type": "Function",
423
+ "score": score,
424
+ "docstring": func.get("docstring"),
425
+ })
426
+
427
+ results.sort(key=lambda x: x["score"], reverse=True)
428
+ return results[:limit]
429
+
430
+ def _search_classes(
431
+ self, query_embedding: list[float], limit: int, min_score: float
432
+ ) -> list[dict]:
433
+ """Search classes by embedding similarity."""
434
+ results = []
435
+
436
+ # Get all classes with embeddings
437
+ try:
438
+ classes = self.connection.execute("""
439
+ MATCH (c:Class)
440
+ WHERE c.embedding IS NOT NULL
441
+ RETURN c.qualified_name AS qualified_name,
442
+ c.name AS name,
443
+ c.file_path AS file_path,
444
+ c.docstring AS docstring,
445
+ c.embedding AS embedding
446
+ """)
447
+ except Exception:
448
+ # Table doesn't exist or other error - return empty results
449
+ return []
450
+
451
+ for cls in classes:
452
+ if cls.get("embedding"):
453
+ score = self._cosine_similarity(query_embedding, cls["embedding"])
454
+ if score >= min_score:
455
+ results.append({
456
+ "qualified_name": cls["qualified_name"],
457
+ "name": cls["name"],
458
+ "file_path": cls["file_path"],
459
+ "type": "Class",
460
+ "node_type": "Class",
461
+ "score": score,
462
+ "docstring": cls.get("docstring"),
463
+ })
464
+
465
+ results.sort(key=lambda x: x["score"], reverse=True)
466
+ return results[:limit]
467
+
468
+ def get_embedding_stats(self) -> dict:
469
+ """Get embedding coverage statistics.
470
+
471
+ Returns:
472
+ Dictionary with embedding stats per entity type
473
+ """
474
+ stats = {}
475
+
476
+ # PR stats
477
+ pr_results = self.connection.execute("""
478
+ MATCH (pr:PullRequest)
479
+ RETURN count(pr) AS total,
480
+ count(pr.embedding) AS with_embedding
481
+ """)
482
+ if pr_results:
483
+ record = pr_results[0]
484
+ stats["pull_requests"] = {
485
+ "total": record["total"],
486
+ "with_embedding": record["with_embedding"],
487
+ }
488
+
489
+ # Function stats
490
+ func_results = self.connection.execute("""
491
+ MATCH (f:Function)
492
+ RETURN count(f) AS total,
493
+ count(f.embedding) AS with_embedding,
494
+ count(CASE WHEN f.docstring IS NOT NULL THEN 1 END) AS with_docstring
495
+ """)
496
+ if func_results:
497
+ record = func_results[0]
498
+ stats["functions"] = {
499
+ "total": record["total"],
500
+ "with_embedding": record["with_embedding"],
501
+ "with_docstring": record["with_docstring"],
502
+ }
503
+
504
+ # Class stats
505
+ class_results = self.connection.execute("""
506
+ MATCH (c:Class)
507
+ RETURN count(c) AS total,
508
+ count(c.embedding) AS with_embedding,
509
+ count(CASE WHEN c.docstring IS NOT NULL THEN 1 END) AS with_docstring
510
+ """)
511
+ if class_results:
512
+ record = class_results[0]
513
+ stats["classes"] = {
514
+ "total": record["total"],
515
+ "with_embedding": record["with_embedding"],
516
+ "with_docstring": record["with_docstring"],
517
+ }
518
+
519
+ # Community stats
520
+ community_results = self.connection.execute("""
521
+ MATCH (c:Community)
522
+ RETURN count(c) AS total,
523
+ count(c.embedding) AS with_embedding,
524
+ count(CASE WHEN c.description IS NOT NULL THEN 1 END) AS with_description
525
+ """)
526
+ if community_results:
527
+ record = community_results[0]
528
+ stats["communities"] = {
529
+ "total": record["total"],
530
+ "with_embedding": record["with_embedding"],
531
+ "with_description": record["with_description"],
532
+ }
533
+
534
+ return stats