code-graph-builder 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_graph_builder/__init__.py +82 -0
- code_graph_builder/builder.py +366 -0
- code_graph_builder/cgb_cli.py +32 -0
- code_graph_builder/cli.py +564 -0
- code_graph_builder/commands_cli.py +1288 -0
- code_graph_builder/config.py +340 -0
- code_graph_builder/constants.py +708 -0
- code_graph_builder/embeddings/__init__.py +40 -0
- code_graph_builder/embeddings/qwen3_embedder.py +573 -0
- code_graph_builder/embeddings/vector_store.py +584 -0
- code_graph_builder/examples/__init__.py +0 -0
- code_graph_builder/examples/example_configuration.py +276 -0
- code_graph_builder/examples/example_kuzu_usage.py +109 -0
- code_graph_builder/examples/example_semantic_search_full.py +347 -0
- code_graph_builder/examples/generate_wiki.py +915 -0
- code_graph_builder/examples/graph_export_example.py +100 -0
- code_graph_builder/examples/rag_example.py +206 -0
- code_graph_builder/examples/test_cli_demo.py +129 -0
- code_graph_builder/examples/test_embedding_api.py +153 -0
- code_graph_builder/examples/test_kuzu_local.py +190 -0
- code_graph_builder/examples/test_rag_redis.py +390 -0
- code_graph_builder/graph_updater.py +605 -0
- code_graph_builder/guidance/__init__.py +1 -0
- code_graph_builder/guidance/agent.py +123 -0
- code_graph_builder/guidance/prompts.py +74 -0
- code_graph_builder/guidance/toolset.py +264 -0
- code_graph_builder/language_spec.py +536 -0
- code_graph_builder/mcp/__init__.py +21 -0
- code_graph_builder/mcp/api_doc_generator.py +764 -0
- code_graph_builder/mcp/file_editor.py +207 -0
- code_graph_builder/mcp/pipeline.py +777 -0
- code_graph_builder/mcp/server.py +161 -0
- code_graph_builder/mcp/tools.py +1800 -0
- code_graph_builder/models.py +115 -0
- code_graph_builder/parser_loader.py +344 -0
- code_graph_builder/parsers/__init__.py +7 -0
- code_graph_builder/parsers/call_processor.py +306 -0
- code_graph_builder/parsers/call_resolver.py +139 -0
- code_graph_builder/parsers/definition_processor.py +796 -0
- code_graph_builder/parsers/factory.py +119 -0
- code_graph_builder/parsers/import_processor.py +293 -0
- code_graph_builder/parsers/structure_processor.py +145 -0
- code_graph_builder/parsers/type_inference.py +143 -0
- code_graph_builder/parsers/utils.py +134 -0
- code_graph_builder/rag/__init__.py +68 -0
- code_graph_builder/rag/camel_agent.py +429 -0
- code_graph_builder/rag/client.py +298 -0
- code_graph_builder/rag/config.py +239 -0
- code_graph_builder/rag/cypher_generator.py +67 -0
- code_graph_builder/rag/llm_backend.py +210 -0
- code_graph_builder/rag/markdown_generator.py +352 -0
- code_graph_builder/rag/prompt_templates.py +440 -0
- code_graph_builder/rag/rag_engine.py +640 -0
- code_graph_builder/rag/review_report.md +172 -0
- code_graph_builder/rag/tests/__init__.py +3 -0
- code_graph_builder/rag/tests/test_camel_agent.py +313 -0
- code_graph_builder/rag/tests/test_client.py +221 -0
- code_graph_builder/rag/tests/test_config.py +177 -0
- code_graph_builder/rag/tests/test_markdown_generator.py +240 -0
- code_graph_builder/rag/tests/test_prompt_templates.py +160 -0
- code_graph_builder/services/__init__.py +39 -0
- code_graph_builder/services/graph_service.py +465 -0
- code_graph_builder/services/kuzu_service.py +665 -0
- code_graph_builder/services/memory_service.py +171 -0
- code_graph_builder/settings.py +75 -0
- code_graph_builder/tests/ACCEPTANCE_CRITERIA_PHASE2.md +401 -0
- code_graph_builder/tests/__init__.py +1 -0
- code_graph_builder/tests/run_acceptance_check.py +378 -0
- code_graph_builder/tests/test_api_find.py +231 -0
- code_graph_builder/tests/test_api_find_integration.py +226 -0
- code_graph_builder/tests/test_basic.py +78 -0
- code_graph_builder/tests/test_c_api_extraction.py +388 -0
- code_graph_builder/tests/test_call_resolution_scenarios.py +504 -0
- code_graph_builder/tests/test_embedder.py +411 -0
- code_graph_builder/tests/test_integration_semantic.py +434 -0
- code_graph_builder/tests/test_mcp_protocol.py +298 -0
- code_graph_builder/tests/test_mcp_user_flow.py +190 -0
- code_graph_builder/tests/test_rag.py +404 -0
- code_graph_builder/tests/test_settings.py +135 -0
- code_graph_builder/tests/test_step1_graph_build.py +264 -0
- code_graph_builder/tests/test_step2_api_docs.py +323 -0
- code_graph_builder/tests/test_step3_embedding.py +278 -0
- code_graph_builder/tests/test_vector_store.py +552 -0
- code_graph_builder/tools/__init__.py +40 -0
- code_graph_builder/tools/graph_query.py +495 -0
- code_graph_builder/tools/semantic_search.py +387 -0
- code_graph_builder/types.py +333 -0
- code_graph_builder/utils/__init__.py +0 -0
- code_graph_builder/utils/path_utils.py +30 -0
- code_graph_builder-0.2.0.dist-info/METADATA +321 -0
- code_graph_builder-0.2.0.dist-info/RECORD +93 -0
- code_graph_builder-0.2.0.dist-info/WHEEL +4 -0
- code_graph_builder-0.2.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,584 @@
|
|
|
1
|
+
"""Vector store for code embeddings.
|
|
2
|
+
|
|
3
|
+
This module provides abstract base class and implementations for storing
|
|
4
|
+
and searching code embeddings.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
from loguru import logger
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from ..types import PropertyDict
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector
        b: Second vector

    Returns:
        Cosine similarity, clamped to the closed interval [-1.0, 1.0]

    Raises:
        ValueError: If vectors have different lengths or are zero vectors
    """
    import math

    if len(a) != len(b):
        raise ValueError(f"Vectors have different lengths: {len(a)} vs {len(b)}")

    dot_product = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))

    if norm_a == 0 or norm_b == 0:
        raise ValueError("Cannot compute cosine similarity for zero vectors")

    similarity = dot_product / (norm_a * norm_b)

    # Floating-point rounding can push the quotient slightly outside the
    # mathematical range (e.g. 1.0000000000000002 for identical vectors).
    # Explicit comparisons are used instead of min()/max() so that a NaN
    # result is passed through unchanged rather than turned into a score.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class VectorRecord:
    """One stored embedding together with the data that identifies it.

    Attributes:
        node_id: Identifier of the node the embedding belongs to
        qualified_name: Fully qualified name of the code entity
        embedding: The embedding vector itself
        metadata: Extra scalar key/value pairs attached to the record;
            each instance gets its own empty dict when none is supplied
    """

    node_id: int
    qualified_name: str
    embedding: list[float]
    metadata: dict[str, str | int | float | None] = field(default_factory=dict)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class SearchResult:
    """A single hit returned by a vector similarity search.

    Attributes:
        node_id: Identifier of the matching node
        score: Similarity score reported by the store (higher is better)
        qualified_name: Fully qualified name of the matching entity
    """

    node_id: int
    score: float
    qualified_name: str
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class VectorStore(ABC):
    """Interface that every embedding store backend must implement."""

    @abstractmethod
    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Persist one embedding vector.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata
            **kwargs: Extra keyword arguments; their handling is left to
                the concrete implementation
        """

    @abstractmethod
    def store_embeddings_batch(
        self,
        records: list[VectorRecord],
    ) -> None:
        """Persist several embeddings in one call.

        Args:
            records: Vector records to store
        """

    @abstractmethod
    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Find the stored embeddings most similar to a query vector.

        Args:
            query_embedding: Query embedding vector
            top_k: Number of results to return
            filter_metadata: Optional metadata filter

        Returns:
            List of search results
        """

    @abstractmethod
    def delete_by_node_id(self, node_id: int) -> bool:
        """Remove the embedding stored for a node.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """

    @abstractmethod
    def clear(self) -> None:
        """Remove every embedding from the store."""

    @abstractmethod
    def get_stats(self) -> dict[str, int]:
        """Report statistics about the store.

        Returns:
            Dictionary with statistics (count, dimension, etc.)
        """
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class MemoryVectorStore(VectorStore):
    """In-memory vector store implementation.

    Uses cosine similarity for search. Suitable for testing and
    small datasets.

    Args:
        dimension: Expected embedding dimension
    """

    def __init__(self, dimension: int = 1024):
        self.dimension = dimension
        # Records keyed by node_id; storing an existing id overwrites it.
        self._records: dict[int, VectorRecord] = {}

    def __len__(self) -> int:
        """Return the number of stored embeddings."""
        return len(self._records)

    @staticmethod
    def _merge_metadata(target: dict[str, str | int | float | None], source) -> None:
        """Copy supported values from ``source`` into ``target``.

        Booleans are stored as ``0``/``1``, lists as their ``str()`` form,
        and str/int/float/None unchanged. Values of any other type are
        skipped.

        Args:
            target: Dictionary that receives the normalised values
            source: Mapping of raw metadata values
        """
        for key, value in source.items():
            # bool is a subclass of int, so it has to be tested before the
            # generic scalar check or the conversion below is never reached.
            if isinstance(value, bool):
                target[key] = int(value)
            elif isinstance(value, (str, int, float, type(None))):
                target[key] = value
            elif isinstance(value, list):
                target[key] = str(value)

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Calculate cosine similarity between two vectors.

        Unlike the module-level ``cosine_similarity``, this never raises:
        a ``ValueError`` from it (length mismatch or zero vector) is
        reported as a similarity of ``0.0``.

        Args:
            a: First vector
            b: Second vector

        Returns:
            Cosine similarity, or 0.0 when it cannot be computed
        """
        try:
            return cosine_similarity(a, b)
        except ValueError:
            return 0.0

    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Store an embedding vector in memory.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata dictionary
            **kwargs: Additional keyword arguments (stored as metadata;
                they override ``metadata`` entries with the same key)

        Raises:
            ValueError: If embedding dimension doesn't match or embedding is empty
        """
        if not embedding:
            raise ValueError("Embedding cannot be empty")

        if len(embedding) != self.dimension:
            raise ValueError(
                f"Embedding dimension mismatch: expected {self.dimension}, got {len(embedding)}"
            )

        meta: dict[str, str | int | float | None] = {}
        if metadata:
            self._merge_metadata(meta, metadata)

        # Store additional kwargs as metadata
        self._merge_metadata(meta, kwargs)

        self._records[node_id] = VectorRecord(
            node_id=node_id,
            qualified_name=qualified_name,
            embedding=embedding,
            metadata=meta,
        )

    def store_embeddings_batch(
        self,
        records: list[VectorRecord],
    ) -> None:
        """Store multiple embeddings in batch.

        Records are stored exactly as given; no dimension check or
        metadata normalisation is applied here.

        Args:
            records: List of vector records to store
        """
        for record in records:
            self._records[record.node_id] = record

    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Search for similar embeddings using cosine similarity.

        Args:
            query_embedding: Query embedding vector
            top_k: Number of results to return
            filter_metadata: Optional metadata filter; a record matches
                only if every key compares equal to the stored value

        Returns:
            Up to ``top_k`` results ordered by descending score, with
            scores rounded to 4 decimal places

        Raises:
            ValueError: If ``top_k`` is negative
        """
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")

        if not self._records:
            return []

        scores: list[tuple[int, float, str]] = []

        for node_id, record in self._records.items():
            if filter_metadata:
                match = all(
                    record.metadata.get(k) == v for k, v in filter_metadata.items()
                )
                if not match:
                    continue

            similarity = self._cosine_similarity(query_embedding, record.embedding)
            scores.append((node_id, similarity, record.qualified_name))

        scores.sort(key=lambda x: x[1], reverse=True)

        return [
            SearchResult(
                node_id=node_id,
                score=round(score, 4),
                qualified_name=qn,
            )
            for node_id, score, qn in scores[:top_k]
        ]

    def delete_by_node_id(self, node_id: int) -> bool:
        """Delete an embedding by node ID.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """
        if node_id in self._records:
            del self._records[node_id]
            return True
        return False

    # Alias for compatibility with tests
    delete_embedding = delete_by_node_id

    def get_embedding(self, node_id: int) -> "VectorRecord | None":
        """Get an embedding record by node ID.

        Args:
            node_id: Node identifier

        Returns:
            VectorRecord if found, None otherwise
        """
        return self._records.get(node_id)

    def clear(self) -> None:
        """Clear all embeddings."""
        self._records.clear()

    def get_stats(self) -> dict[str, int]:
        """Get store statistics.

        Returns:
            Dictionary with the record ``count`` and configured ``dimension``
        """
        return {
            "count": len(self._records),
            "dimension": self.dimension,
        }

    def get_all_records(self) -> list[VectorRecord]:
        """Get all records (for testing/debugging).

        Returns:
            List of all vector records
        """
        return list(self._records.values())
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
class QdrantVectorStore(VectorStore):
    """Qdrant-based vector store implementation.

    Requires qdrant-client to be installed. The client is created lazily
    on first use: ``db_path`` selects local on-disk storage, otherwise
    ``host`` selects a server, otherwise an in-memory instance is used.

    Args:
        collection_name: Name of the Qdrant collection
        dimension: Embedding dimension
        db_path: Path for local Qdrant storage (optional)
        host: Qdrant server host (if not using local)
        port: Qdrant server port
    """

    def __init__(
        self,
        collection_name: str = "code_embeddings",
        dimension: int = 1024,
        db_path: str | Path | None = None,
        host: str | None = None,
        port: int = 6333,
    ):
        self.collection_name = collection_name
        self.dimension = dimension
        self.db_path = Path(db_path) if db_path else None
        self.host = host
        self.port = port

        self._client: "QdrantClient | None" = None
        self._initialized = False

    def _lazy_init(self) -> None:
        """Create the client if needed and make sure the collection exists.

        Raises:
            RuntimeError: If qdrant-client is not installed
        """
        if self._initialized:
            return

        try:
            from qdrant_client import QdrantClient
            from qdrant_client.models import Distance, VectorParams

            # Reuse a client that is already open. clear() resets
            # _initialized to get the collection recreated; opening a second
            # client at that point would fail for on-disk storage, whose
            # folder is still held by the first client.
            if self._client is None:
                if self.db_path:
                    self._client = QdrantClient(path=str(self.db_path))
                elif self.host:
                    self._client = QdrantClient(host=self.host, port=self.port)
                else:
                    self._client = QdrantClient(location=":memory:")

            if not self._client.collection_exists(self.collection_name):
                self._client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.dimension,
                        distance=Distance.COSINE,
                    ),
                )
                logger.info(f"Created Qdrant collection: {self.collection_name}")

            self._initialized = True

        except ImportError as e:
            logger.error(f"Failed to import qdrant-client: {e}")
            raise RuntimeError(
                "qdrant-client required for QdrantVectorStore. "
                "Install with: pip install qdrant-client"
            ) from e
        except Exception as e:
            logger.error(f"Failed to initialize Qdrant: {e}")
            raise

    def _get_client(self) -> "QdrantClient":
        """Return the initialised client, creating it on first use.

        Raises:
            RuntimeError: If no client is available after initialisation
        """
        self._lazy_init()
        if self._client is None:
            raise RuntimeError("Qdrant client is not initialized")
        return self._client

    @staticmethod
    def _merge_payload(target: dict[str, str | int | float | None], source) -> None:
        """Copy supported values from ``source`` into ``target``.

        Booleans are stored as ``0``/``1``, lists as their ``str()`` form,
        and str/int/float/None unchanged. Values of any other type are
        skipped.

        Args:
            target: Payload dictionary that receives the normalised values
            source: Mapping of raw metadata values
        """
        for key, value in source.items():
            # bool is a subclass of int, so it has to be tested before the
            # generic scalar check or the conversion below is never reached.
            if isinstance(value, bool):
                target[key] = int(value)
            elif isinstance(value, (str, int, float, type(None))):
                target[key] = value
            elif isinstance(value, list):
                target[key] = str(value)

    def store_embedding(
        self,
        node_id: int,
        qualified_name: str,
        embedding: list[float],
        metadata: PropertyDict | None = None,
        **kwargs,
    ) -> None:
        """Store an embedding vector in Qdrant.

        The point id is ``node_id``; the payload holds ``node_id``,
        ``qualified_name`` and the normalised metadata.

        Args:
            node_id: Unique node identifier
            qualified_name: Fully qualified name of the code entity
            embedding: Embedding vector
            metadata: Additional metadata dictionary
            **kwargs: Additional keyword arguments (stored as metadata)
        """
        client = self._get_client()

        from qdrant_client.models import PointStruct

        payload: dict[str, str | int | float | None] = {
            "node_id": node_id,
            "qualified_name": qualified_name,
        }
        if metadata:
            self._merge_payload(payload, metadata)

        # Store additional kwargs as metadata
        self._merge_payload(payload, kwargs)

        client.upsert(
            collection_name=self.collection_name,
            points=[
                PointStruct(
                    id=node_id,
                    vector=embedding,
                    payload=payload,
                )
            ],
        )

    def store_embeddings_batch(
        self,
        records: list[VectorRecord],
    ) -> None:
        """Store multiple embeddings in batch.

        Args:
            records: List of vector records to store; nothing is sent
                when the list is empty
        """
        client = self._get_client()

        from qdrant_client.models import PointStruct

        points = [
            PointStruct(
                id=record.node_id,
                vector=record.embedding,
                # Record metadata is applied last, as before, so it takes
                # precedence over the two identifying keys.
                payload={
                    "node_id": record.node_id,
                    "qualified_name": record.qualified_name,
                    **record.metadata,
                },
            )
            for record in records
        ]

        if points:
            client.upsert(
                collection_name=self.collection_name,
                points=points,
            )

    def search_similar(
        self,
        query_embedding: list[float],
        top_k: int = 5,
        filter_metadata: PropertyDict | None = None,
    ) -> list[SearchResult]:
        """Search for similar embeddings in Qdrant.

        Args:
            query_embedding: Query embedding vector
            top_k: Number of results to return
            filter_metadata: Optional metadata filter; only ``str`` and
                ``int`` values are turned into match conditions

        Returns:
            List of search results

        Raises:
            ValueError: If ``top_k`` is negative
        """
        # Same contract as MemoryVectorStore: reject negative limits and
        # return nothing for a limit of zero.
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        if top_k == 0:
            return []

        client = self._get_client()

        from qdrant_client.models import Filter, FieldCondition, MatchValue

        search_filter = None
        if filter_metadata:
            conditions = []
            for k, v in filter_metadata.items():
                if isinstance(v, (str, int)):
                    conditions.append(
                        FieldCondition(key=k, match=MatchValue(value=v))
                    )
                else:
                    # The condition is still skipped, but no longer
                    # silently: the search is broader than requested.
                    logger.warning(
                        f"Ignoring unsupported filter value for key {k!r}: {type(v).__name__}"
                    )
            if conditions:
                search_filter = Filter(must=conditions)

        results = client.query_points(
            collection_name=self.collection_name,
            query=query_embedding,
            limit=top_k,
            query_filter=search_filter,
        )

        return [
            SearchResult(
                # Points written by this class use node_id as the point id,
                # so fall back to it when the payload lacks the key.
                node_id=hit.payload.get("node_id", hit.id),
                score=hit.score,
                qualified_name=str(hit.payload.get("qualified_name", "")),
            )
            for hit in results.points
            if hit.payload is not None
        ]

    def delete_by_node_id(self, node_id: int) -> bool:
        """Delete an embedding by node ID.

        Args:
            node_id: Node identifier to delete

        Returns:
            True if deleted, False if not found
        """
        client = self._get_client()

        # The delete result does not say whether the point existed, so
        # look it up first to honour the "False if not found" contract.
        existing = client.retrieve(
            collection_name=self.collection_name,
            ids=[node_id],
            with_payload=False,
            with_vectors=False,
        )
        if not existing:
            return False

        result = client.delete(
            collection_name=self.collection_name,
            points_selector=[node_id],
        )

        return result.operation_id is not None

    def clear(self) -> None:
        """Clear all embeddings by dropping and recreating the collection."""
        client = self._get_client()

        client.delete_collection(self.collection_name)
        # Force _lazy_init to recreate the collection; it keeps using the
        # client that is already open.
        self._initialized = False
        self._lazy_init()

    def get_stats(self) -> dict[str, int]:
        """Get store statistics.

        Returns:
            Dictionary with the point ``count`` and configured ``dimension``
        """
        client = self._get_client()
        info = client.get_collection(self.collection_name)

        return {
            # points_count may be reported as None; keep the value an int.
            "count": info.points_count or 0,
            "dimension": self.dimension,
        }
|
|
553
|
+
|
|
554
|
+
|
|
555
|
+
def create_vector_store(
    backend: str = "memory",
    dimension: int = 1024,
    **kwargs: str | int | Path | None,
) -> VectorStore:
    """Factory function to create vector store.

    Args:
        backend: Backend type ("memory" or "qdrant")
        dimension: Embedding dimension
        **kwargs: Additional arguments for the "qdrant" backend:
            ``collection_name``, ``db_path``, ``host`` and ``port``.
            A value of ``None`` means "use the default". The "memory"
            backend does not use them.

    Returns:
        VectorStore instance

    Raises:
        ValueError: If backend is unknown
    """
    if backend == "memory":
        return MemoryVectorStore(dimension=dimension)

    if backend == "qdrant":
        # The signature allows None values, so an explicit None must fall
        # back to the default instead of becoming the collection name
        # "None" or crashing in int(None).
        collection_name = kwargs.get("collection_name")
        port = kwargs.get("port")
        return QdrantVectorStore(
            dimension=dimension,
            collection_name=(
                "code_embeddings" if collection_name is None else str(collection_name)
            ),
            db_path=kwargs.get("db_path"),
            host=kwargs.get("host"),
            port=6333 if port is None else int(port),
        )

    raise ValueError(f"Unknown vector store backend: {backend}")
|
|
File without changes
|