cognee-community-vector-adapter-valkey 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee_community_vector_adapter_valkey/__init__.py +4 -0
- cognee_community_vector_adapter_valkey/exceptions.py +10 -0
- cognee_community_vector_adapter_valkey/register.py +5 -0
- cognee_community_vector_adapter_valkey/utils.py +180 -0
- cognee_community_vector_adapter_valkey/valkey_adapter.py +535 -0
- cognee_community_vector_adapter_valkey-0.1.1.dist-info/METADATA +160 -0
- cognee_community_vector_adapter_valkey-0.1.1.dist-info/RECORD +8 -0
- cognee_community_vector_adapter_valkey-0.1.1.dist-info/WHEEL +4 -0
cognee_community_vector_adapter_valkey/utils.py
@@ -0,0 +1,180 @@
from __future__ import annotations

import json
import struct
from functools import singledispatch
from typing import Any
from urllib.parse import urlparse
from uuid import UUID

from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult

"""
Internal helper functions. Not part of the public API.
"""


def _parse_host_port(url: str) -> tuple[str, int]:
    """
    Parse a URL and extract the host and port.

    Args:
        url (str): The connection URL, e.g., "valkey://localhost:6379".

    Returns:
        tuple[str, int]: A tuple containing:
            - host (str): The hostname from the URL, defaults to "localhost" if missing.
            - port (int): The port number from the URL, defaults to 6379 if missing.
    """

    parsed = urlparse(url)
    host = parsed.hostname or "localhost"
    port = parsed.port or 6379
    return host, port
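

# Illustrative sanity check (not part of the shipped module), assuming standard
# urllib.parse semantics:
#   _parse_host_port("valkey://cache.internal:7000")  -> ("cache.internal", 7000)
#   _parse_host_port("valkey://localhost")            -> ("localhost", 6379)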


def _to_float32_bytes(vec) -> bytes:
    """
    Convert a sequence of numeric values into a bytes representation using 32-bit floats.

    Args:
        vec (Iterable[float]): A sequence of numbers (e.g., list, tuple) to be converted.

    Returns:
        bytes: A binary representation of the input values packed as consecutive 32-bit floats.

    Notes:
        - Uses `struct.pack` with the format string `f"{len(vec)}f"`, which packs all values as
          IEEE 754 single-precision floats.
        - Ensures compatibility with vector databases or embedding engines that require raw
          float32 byte arrays.
    """

    return struct.pack(f"{len(vec)}f", *map(float, vec))
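

# Illustrative only: the packing round-trips with struct.unpack, e.g.
#   struct.unpack("3f", _to_float32_bytes([0.1, 0.2, 0.3]))
# yields the values back at float32 precision (0.1 becomes ~0.100000001).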


@singledispatch
def _serialize_for_json(obj: Any) -> Any:
    """Convert objects to JSON-serializable format.

    This is the default serialization: return the object as-is.

    Args:
        obj: Object to serialize (UUID, dict, list, or any other type).

    Returns:
        JSON-serializable representation of the object.
    """
    return obj


@_serialize_for_json.register
def _(obj: UUID) -> str:
    return str(obj)


@_serialize_for_json.register
def _(obj: dict) -> dict:
    return {k: _serialize_for_json(v) for k, v in obj.items()}


@_serialize_for_json.register
def _(obj: list) -> list:
    return [_serialize_for_json(item) for item in obj]
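

# Illustrative only: nested containers are handled recursively, e.g.
#   _serialize_for_json({"id": UUID(int=1), "tags": ["a", "b"]})
#   -> {"id": "00000000-0000-0000-0000-000000000001", "tags": ["a", "b"]}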


def _b2s(x: bytes | bytearray | str) -> str:
    """Convert bytes or bytearray to a UTF-8 string if possible,
    otherwise return a string representation.

    Args:
        x (bytes | bytearray | str): The input value.

    Returns:
        str: A decoded UTF-8 string if `x` is bytes or bytearray; otherwise, `x` unchanged.
            If decoding fails, returns the string representation of `x`.
    """

    if isinstance(x, (bytes, bytearray)):
        try:
            return x.decode("utf-8")
        except Exception:
            return str(x)
    return x


def _build_scored_results_from_ft(
    raw: Any,
    *,
    use_key_suffix_when_missing_id: bool = True,
) -> list[ScoredResult]:
    """Build a list of `ScoredResult` objects from a raw FT (full-text) search response.

    Args:
        raw (Any): The raw response from Valkey's FT search command, expected to be a list or tuple
            where the second element is a mapping of keys to field dictionaries.
        use_key_suffix_when_missing_id (bool): If True, use the key string as the ID when the `id`
            field is missing in the response.

    Returns:
        list[ScoredResult]: A list of scored results, each containing:
            - id (str): Extracted from the `id` field, with the key as a fallback.
            - payload (dict): Parsed JSON from the `payload_data` field, or the raw string if malformed.
            - score (float | None): Extracted from the `__vector_score` field if present.

    Notes:
        - Handles both byte keys and string keys by decoding them.
        - Gracefully falls back when fields are missing or the payload is invalid JSON.
    """
    if not isinstance(raw, (list, tuple)) or len(raw) < 2 or not isinstance(raw[1], dict):
        return []

    mapping: dict[Any, dict[Any, Any]] = raw[1]  # the { key -> fields } dict
    scored: list[ScoredResult] = []

    for key_bytes, fields in mapping.items():
        key_str = _b2s(key_bytes)

        # Extract id
        raw_id = fields.get(b"id") if b"id" in fields else fields.get("id")
        if raw_id is not None:
            result_id = _b2s(raw_id)
        else:
            result_id = key_str

        # Extract score
        score = (
            fields.get(b"__vector_score")
            if b"__vector_score" in fields
            else fields.get("__vector_score")
        )
        if score is not None:
            score = float(score)

        # Extract and parse payload_data
        payload_raw = (
            fields.get(b"payload_data") if b"payload_data" in fields else fields.get("payload_data")
        )
        payload: dict[str, Any] = {}
        if payload_raw is not None:
            payload_str = _b2s(payload_raw)
            if isinstance(payload_str, str):
                try:
                    obj = json.loads(payload_str)
                    if isinstance(obj, dict):
                        payload = obj
                    else:
                        # If it's not a dict (e.g., a list), wrap it
                        payload = {"_payload": obj}
                except json.JSONDecodeError:
                    # Keep the raw string if malformed
                    payload = {"_payload_raw": payload_str}

        scored.append(
            ScoredResult(
                id=result_id,
                payload=payload,
                score=score,
            )
        )

    return scored
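

# Illustrative only: a raw response as consumed above looks roughly like
#   [1, {b"vdb:docs:<id>": {b"id": b"<id>",
#                           b"__vector_score": b"0.12",
#                           b"payload_data": b'{"text": "..."}'}}]
# i.e. a result count followed by a {key -> fields} mapping.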


cognee_community_vector_adapter_valkey/valkey_adapter.py
@@ -0,0 +1,535 @@
from __future__ import annotations

import asyncio
import json
from typing import Any

from cognee.infrastructure.databases.exceptions import MissingQueryParameterError
from cognee.infrastructure.databases.vector import VectorDBInterface
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import (
    EmbeddingEngine,
)
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
from cognee.infrastructure.engine import DataPoint
from cognee.shared.logging_utils import get_logger
from glide import (
    BackoffStrategy,
    GlideClient,
    GlideClientConfiguration,
    NodeAddress,
    ft,
    glide_json,
)
from glide_shared.commands.server_modules.ft_options.ft_create_options import (
    DataType,
    DistanceMetricType,
    FtCreateOptions,
    TagField,
    VectorAlgorithm,
    VectorField,
    VectorFieldAttributesHnsw,
    VectorType,
)
from glide_shared.commands.server_modules.ft_options.ft_search_options import (
    FtSearchOptions,
    ReturnField,
)
from glide_shared.exceptions import RequestError

from .exceptions import CollectionNotFoundError, ValkeyVectorEngineInitializationError
from .utils import (
    _build_scored_results_from_ft,
    _parse_host_port,
    _serialize_for_json,
    _to_float32_bytes,
)

logger = get_logger("ValkeyAdapter")


class ValkeyAdapter(VectorDBInterface):
    """Valkey vector database adapter using Valkey Glide for vector similarity search.

    This adapter provides an implementation of the `VectorDBInterface` for Valkey,
    enabling vector storage, retrieval, and similarity search using Valkey's
    full-text and vector indexing capabilities.
    """

    name = "Valkey"
    url: str | None
    api_key: str | None = None
    embedding_engine: EmbeddingEngine | None = None

    def __init__(
        self,
        url: str | None,
        api_key: str | None = None,
        database_name: str = "cognee",
        embedding_engine: EmbeddingEngine | None = None,
    ) -> None:
        """Initialize the Valkey adapter.

        Args:
            url (str): Connection string for your Valkey instance, e.g. valkey://localhost:6379.
            embedding_engine: Engine for generating embeddings.
            api_key: Optional API key. Ignored for Valkey.

        Raises:
            ValkeyVectorEngineInitializationError: If required parameters are missing.
        """

        if not embedding_engine:
            raise ValkeyVectorEngineInitializationError(
                "Embedding engine is required. Provide 'embedding_engine' to the Valkey adapter."
            )

        self.url = url
        self._host, self._port = _parse_host_port(url)
        self.database_name = database_name
        self.embedding_engine = embedding_engine
        self._client: GlideClient | None = None
        self._connected = False
        self.VECTOR_DB_LOCK = asyncio.Lock()

    # -------------------- lifecycle --------------------

    async def get_connection(self) -> GlideClient:
        """Establish and return an asynchronous Glide client connection to the Valkey server.

        If a connection already exists and is marked as active, it will be reused.
        Otherwise, a new connection is created using the configured host and port.

        Returns:
            GlideClient: An active Glide client instance for executing Valkey commands.

        Behavior:
            - Uses a backoff reconnect strategy with 3 retries and exponential delay.
            - Disables TLS by default (set `use_tls=True` in the configuration if needed).
            - Sets a request timeout of 5000 ms.
        """

        if self._connected and self._client is not None:
            return self._client

        cfg = GlideClientConfiguration(
            [NodeAddress(self._host, self._port)],
            use_tls=False,
            request_timeout=5000,
            reconnect_strategy=BackoffStrategy(num_of_retries=3, factor=1000, exponent_base=2),
        )
        self._client = await GlideClient.create(cfg)
        self._connected = True

        return self._client

    async def close(self) -> None:
        """Close the active Glide client connection to the Valkey server.

        If a client connection exists, attempts to close it gracefully.
        Any exceptions during closure are logged and suppressed to avoid breaking
        cleanup logic.

        After closing:
            - The internal client reference is set to None.
            - The connection state flag (`_connected`) is reset to False.

        Returns:
            None
        """

        if self._client is not None:
            try:
                await self._client.close()
            except Exception as e:
                logger.error("Failed to close Valkey client: %s", e)
        self._client = None
        self._connected = False

    # -------------------- helpers --------------------

    def _index_name(self, collection: str) -> str:
        return f"index:{collection}"

    def _key_prefix(self, collection: str) -> str:
        return f"vdb:{collection}:"

    def _key(self, collection: str, pid: str) -> str:
        return f"{self._key_prefix(collection)}{pid}"

    def _ensure_dims(self) -> int:
        dims = self.embedding_engine.get_dimensions()
        return int(dims)

    async def embed_data(self, data: list[str]) -> list[list[float]]:
        """Embed text data using the embedding engine.

        Args:
            data: List of text strings to embed.

        Returns:
            List of embedding vectors as lists of floats.

        Raises:
            Exception: If embedding generation fails.
        """
        return await self.embedding_engine.embed_text(data)

    # -------------------- VectorDBInterface methods --------------------

    async def has_collection(self, collection_name: str) -> bool:
        """Check if a collection (index) exists.

        Args:
            collection_name: Name of the collection to check.

        Returns:
            True if collection exists, False otherwise.
        """
        client = await self.get_connection()
        try:
            await ft.info(client, self._index_name(collection_name))
            return True
        except Exception as e:
            logger.warning("Valkey index check failed for '%s': %s", collection_name, e)
            return False

    async def create_collection(
        self,
        collection_name: str,
        payload_schema: Any | None = None,
    ) -> None:
        """Create a new collection (Valkey index) with vector search capabilities.

        Args:
            collection_name: Name of the collection to create.
            payload_schema: Schema for payload data (not used).

        Raises:
            Exception: If collection creation fails.
        """
        async with self.VECTOR_DB_LOCK:
            try:
                if await self.has_collection(collection_name):
                    logger.info(f"Collection {collection_name} already exists")
                    return

                fields = [
                    TagField("id"),
                    VectorField(
                        name="vector",
                        algorithm=VectorAlgorithm.HNSW,
                        attributes=VectorFieldAttributesHnsw(
                            dimensions=self.embedding_engine.get_vector_size(),
                            distance_metric=DistanceMetricType.COSINE,
                            type=VectorType.FLOAT32,
                        ),
                    ),
                ]
                prefixes = [self._key_prefix(collection_name)]
                options = FtCreateOptions(DataType.JSON, prefixes)
                index = self._index_name(collection_name)

                ok = await ft.create(self._client, index, fields, options)
                if ok not in (b"OK", "OK"):
                    raise Exception(f"FT.CREATE failed for index '{index}': {ok!r}")

            except Exception as e:
                logger.error(f"Error creating collection {collection_name}: {str(e)}")
                raise e

    async def create_data_points(
        self,
        collection_name: str,
        data_points: list[DataPoint],
    ) -> None:
        """Create data points in the collection.

        Args:
            collection_name: Name of the target collection.
            data_points: List of DataPoint objects to insert.

        Raises:
            CollectionNotFoundError: If the collection doesn't exist.
            Exception: If data point creation fails.
        """
        client = await self.get_connection()
        assert self._client is not None

        try:
            if not await self.has_collection(collection_name):
                raise CollectionNotFoundError(f"Collection {collection_name} not found!")

            # Embed the data points
            data_to_embed = [
                DataPoint.get_embeddable_data(data_point) for data_point in data_points
            ]
            data_vectors = await self.embed_data(data_to_embed)

            documents = []
            for data_point, embedding in zip(data_points, data_vectors, strict=False):
                payload = _serialize_for_json(data_point.model_dump())

                doc_data = {
                    "id": str(data_point.id),
                    "vector": embedding,
                    "payload_data": json.dumps(payload),  # Store as JSON string
                }
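
                # Illustrative note: each point becomes one JSON document whose "id"
                # and "vector" fields match the FT index schema created above, with
                # the full serialized payload kept as a string under "payload_data".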

                documents.append(
                    glide_json.set(
                        client,
                        self._key(collection_name, str(data_point.id)),
                        "$",
                        json.dumps(doc_data),
                    )
                )

            await asyncio.gather(*documents)

        except RequestError as e:
            # Helpful guidance if JSON vector arrays aren't supported by the deployed module
            logger.error(f"JSON.SET failed: {e}")
            raise e

        except Exception as e:
            logger.error(f"Error creating data points: {str(e)}")
            raise e

    # TODO: Add this and fix issues
    # async def create_vector_index(self, index_name: str, index_property_name: str):
    #     await self.create_collection(f"{index_name}_{index_property_name}")
    #
    # async def index_data_points(
    #     self, index_name: str, index_property_name: str, data_points: List[DataPoint]
    # ):
    #     """Index data points in the collection."""
    #
    #     await self.create_data_points(f"{index_name}_{index_property_name}", data_points)

    async def retrieve(
        self,
        collection_name: str,
        data_point_ids: list[str],
    ) -> list[dict[str, Any]]:
        """Retrieve data points by their IDs.

        Args:
            collection_name: Name of the collection to retrieve from.
            data_point_ids: List of data point IDs to retrieve.

        Returns:
            List of retrieved data point payloads.
        """
        client = await self.get_connection()
        assert self._client is not None

        try:
            results = []
            for data_id in data_point_ids:
                key = self._key(collection_name, data_id)
                raw_doc = await glide_json.get(client, key, "$")
                if raw_doc:
                    doc = json.loads(raw_doc)
                    payload_str = doc[0]["payload_data"]
                    try:
                        payload = json.loads(payload_str)
                        results.append(payload)
                    except json.JSONDecodeError:
                        # Fallback to the document itself if payload parsing fails
                        results.append(raw_doc)

            return results

        except Exception as e:
            logger.error(f"Error retrieving data points: {str(e)}")
            return []

    async def search(
        self,
        collection_name: str,
        query_text: str | None = None,
        query_vector: list[float] | None = None,
        limit: int | None = 15,
        with_vector: bool = False,
        include_payload: bool = False,
    ) -> list[ScoredResult]:
        """Search for similar vectors in the collection.

        Args:
            collection_name: Name of the collection to search.
            query_text: Text query to search for (will be embedded).
            query_vector: Pre-computed query vector.
            limit: Maximum number of results to return.
            with_vector: Whether to include vectors in results.
            include_payload: Whether to include payloads in results.

        Returns:
            List of ScoredResult objects sorted by similarity.

        Raises:
            MissingQueryParameterError: If neither query_text nor query_vector is provided.
            Exception: If search execution fails.
        """
        client = await self.get_connection()
        assert self._client is not None

        if query_text is None and query_vector is None:
            raise MissingQueryParameterError()

        if not await self.has_collection(collection_name):
            logger.warning(
                f"Collection '{collection_name}' not found in ValkeyAdapter.search; returning []."
            )
            return []

        if limit is None:
            info = await ft.info(client, self._index_name(collection_name))
            limit = info["num_docs"]

        if limit <= 0:
            return []

        try:
            # Get the query vector
            if query_vector is None:
                [vec] = await self.embed_data([query_text])
            else:
                vec = query_vector
            vec_bytes = _to_float32_bytes(vec)

            # Set return fields
            return_fields = [
                ReturnField("$.id", alias="id"),
                ReturnField("__vector_score", alias="score"),
            ]
            if include_payload:
                return_fields.append(ReturnField("$.payload_data", alias="payload_data"))
            if with_vector:
                return_fields.append(ReturnField("$.vector", alias="vector"))

            vector_param_name = "query_vector"
            query = f"*=>[KNN {limit} @vector ${vector_param_name}]"
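            # Illustrative note: "*=>[KNN k @vector $param]" (as built above) requests
            # the k nearest neighbors of the bound query vector; each hit reports its
            # distance in "__vector_score", which _build_scored_results_from_ft parses.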
            query_options = FtSearchOptions(
                params={vector_param_name: vec_bytes}, return_fields=return_fields
            )

            # Execute the search
            raw_results = await ft.search(
                client=client,
                index_name=self._index_name(collection_name),
                query=query,
                options=query_options,
            )

            scored_results = _build_scored_results_from_ft(raw_results)
            return scored_results

        except Exception as e:
            logger.error(f"Error during search: {str(e)}")
            raise e

    async def batch_search(
        self,
        collection_name: str,
        query_texts: list[str],
        limit: int | None,
        with_vectors: bool = False,
        score_threshold: float | None = 0.1,
        max_concurrency: int = 10,
        include_payload: bool = False,
    ) -> list[list[ScoredResult]]:
        """Perform batch search for multiple queries.

        Args:
            collection_name: Name of the collection to search.
            query_texts: List of text queries to search for.
            limit: Maximum number of results per query.
            with_vectors: Whether to include vectors in results.
            score_threshold: Threshold for filtering scores.
            max_concurrency: Maximum number of concurrent searches.
            include_payload: Whether to include payloads in results.

        Returns:
            List of search results for each query, filtered by the score threshold.
        """
        if not await self.has_collection(collection_name):
            logger.warning(
                f"Collection '{collection_name}' not found in ValkeyAdapter.batch_search; returning []."
            )
            return []

        # Embed all queries at once
        vectors = await self.embed_data(query_texts)

        # Execute searches in parallel
        semaphore = asyncio.Semaphore(max_concurrency)

        async def limited_search(vector):
            async with semaphore:
                return await self.search(
                    collection_name=collection_name,
                    query_vector=vector,
                    limit=limit,
                    with_vector=with_vectors,
                    include_payload=include_payload,
                )

        tasks = [limited_search(vector) for vector in vectors]
        results = await asyncio.gather(*tasks)

        # Keep results under the distance threshold (with cosine distance, lower means more similar)
        return [
            [result for result in result_group if result.score < score_threshold]
            for result_group in results
        ]

    async def delete_data_points(
        self,
        collection_name: str,
        data_point_ids: list[str],
    ) -> dict[str, int]:
        """Delete data points by their IDs.

        Args:
            collection_name: Name of the collection to delete from.
            data_point_ids: List of data point IDs to delete.

        Returns:
            Dictionary containing the number of deleted documents.

        Raises:
            Exception: If deletion fails.
        """
        client = await self.get_connection()
        assert self._client is not None

        ids = [self._key(collection_name, data_point_id) for data_point_id in data_point_ids]

        try:
            deleted_count = await client.delete(ids)
            logger.info(f"Deleted {deleted_count} data points from collection {collection_name}")
            return {"deleted": deleted_count}
        except Exception as e:
            logger.error(f"Error deleting data points: {str(e)}")
            raise e

    async def prune(self):
        """Remove all collections from Valkey.

        This method drops all existing search indices.

        Raises:
            Exception: If pruning fails.
        """
        client = await self.get_connection()
        assert self._client is not None
        try:
            all_indexes = await ft.list(client)
            for index in all_indexes:
                await ft.dropindex(client, index)
                logger.info(f"Dropped index {index}")

        except Exception as e:
            logger.error(f"Error during prune: {str(e)}")
            raise e

cognee_community_vector_adapter_valkey-0.1.1.dist-info/METADATA
@@ -0,0 +1,160 @@
Metadata-Version: 2.4
Name: cognee-community-vector-adapter-valkey
Version: 0.1.1
Summary: Valkey vector database adapter for cognee
Requires-Python: <=3.13,>=3.11
Requires-Dist: cognee==0.5.2
Requires-Dist: numpy>=1.24.0
Requires-Dist: valkey-glide>=2.1.0
Provides-Extra: dev
Requires-Dist: anyio>=4.0; extra == 'dev'
Requires-Dist: mypy>=1.17.1; extra == 'dev'
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
Requires-Dist: pytest>=7.4; extra == 'dev'
Provides-Extra: test
Requires-Dist: pytest-asyncio>=0.23; extra == 'test'
Requires-Dist: pytest>=7.4; extra == 'test'
Requires-Dist: valkey-glide>=2.1.0; extra == 'test'
Description-Content-Type: text/markdown

# Cognee Valkey Vector Adapter

A Valkey vector database adapter for Cognee using Valkey Glide, providing high-performance vector storage and retrieval for AI memory applications. Compared to the Redis adapter, Valkey offers a fully open-source, community-driven architecture without the licensing restrictions of Redis. Valkey Glide provides efficient async operations and native support for Valkey's enhancements, making this adapter a strong choice for teams adopting Valkey as their primary in-memory vector store.

## Features

- Full support for vector embeddings storage and retrieval
- Batch / pipeline operations for efficient processing
- Automatic embedding generation via configurable embedding engines
- Comprehensive error handling

## Installation

Once published, the package can be installed via pip:

```bash
pip install cognee-community-vector-adapter-valkey
```

If it is not published yet, you can use uv to build and install the adapter package locally:

```bash
pip install uv
uv sync --all-extras
```

## Prerequisites

You need a Valkey instance with the Valkey Search module enabled. You can use:

1. **Valkey**:
```bash
docker run -d --name valkey -p 6379:6379 valkey/valkey-bundle
```

## Examples

Check out the `examples/` folder!

```bash
uv run examples/example.py
```

> You will need an OpenAI API key to run the example script.

## Configuration

Configure Valkey as your vector database in cognee:

- `vector_db_provider`: Set to "valkey"
- `vector_db_url`: Valkey connection URL (e.g., "valkey://localhost:6379")

### Environment Variables

Set the following environment variables or pass them directly in the config:

```bash
export VECTOR_DB_URL="valkey://localhost:6379"
```

### Connection URL Examples

```python
# Local Valkey
config.set_vector_db_config({
    "vector_db_provider": "valkey",
    "vector_db_url": "valkey://localhost:6379"
})

# Valkey with authentication
config.set_vector_db_config({
    "vector_db_provider": "valkey",
    "vector_db_url": "valkey://user:password@localhost:6379"
})
```

## Requirements

- Python >= 3.11, <= 3.13
- valkey-glide >= 2.1.0
- cognee == 0.5.2

## Advanced Usage

For direct adapter usage (advanced users only):

```python
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import EmbeddingEngine
from cognee_community_vector_adapter_valkey import ValkeyAdapter
from cognee.infrastructure.engine import DataPoint

# Initialize embedding engine and adapter
embedding_engine = EmbeddingEngine(model="your-model")
valkey_adapter = ValkeyAdapter(
    url="valkey://localhost:6379",
    embedding_engine=embedding_engine
)

# Direct adapter operations
await valkey_adapter.create_collection("my_collection")
data_points = [DataPoint(id="1", text="Hello", metadata={"index_fields": ["text"]})]
await valkey_adapter.create_data_points("my_collection", data_points)
results = await valkey_adapter.search("my_collection", query_text="Hello", limit=10)
```
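
When you are done, you can also clean up through the adapter; this is a sketch using only methods shown in the diff above (the collection and id are the illustrative ones from the snippet):

```python
# Remove the illustrative data point, then release the Glide connection
await valkey_adapter.delete_data_points("my_collection", ["1"])
await valkey_adapter.close()
```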

## Error Handling

The adapter includes comprehensive error handling:

- `ValkeyVectorEngineInitializationError`: Raised when required parameters (such as the embedding engine) are missing
- `CollectionNotFoundError`: Raised when attempting operations on non-existent collections
- `MissingQueryParameterError`: Raised when neither a query text nor a query vector is provided
- Graceful handling of connection failures and embedding errors
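
For example, a minimal guard around a missing collection might look like this (a sketch; `CollectionNotFoundError` is defined in the package's `exceptions` module, as shown in the diff above):

```python
from cognee_community_vector_adapter_valkey.exceptions import CollectionNotFoundError

try:
    await valkey_adapter.create_data_points("my_collection", data_points)
except CollectionNotFoundError:
    # The index does not exist yet: create it, then retry the insert
    await valkey_adapter.create_collection("my_collection")
    await valkey_adapter.create_data_points("my_collection", data_points)
```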


## Troubleshooting

### Common Issues

1. **Connection Errors**: Ensure Valkey is running and accessible at the specified URL (see the quick check after this list)
2. **Search Module Missing**: Make sure Valkey has the Search module enabled
3. **Embedding Dimension Mismatch**: Verify embedding engine dimensions match index configuration
4. **Collection Not Found**: Always create collections before adding data points
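
A quick way to check the first two items is to go through the adapter itself; this sketch uses only methods shown in the diff above (the collection name is illustrative):

```python
# Opens (or reuses) the Glide connection; raises if Valkey is unreachable
await valkey_adapter.get_connection()

# False here usually means the index was never created or the Search module is missing
print(await valkey_adapter.has_collection("my_collection"))
```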

### Debug Logging

The adapter uses Cognee's logging system. Enable debug logging to see detailed operation logs:

```python
import logging
logging.getLogger("ValkeyAdapter").setLevel(logging.DEBUG)
```

## Development

To contribute or modify the adapter:

1. Clone the repository and `cd` into the `valkey` folder
2. Install dependencies: `uv sync --all-extras`
3. Make sure a Valkey instance is running (see above)
4. Make your changes, test, and submit a PR

cognee_community_vector_adapter_valkey-0.1.1.dist-info/RECORD
@@ -0,0 +1,8 @@
cognee_community_vector_adapter_valkey/__init__.py,sha256=jEC7tJHqvrLazHCBOU22molo1p_FkjMjUItxEEoxTZA,226
cognee_community_vector_adapter_valkey/exceptions.py,sha256=mMxiVP0eOTwrrwtPVPWJFKv7Ur4Oky2_6qAeaEeB5g0,247
cognee_community_vector_adapter_valkey/register.py,sha256=Eh4lgm6TISLw3dfq2RuNpcMubvo9Kf5-wRvJ6VxClWU,158
cognee_community_vector_adapter_valkey/utils.py,sha256=dFiXNsd8WaX3gDf5o25Dz6lS0FtqnACYyOGSdIg0Lx4,5664
cognee_community_vector_adapter_valkey/valkey_adapter.py,sha256=BAzMXboHX7nRdHCHENlrJ29sP7gKWs1qUo3psPzqyhk,18481
cognee_community_vector_adapter_valkey-0.1.1.dist-info/METADATA,sha256=vRe5w9-vXTF0ZsLBTQE0yCwbEI91ELapKpCtyO6nxUY,4975
cognee_community_vector_adapter_valkey-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
cognee_community_vector_adapter_valkey-0.1.1.dist-info/RECORD,,