cognee-community-vector-adapter-weaviate 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee_community_vector_adapter_weaviate/__init__.py +3 -0
- cognee_community_vector_adapter_weaviate/register.py +5 -0
- cognee_community_vector_adapter_weaviate/weaviate_adapter.py +556 -0
- cognee_community_vector_adapter_weaviate-0.0.2.dist-info/METADATA +138 -0
- cognee_community_vector_adapter_weaviate-0.0.2.dist-info/RECORD +6 -0
- cognee_community_vector_adapter_weaviate-0.0.2.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
|
|
4
|
+
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_exponential
|
|
5
|
+
|
|
6
|
+
from cognee.shared.logging_utils import get_logger
|
|
7
|
+
from cognee.infrastructure.engine import DataPoint
|
|
8
|
+
from cognee.infrastructure.engine.utils import parse_id
|
|
9
|
+
from cognee.infrastructure.databases.exceptions import MissingQueryParameterError
|
|
10
|
+
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
|
|
11
|
+
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import (
|
|
12
|
+
EmbeddingEngine,
|
|
13
|
+
)
|
|
14
|
+
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
|
|
15
|
+
from cognee.infrastructure.databases.vector.vector_db_interface import VectorDBInterface
|
|
16
|
+
|
|
17
|
+
logger = get_logger("WeaviateAdapter")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def is_retryable_request(error):
    """
    Decide whether a failed Weaviate request is worth retrying.

    Serves as the predicate handed to tenacity's ``retry_if_exception`` by the
    retry-decorated adapter methods in this module.

    Parameters:
    -----------

        - error: The exception raised by the failed call.

    Returns:
    --------

        - bool: True for an UnexpectedStatusCodeException whose status code is 409, 500 or
          503, and for any ``requests`` RequestException; False for everything else.
    """
    from weaviate.exceptions import UnexpectedStatusCodeException
    from requests.exceptions import RequestException

    retryable_status_codes = (409, 500, 503)

    if isinstance(error, UnexpectedStatusCodeException):
        # Conflict, internal error and service unavailable are the only retried statuses.
        return error.status_code in retryable_status_codes

    # Every requests-level failure (timeout, connection error, ...) is retried.
    return isinstance(error, RequestException)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class IndexSchema(DataPoint):
    """
    Data point that carries the text stored in a vector index collection.

    Attributes:
    -----------

        - text: The textual content of the data point.
        - metadata: Indexing information; its "index_fields" entry names "text" as the
          field to index.
    """

    text: str

    metadata: dict = {"index_fields": ["text"]}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class WeaviateAdapter(VectorDBInterface):
    """
    VectorDBInterface implementation backed by a Weaviate async client.

    Public methods:
    - get_client
    - embed_data
    - has_collection
    - create_collection
    - get_collection
    - create_data_points
    - create_vector_index
    - index_data_points
    - retrieve
    - search
    - batch_search
    - delete_data_points
    - prune
    """

    # Provider name of this adapter.
    name = "Weaviate"
    # Cluster URL and API key used to build Weaviate clients.
    url: str
    api_key: str
    # Engine used to turn text into vectors.
    embedding_engine: EmbeddingEngine = None
|
|
74
|
+
|
|
75
|
+
def __init__(self, url: str, api_key: str, embedding_engine: EmbeddingEngine):
    """
    Store the connection settings and build the async Weaviate Cloud client.

    The client is only constructed here; ``get_client`` is what calls ``connect()`` on it.

    Parameters:
    -----------

        - url (str): Cluster URL passed to the client as ``cluster_url``.
        - api_key (str): API key wrapped in ``AuthApiKey`` for authentication.
        - embedding_engine (EmbeddingEngine): Engine used by ``embed_data``.
    """
    import weaviate
    import weaviate.classes as wvc

    self.url, self.api_key = url, api_key
    self.embedding_engine = embedding_engine
    # Lock used by create_collection to serialize its check-then-create sequence.
    self.VECTOR_DB_LOCK = asyncio.Lock()

    credentials = weaviate.auth.AuthApiKey(api_key)
    client_config = wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=30))

    self.client = weaviate.use_async_with_weaviate_cloud(
        cluster_url=url,
        auth_credentials=credentials,
        additional_config=client_config,
    )
|
|
92
|
+
|
|
93
|
+
async def get_client(self):
    """
    Connect the adapter's Weaviate client and hand it back.

    Returns:
    --------

        The adapter's Weaviate client instance, after ``connect()`` has been awaited on it.
    """
    client = self.client
    await client.connect()
    return client
|
|
107
|
+
|
|
108
|
+
async def embed_data(self, data: List[str]) -> List[List[float]]:
    """
    Embed the given texts with the configured embedding engine.

    Parameters:
    -----------

        - data (List[str]): The strings to embed.

    Returns:
    --------

        - List[List[float]]: Whatever ``embedding_engine.embed_text`` returns for ``data``;
          callers in this adapter index it as one vector per input string, in input order.
    """
    return await self.embedding_engine.embed_text(data)
|
|
126
|
+
|
|
127
|
+
async def has_collection(self, collection_name: str) -> bool:
    """
    Ask Weaviate whether a collection with the given name exists.

    This method does not call ``connect()`` itself; the methods in this adapter that use
    it call ``get_client`` first.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to look for.

    Returns:
    --------

        - bool: The result of the client's ``collections.exists`` call.
    """
    collections = self.client.collections
    exists = await collections.exists(collection_name)
    return exists
|
|
144
|
+
|
|
145
|
+
@retry(
    retry=retry_if_exception(is_retryable_request),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=6),
)
async def create_collection(
    self,
    collection_name: str,
    payload_schema=None,
):
    """
    Create a collection in Weaviate unless one with that name already exists.

    A new collection gets a single TEXT property named "text" with vectorization
    skipped. The call is retried (up to three attempts, exponential backoff) when
    ``is_retryable_request`` accepts the raised error.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to create.
        - payload_schema: Accepted for interface compatibility; not used. (default None)

    Returns:
    --------

        The result of ``collections.create`` when a new collection was made, otherwise
        the existing collection as returned by ``get_collection``.
    """
    import weaviate.classes.config as wvcc

    # Connect inside the lock: another create_collection call closes the shared client
    # when it finishes, so connecting before taking the lock could leave this call
    # holding a client that was closed while it waited.
    async with self.VECTOR_DB_LOCK:
        client = await self.get_client()
        try:
            if await self.has_collection(collection_name):
                return await self.get_collection(collection_name)

            return await client.collections.create(
                name=collection_name,
                properties=[
                    wvcc.Property(
                        name="text",
                        data_type=wvcc.DataType.TEXT,
                        skip_vectorization=True,
                    )
                ],
            )
        finally:
            # Close on every path (create, already-exists, error), matching how
            # create_data_points releases the client.
            await client.close()
|
|
192
|
+
|
|
193
|
+
async def get_collection(self, collection_name: str):
    """
    Look up a collection by name.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to fetch.

    Returns:
    --------

        The collection object returned by the client's ``collections.get``.

    Raises:
    -------

        - CollectionNotFoundError: If ``has_collection`` reports that the collection does
          not exist.
    """
    exists = await self.has_collection(collection_name)

    if exists:
        return self.client.collections.get(collection_name)

    raise CollectionNotFoundError(f"Collection '{collection_name}' not found.")
|
|
213
|
+
|
|
214
|
+
@retry(
    retry=retry_if_exception(is_retryable_request),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=1, max=6),
)
async def create_data_points(
    self, collection_name: str, data_points: List[DataPoint]
):
    """
    Embed the given data points and write them to a collection.

    Several data points are written with ``insert_many``. A single data point is updated
    if an object with its id already exists and inserted otherwise. The call is retried
    (up to three attempts, exponential backoff) when ``is_retryable_request`` accepts the
    raised error.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to write to.
        - data_points (List[DataPoint]): The data points to create or update.

    Returns:
    --------

        The result of the Weaviate insert/update call, or None when ``data_points`` is
        empty (nothing is embedded or sent in that case).

    Raises:
    -------

        - CollectionNotFoundError: If the collection does not exist.
        - ValueError: If the embedding engine returns a different number of vectors than
          there are data points.
    """
    from weaviate.classes.data import DataObject

    if not data_points:
        # Nothing to write; avoids an IndexError on data_points[0] further down.
        return None

    data_vectors = await self.embed_data(
        [DataPoint.get_embeddable_data(data_point) for data_point in data_points]
    )

    def convert_to_weaviate_data_point(data_point: DataPoint, vector):
        """
        Build the Weaviate DataObject for one data point and its vector.

        The "id" field is moved to a "uuid" property (as a string) and the data point's
        id is also used as the object's uuid.
        """
        properties = data_point.model_dump()

        if "id" in properties:
            properties["uuid"] = str(data_point.id)
            del properties["id"]

        return DataObject(uuid=data_point.id, properties=properties, vector=vector)

    # Pair each data point with its vector positionally instead of searching the list
    # with .index(), which is quadratic and returns the first match for equal points.
    data_objects = [
        convert_to_weaviate_data_point(data_point, vector)
        for data_point, vector in zip(data_points, data_vectors, strict=True)
    ]

    try:
        # Connecting and resolving the collection happen inside the try so the client
        # is closed even when the collection lookup fails.
        await self.get_client()
        collection = await self.get_collection(collection_name)

        if len(data_objects) > 1:
            return await collection.data.insert_many(data_objects)

        data_object: DataObject = data_objects[0]

        if await collection.data.exists(data_object.uuid):
            return await collection.data.update(
                uuid=data_object.uuid,
                vector=data_object.vector,
                properties=data_object.properties,
                references=data_object.references,
            )

        return await collection.data.insert(
            uuid=data_object.uuid,
            vector=data_object.vector,
            properties=data_object.properties,
            references=data_object.references,
        )
    except Exception as error:
        logger.error("Error creating data points: %s", str(error))
        raise
    finally:
        await self.client.close()
|
|
303
|
+
|
|
304
|
+
async def create_vector_index(self, index_name: str, index_property_name: str):
    """
    Create the collection that backs a vector index.

    The collection is named ``"{index_name}_{index_property_name}"``.

    Parameters:
    -----------

        - index_name (str): The name of the vector index.
        - index_property_name (str): The property the index is built on.

    Returns:
    --------

        Whatever ``create_collection`` returns for the derived collection name.
    """
    collection_name = f"{index_name}_{index_property_name}"
    return await self.create_collection(collection_name)
|
|
321
|
+
|
|
322
|
+
async def index_data_points(
    self, index_name: str, index_property_name: str, data_points: list[DataPoint]
):
    """
    Write data points into the collection backing a vector index.

    Each data point is converted to an IndexSchema that keeps its id and stores its
    embeddable data as ``text``; the result goes to ``create_data_points`` for the
    collection named ``"{index_name}_{index_property_name}"``.

    Parameters:
    -----------

        - index_name (str): The name of the vector index.
        - index_property_name (str): The property the index is built on.
        - data_points (list[DataPoint]): The data points to index.

    Returns:
    --------

        Whatever ``create_data_points`` returns.
    """
    collection_name = f"{index_name}_{index_property_name}"

    index_entries = []
    for data_point in data_points:
        index_entries.append(
            IndexSchema(
                id=data_point.id,
                text=DataPoint.get_embeddable_data(data_point),
            )
        )

    return await self.create_data_points(collection_name, index_entries)
|
|
352
|
+
|
|
353
|
+
async def retrieve(self, collection_name: str, data_point_ids: list[str]):
    """
    Fetch data points from a collection by their ids.

    Each returned object is adjusted in place: its properties are exposed as ``payload``,
    its uuid as ``id``, and the ``properties`` attribute is removed.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to read from.
        - data_point_ids (list[str]): The ids of the data points to fetch.

    Returns:
    --------

        A list of the fetched objects; an empty list when ``data_point_ids`` is empty.

    Raises:
    -------

        - CollectionNotFoundError: If the collection does not exist.
    """
    if not data_point_ids:
        # Nothing to look up: skip the import, the connection and the round trip.
        return []

    from weaviate.classes.query import Filter

    try:
        await self.get_client()
        collection = await self.get_collection(collection_name)
        data_points = await collection.query.fetch_objects(
            filters=Filter.by_id().contains_any(data_point_ids),
            # Without an explicit limit the server's default page size applies and a
            # long id list would be silently truncated.
            limit=len(data_point_ids),
        )

        for data_point in data_points.objects:
            data_point.payload = data_point.properties
            data_point.id = data_point.uuid
            del data_point.properties

        return data_points.objects
    finally:
        # Release the client on failures too, not only on success.
        await self.client.close()
|
|
386
|
+
|
|
387
|
+
async def search(
    self,
    collection_name: str,
    query_text: Optional[str] = None,
    query_vector: Optional[List[float]] = None,
    limit: int = 15,
    with_vector: bool = False,
):
    """
    Search a collection by vector, embedding ``query_text`` first when no vector is given.

    A dedicated client is opened for the duration of the call and used for a hybrid
    query that is given only the vector (``query=None``).

    Parameters:
    -----------

        - collection_name (str): The name of the collection to search.
        - query_text (Optional[str]): Text to embed when ``query_vector`` is missing.
          (default None)
        - query_vector (Optional[List[float]]): Vector to search with. (default None)
        - limit (int): Maximum number of results; a value that is not positive is sent
          to Weaviate as no limit. (default 15)
        - with_vector (bool): Passed to the query as ``include_vector``. (default False)

    Returns:
    --------

        A list of ScoredResult objects whose score is ``1 - <Weaviate score>``, or an
        empty list when Weaviate raises WeaviateInvalidInputError.

    Raises:
    -------

        - MissingQueryParameterError: If neither ``query_text`` nor ``query_vector`` is
          given.
        - CollectionNotFoundError: If the collection does not exist.
    """
    import weaviate.classes as wvc
    import weaviate.exceptions

    if query_vector is None:
        if query_text is None:
            raise MissingQueryParameterError()

        embedded_queries = await self.embed_data([query_text])
        query_vector = embedded_queries[0]

    # TODO: A new client is created for every search call; this needs discussion.
    async with weaviate.use_async_with_weaviate_cloud(
        cluster_url=self.url,
        auth_credentials=weaviate.auth.AuthApiKey(self.api_key),
        additional_config=wvc.init.AdditionalConfig(
            timeout=wvc.init.Timeout(init=30)
        ),
    ) as client:
        collection_exists = await client.collections.exists(collection_name)
        if not collection_exists:
            raise CollectionNotFoundError(
                f"Collection '{collection_name}' not found."
            )

        collection = client.collections.get(collection_name)

        try:
            search_result = await collection.query.hybrid(
                query=None,
                vector=query_vector,
                limit=limit if limit > 0 else None,
                include_vector=with_vector,
                return_metadata=wvc.query.MetadataQuery(score=True),
            )

            scored_results = []
            for result in search_result.objects:
                scored_results.append(
                    ScoredResult(
                        id=parse_id(str(result.uuid)),
                        payload=result.properties,
                        score=1 - float(result.metadata.score),
                    )
                )
            return scored_results
        except weaviate.exceptions.WeaviateInvalidInputError:
            # Invalid-input errors are reported as "no results" instead of propagating.
            return []
|
|
461
|
+
|
|
462
|
+
async def batch_search(
    self,
    collection_name: str,
    query_texts: List[str],
    limit: int,
    with_vectors: bool = False,
):
    """
    Run one search per query text against the given collection.

    All query texts are embedded in a single ``embed_data`` call; the searches are then
    awaited one after another, in input order.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to search.
        - query_texts (List[str]): The text queries to run.
        - limit (int): Maximum number of results per query.
        - with_vectors (bool): Forwarded to ``search`` as ``with_vector``. (default False)

    Returns:
    --------

        A list with one ``search`` result per query text, in the same order.
    """
    query_vectors = await self.embed_data(query_texts)

    results = []
    for query_vector in query_vectors:
        hits = await self.search(
            collection_name,
            query_vector=query_vector,
            limit=limit,
            with_vector=with_vectors,
        )
        results.append(hits)

    return results
|
|
517
|
+
|
|
518
|
+
async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
    """
    Delete the data points with the given ids from a collection.

    Parameters:
    -----------

        - collection_name (str): The name of the collection to delete from.
        - data_point_ids (list[str]): The ids of the data points to delete.

    Returns:
    --------

        The result object returned by the client's ``delete_many`` call.

    Raises:
    -------

        - CollectionNotFoundError: If the collection does not exist.
    """
    from weaviate.classes.query import Filter

    try:
        await self.get_client()
        collection = await self.get_collection(collection_name)
        # delete_many takes its filter as ``where`` (``filters`` is the keyword used by
        # the query API, not the data API).
        return await collection.data.delete_many(
            where=Filter.by_id().contains_any(data_point_ids)
        )
    finally:
        # Release the client on failures too, not only on success.
        await self.client.close()
|
|
547
|
+
|
|
548
|
+
async def prune(self):
    """
    Delete all collections from the Weaviate database.

    This operation will remove all data and cannot be undone. The client is closed
    afterwards even when the deletion fails.
    """
    client = await self.get_client()
    try:
        await client.collections.delete_all()
    finally:
        await client.close()
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: cognee-community-vector-adapter-weaviate
|
|
3
|
+
Version: 0.0.2
|
|
4
|
+
Summary: Weaviate vector database adapter for cognee
|
|
5
|
+
Requires-Python: >=3.11,<=3.13
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
10
|
+
Requires-Dist: cognee (>=0.2.4)
|
|
11
|
+
Requires-Dist: weaviate-client (>=4.9.6,<5.0.0)
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
|
|
14
|
+
# Cognee Community Weaviate Vector Adapter
|
|
15
|
+
|
|
16
|
+
This is a community-maintained adapter that enables Cognee to work with Weaviate as a vector database.
|
|
17
|
+
|
|
18
|
+
## Installation
|
|
19
|
+
|
|
20
|
+
Once the package is published, it can be installed with pip:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install cognee-community-vector-adapter-weaviate
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
In case it is not published yet, you can use poetry to locally build the adapter package:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install poetry
|
|
30
|
+
poetry install # run this command in the directory containing the pyproject.toml file
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Connection Setup
|
|
34
|
+
The provided code creates an async client connected to a remote instance of Weaviate. If you want to connect to a local
|
|
35
|
+
instance, like running a docker container locally and connecting to it, you need to change a few lines of code.
|
|
36
|
+
In the `weaviate_adapter.py` file inside the `.../weaviate/cognee_community_vector_adapter_weaviate` directory, replace
|
|
37
|
+
the following lines in the constructor:
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
self.client = weaviate.use_async_with_weaviate_cloud(
|
|
41
|
+
cluster_url=url,
|
|
42
|
+
auth_credentials=weaviate.auth.AuthApiKey(api_key),
|
|
43
|
+
additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=30)),
|
|
44
|
+
)
|
|
45
|
+
```
|
|
46
|
+
with the following:
|
|
47
|
+
|
|
48
|
+
```
|
|
49
|
+
self.client = weaviate.use_async_with_local(
|
|
50
|
+
host="localhost",
|
|
51
|
+
port=8080,
|
|
52
|
+
grpc_port=50051
|
|
53
|
+
)
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
You can use the docker command provided by Weaviate (https://docs.weaviate.io/deploy/installation-guides/docker-installation)
|
|
57
|
+
to run Weaviate with default settings. The command looks something like this, specifying the ports for connection:
|
|
58
|
+
`docker run -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.32.4`
|
|
59
|
+
|
|
60
|
+
## Usage
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
import asyncio
|
|
64
|
+
import os
|
|
65
|
+
from cognee import config, prune, add, cognify, search, SearchType
|
|
66
|
+
|
|
67
|
+
# Import the register module to enable Weaviate support
|
|
68
|
+
import cognee_community_vector_adapter_weaviate.register
|
|
69
|
+
|
|
70
|
+
async def main():
|
|
71
|
+
# Configure databases
|
|
72
|
+
config.set_relational_db_config({
|
|
73
|
+
"db_provider": "sqlite",
|
|
74
|
+
})
|
|
75
|
+
config.set_vector_db_config({
|
|
76
|
+
"vector_db_provider": "weaviate",
|
|
77
|
+
"vector_db_url": os.getenv("VECTOR_DB_URL"), # or your Weaviate URL
|
|
78
|
+
"vector_db_key": os.getenv("VECTOR_DB_KEY"), # or your API key
|
|
79
|
+
})
|
|
80
|
+
config.set_graph_db_config({
|
|
81
|
+
"graph_database_provider": "networkx",
|
|
82
|
+
})
|
|
83
|
+
|
|
84
|
+
# Optional: Clean previous data
|
|
85
|
+
await prune.prune_data()
|
|
86
|
+
await prune.prune_system()
|
|
87
|
+
|
|
88
|
+
# Add and process your content
|
|
89
|
+
text = "Your text content here"
|
|
90
|
+
await add(text)
|
|
91
|
+
await cognify()
|
|
92
|
+
|
|
93
|
+
# Search
|
|
94
|
+
search_results = await search(
|
|
95
|
+
query_type=SearchType.GRAPH_COMPLETION,
|
|
96
|
+
query_text="Your search query"
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
for result in search_results:
|
|
100
|
+
print(result)
|
|
101
|
+
|
|
102
|
+
if __name__ == "__main__":
|
|
103
|
+
asyncio.run(main())
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Configuration
|
|
107
|
+
|
|
108
|
+
The Weaviate adapter requires the following configuration parameters:
|
|
109
|
+
|
|
110
|
+
- `vector_db_url`: Your Weaviate cluster endpoint URL
|
|
111
|
+
- `vector_db_key`: Your Weaviate API key
|
|
112
|
+
- `vector_db_provider`: Set to "weaviate"
|
|
113
|
+
|
|
114
|
+
### Environment Variables
|
|
115
|
+
|
|
116
|
+
Set the following environment variables or pass them directly in the config:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
export VECTOR_DB_URL="https://your-weaviate-instance.weaviate.network"
|
|
120
|
+
export VECTOR_DB_KEY="your-api-key"
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
**Alternative:** You can also use the [`.env.template`](https://github.com/topoteretes/cognee/blob/main/.env.template) file from the main cognee repository. Copy it to your project directory, rename it to `.env`, and fill in your Weaviate configuration values.
|
|
124
|
+
|
|
125
|
+
## Requirements
|
|
126
|
+
|
|
127
|
+
- Python >= 3.11, <= 3.13
|
|
128
|
+
- weaviate-client >= 4.9.6, < 5.0.0
|
|
129
|
+
- cognee >= 0.2.4
|
|
130
|
+
|
|
131
|
+
## Features
|
|
132
|
+
|
|
133
|
+
- Full vector search capabilities
|
|
134
|
+
- Batch operations support
|
|
135
|
+
- Async/await support
|
|
136
|
+
- Retry logic for better reliability
|
|
137
|
+
- Collection management
|
|
138
|
+
- Data point indexing and retrieval
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
cognee_community_vector_adapter_weaviate/__init__.py,sha256=EWEnTWaRf8kw8rNnbmppLYCiNl00Ltv9hsb1NmlIgMw,77
|
|
2
|
+
cognee_community_vector_adapter_weaviate/register.py,sha256=aGTpOdpiFXYlGkDshE5XjLC_iJRxD66nSz66SnuaxpY,166
|
|
3
|
+
cognee_community_vector_adapter_weaviate/weaviate_adapter.py,sha256=rDZBXQFUyU6sgVcbmInwK3JPs3oaa6Y4Ek4BzaLE9vI,18594
|
|
4
|
+
cognee_community_vector_adapter_weaviate-0.0.2.dist-info/METADATA,sha256=6AvHIbq5KJTWtvmfvBvn38W4L8V10wj7zrDZJJD-hBg,4252
|
|
5
|
+
cognee_community_vector_adapter_weaviate-0.0.2.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
|
6
|
+
cognee_community_vector_adapter_weaviate-0.0.2.dist-info/RECORD,,
|