cognee-community-vector-adapter-pinecone 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognee_community_vector_adapter_pinecone/__init__.py +3 -0
- cognee_community_vector_adapter_pinecone/pinecone_adapter.py +360 -0
- cognee_community_vector_adapter_pinecone/register.py +5 -0
- cognee_community_vector_adapter_pinecone-0.1.0.dist-info/METADATA +47 -0
- cognee_community_vector_adapter_pinecone-0.1.0.dist-info/RECORD +6 -0
- cognee_community_vector_adapter_pinecone-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from cognee.infrastructure.databases.exceptions import MissingQueryParameterError
|
|
4
|
+
from cognee.infrastructure.databases.vector import VectorDBInterface
|
|
5
|
+
from cognee.infrastructure.databases.vector.embeddings.EmbeddingEngine import (
|
|
6
|
+
EmbeddingEngine,
|
|
7
|
+
)
|
|
8
|
+
from cognee.infrastructure.databases.vector.exceptions import CollectionNotFoundError
|
|
9
|
+
from cognee.infrastructure.databases.vector.models.ScoredResult import ScoredResult
|
|
10
|
+
from cognee.infrastructure.engine import DataPoint
|
|
11
|
+
from cognee.infrastructure.engine.utils import parse_id
|
|
12
|
+
from cognee.shared.logging_utils import get_logger
|
|
13
|
+
from pinecone import Pinecone, ServerlessSpec
|
|
14
|
+
|
|
15
|
+
logger = get_logger("PineconeAdapter")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def sanitize_pinecone_name(name: str) -> str:
    """Sanitize a name to comply with Pinecone's naming requirements.

    Pinecone index names must:
    - contain only lowercase alphanumeric characters and hyphens,
    - start with a letter,
    - not end with a hyphen.

    Args:
        name: Arbitrary collection/index name.

    Returns:
        A Pinecone-compatible name. An empty or fully-invalid input yields
        the ``"index"`` prefix rather than raising.
    """
    import re

    # Lowercase, then replace every disallowed character with a hyphen.
    name = re.sub(r"[^a-z0-9\-]", "-", name.lower())

    # Ensure the name starts with a letter. The empty-string check is
    # required: the original code indexed name[0] and raised IndexError
    # when given "".
    if not name or not name[0].isalpha():
        name = "index-" + name

    # Collapse runs of hyphens introduced by the substitution above.
    name = re.sub(r"-+", "-", name)

    # Pinecone forbids a trailing hyphen.
    return name.rstrip("-")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class IndexSchema(DataPoint):
    """Minimal payload stored in a vector index: one embeddable text field.

    ``metadata["index_fields"]`` tells cognee which attribute of this
    DataPoint should be embedded (here, ``text``).
    """

    text: str

    # NOTE(review): class-level mutable default — presumably DataPoint is a
    # pydantic model that deep-copies field defaults per instance; confirm.
    metadata: dict = {"index_fields": ["text"]}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class PineconeAdapter(VectorDBInterface):
    """Cognee vector-database adapter backed by Pinecone serverless indexes.

    Each cognee "collection" maps to one Pinecone index. The underlying
    Pinecone client is synchronous; the methods are ``async`` only to
    satisfy the ``VectorDBInterface`` contract.
    """

    name = "Pinecone"
    # Connection settings; populated in __init__ (annotations fixed from
    # the original's mistyped ``str = None``).
    api_key: str | None = None
    environment: str | None = None
    cloud: str | None = None
    region: str | None = None

    def __init__(
        self,
        url,
        api_key,
        embedding_engine: EmbeddingEngine,
        database_name: str = "cognee",
        environment: str | None = None,
        cloud: str | None = None,
        region: str | None = None,
    ):
        """Create the adapter and the Pinecone client.

        Args:
            url: Unused by Pinecone, but required by the cognee interface.
            api_key: Pinecone API key.
            embedding_engine: Engine used to embed texts into vectors.
            database_name: Logical database name (kept for interface parity).
            environment: Optional Pinecone environment identifier.
            cloud: Serverless cloud provider; defaults to ``"aws"``.
            region: Serverless region; defaults to ``"us-east-1"``.
        """
        self.url = url  # Not used by Pinecone, but required by Cognee interface
        self.api_key = api_key
        self.environment = environment
        self.database_name = database_name
        self.cloud = cloud if cloud is not None else "aws"
        self.region = region if region is not None else "us-east-1"
        self.embedding_engine = embedding_engine
        # Serializes index creation (see create_collection).
        self.VECTOR_DB_LOCK = asyncio.Lock()

        # Initialize Pinecone client
        self.pc = Pinecone(api_key=api_key)

    def get_pinecone_index(self, collection_name: str):
        """Get a Pinecone index handle for the given collection name."""
        return self.pc.Index(collection_name)

    async def embed_data(self, data: list[str]) -> list[list[float]]:
        """Embed a batch of texts; returns one vector per input text.

        (Return annotation fixed: the engine returns a list of vectors,
        not a single vector.)
        """
        return await self.embedding_engine.embed_text(data)

    async def has_collection(self, collection_name: str) -> bool:
        """Return True if a Pinecone index with this name exists.

        Best-effort: any client/API error is logged and reported as False.
        """
        try:
            index_list = self.pc.list_indexes()
            return collection_name in [index.name for index in index_list]
        except Exception as e:
            logger.error("Error checking if collection exists: %s", str(e))
            return False

    async def create_collection(
        self,
        collection_name: str,
        payload_schema=None,
    ):
        """Create a serverless Pinecone index if it does not already exist.

        ``payload_schema`` is accepted for interface compatibility but not
        used by Pinecone. The lock prevents concurrent duplicate creation.
        """
        async with self.VECTOR_DB_LOCK:
            if not await self.has_collection(collection_name):
                try:
                    self.pc.create_index(
                        name=collection_name,
                        dimension=self.embedding_engine.get_vector_size(),
                        metric="cosine",
                        spec=ServerlessSpec(cloud=self.cloud, region=self.region),
                    )
                    logger.info("Created Pinecone index: %s", collection_name)
                except Exception as e:
                    logger.error("Error creating collection: %s", str(e))
                    raise e

    async def create_data_points(self, collection_name: str, data_points: list[DataPoint]):
        """Embed and upsert data points into an existing collection.

        Raises:
            CollectionNotFoundError: If the target index does not exist.
        """
        try:
            if not await self.has_collection(collection_name):
                raise CollectionNotFoundError(message=f"Collection {collection_name} not found!")

            index = self.get_pinecone_index(collection_name)

            data_vectors = await self.embed_data(
                [DataPoint.get_embeddable_data(data_point) for data_point in data_points]
            )

            def convert_to_pinecone_vector(data_point: DataPoint, vector: list[float]):
                # Clean metadata to only include Pinecone-compatible types
                clean_metadata = {}
                data_dump = data_point.model_dump()

                for key, value in data_dump.items():
                    if key != "metadata":  # Skip the nested metadata field that causes issues
                        if isinstance(value, (str, int, float, bool)):
                            clean_metadata[key] = value
                        elif isinstance(value, list) and all(
                            isinstance(item, str) for item in value
                        ):
                            clean_metadata[key] = value
                        else:
                            # Convert complex types to strings
                            clean_metadata[key] = str(value)

                return {
                    "id": str(data_point.id),
                    "values": vector,
                    "metadata": clean_metadata,
                }

            # Pair each point with its embedding positionally. The original
            # used data_points.index(data_point), which is O(n^2) and picks
            # the FIRST equal point's vector when duplicates compare equal.
            vectors = [
                convert_to_pinecone_vector(point, vector)
                for point, vector in zip(data_points, data_vectors)
            ]

            index.upsert(vectors=vectors)
            logger.info("Uploaded %d data points to Pinecone", len(vectors))

        except Exception as error:
            logger.error("Error uploading data points to Pinecone: %s", str(error))
            raise error

    async def create_vector_index(self, index_name: str, index_property_name: str):
        """Create a collection named after (index_name, property), sanitized."""
        sanitized_name = sanitize_pinecone_name(f"{index_name}_{index_property_name}")
        await self.create_collection(sanitized_name)

    async def index_data_points(
        self, index_name: str, index_property_name: str, data_points: list[DataPoint]
    ):
        """Embed the indexed field of each data point and upsert it.

        The field to embed is taken from each point's
        ``metadata["index_fields"][0]``.
        """
        sanitized_name = sanitize_pinecone_name(f"{index_name}_{index_property_name}")
        await self.create_data_points(
            sanitized_name,
            [
                IndexSchema(
                    id=data_point.id,
                    text=getattr(data_point, data_point.metadata["index_fields"][0]),
                )
                for data_point in data_points
            ],
        )

    async def retrieve(self, collection_name: str, data_point_ids: list[str]):
        """Fetch raw Pinecone records by id.

        Returns the Pinecone fetch response as-is.

        Raises:
            CollectionNotFoundError: If the collection does not exist.
        """
        try:
            if not await self.has_collection(collection_name):
                raise CollectionNotFoundError(message=f"Collection {collection_name} not found!")

            index = self.get_pinecone_index(collection_name)
            results = index.fetch(ids=data_point_ids)
            return results
        except Exception as e:
            logger.error("Error retrieving data points: %s", str(e))
            raise e

    async def search(
        self,
        collection_name: str,
        query_text: str | None = None,
        query_vector: list[float] | None = None,
        limit: int = 15,
        with_vector: bool = False,
        include_payload: bool = False,
    ) -> list[ScoredResult]:
        """Search for similar vectors in the collection.

        Args:
            collection_name: Name of the collection to search.
            query_text: Text query to search for (will be embedded).
            query_vector: Pre-computed query vector.
            limit: Maximum number of results to return; 0 means "all".
            with_vector: Whether to include vectors in results.
            include_payload: Whether to include payload in results.

        Returns:
            List of ScoredResult objects sorted by similarity.

        Raises:
            MissingQueryParameterError: If neither query_text nor query_vector is provided.
        """
        if query_text is None and query_vector is None:
            raise MissingQueryParameterError()

        if not await self.has_collection(collection_name):
            logger.warning(
                f"Collection '{collection_name}' not found in PineconeAdapter.search; returning []."
            )
            return []

        if query_vector is None:
            query_vector = (await self.embed_data([query_text]))[0]

        try:
            index = self.get_pinecone_index(collection_name)

            if limit == 0:
                # limit == 0 means "return everything": size top_k from the
                # live index stats instead of a hardcoded cap.
                stats = index.describe_index_stats()
                limit = stats.total_vector_count

                if limit == 0:
                    return []

            results = index.query(
                vector=query_vector,
                top_k=limit,
                include_metadata=include_payload,
                include_values=with_vector,
            )

            return [
                ScoredResult(
                    id=parse_id(match.id),
                    # match.metadata is only populated when include_metadata
                    # was requested, hence the conditional payload.
                    payload={
                        **match.metadata,
                        "id": parse_id(match.id),
                    }
                    if include_payload
                    else {},
                    # Pinecone returns similarity score (0-1, higher = more similar)
                    score=match.score,
                )
                for match in results.matches
            ]

        except Exception as e:
            logger.error("Error searching collection: %s", str(e))
            raise e

    async def batch_search(
        self,
        collection_name: str,
        query_texts: list[str],
        limit: int | None = None,
        with_vectors: bool = False,
        include_payload: bool = False,
    ) -> list[list[ScoredResult]]:
        """Perform batch search for multiple queries.

        Args:
            collection_name: Name of the collection to search.
            query_texts: List of query texts to search for.
            limit: Maximum number of results per query (defaults to 15).
            with_vectors: Whether to include vectors in results.
            include_payload: Whether to include payload in results.

        Returns:
            List of search results, one list per query. A query that fails
            individually yields an empty list rather than aborting the batch.

        Raises:
            Exception: If embedding or client setup fails.
        """
        if limit is None:
            limit = 15

        if not await self.has_collection(collection_name):
            # (Message fix: the original two-part f-string was missing the
            # space between "batch_search;" and "returning".)
            logger.warning(
                f"Collection '{collection_name}' not found in PineconeAdapter.batch_search; "
                f"returning empty results."
            )
            return [[] for _ in query_texts]

        try:
            vectors = await self.embed_data(query_texts)
            index = self.get_pinecone_index(collection_name)

            results = []
            for i, vector in enumerate(vectors):
                try:
                    result = index.query(
                        vector=vector,
                        top_k=limit,
                        include_metadata=include_payload,
                        include_values=with_vectors,
                    )

                    # Convert to ScoredResult objects (no filtering to match other adapters)
                    scored_results = [
                        ScoredResult(
                            id=parse_id(match.id),
                            payload={
                                **match.metadata,
                                "id": parse_id(match.id),
                            }
                            if include_payload
                            else {},
                            score=match.score,
                        )
                        for match in result.matches
                    ]
                    results.append(scored_results)

                except Exception as e:
                    logger.error(f"Error in batch search for query {i}: {str(e)}")
                    results.append([])

            return results

        except Exception as e:
            logger.error(f"Error during batch search: {str(e)}")
            raise e

    async def delete_data_points(self, collection_name: str, data_point_ids: list[str]):
        """Delete data points by id from a collection.

        Raises:
            CollectionNotFoundError: If the collection does not exist.
        """
        try:
            if not await self.has_collection(collection_name):
                raise CollectionNotFoundError(message=f"Collection {collection_name} not found!")

            index = self.get_pinecone_index(collection_name)
            results = index.delete(ids=data_point_ids)
            logger.info("Deleted %d data points from %s", len(data_point_ids), collection_name)
            return results
        except Exception as e:
            logger.error("Error deleting data points: %s", str(e))
            raise e

    async def prune(self):
        """Delete all indexes in Pinecone"""
        try:
            index_list = self.pc.list_indexes()

            for index_info in index_list:
                self.pc.delete_index(index_info.name)
                logger.info("Deleted Pinecone index: %s", index_info.name)

        except Exception as e:
            logger.error("Error pruning Pinecone indexes: %s", str(e))
            raise e
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cognee-community-vector-adapter-pinecone
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pinecone vector database adapter for cognee
|
|
5
|
+
Requires-Python: <=3.13,>=3.11
|
|
6
|
+
Requires-Dist: cognee==0.5.2
|
|
7
|
+
Requires-Dist: instructor>=1.11
|
|
8
|
+
Requires-Dist: pinecone>=3.0.0
|
|
9
|
+
Requires-Dist: starlette>=0.48.0
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# Cognee Pinecone Adapter
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
Once published, the package can be installed directly via pip:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install cognee-community-vector-adapter-pinecone
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
If it has not been published yet, you can use Poetry to build and install the adapter package locally:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install poetry
|
|
26
|
+
poetry install # run this command in the directory containing the pyproject.toml file
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Connection Setup
|
|
30
|
+
|
|
31
|
+
To use the Pinecone adapter, you need to:
|
|
32
|
+
|
|
33
|
+
1. Sign up for a Pinecone account at https://www.pinecone.io/
|
|
34
|
+
2. Create a new project and get your API key
|
|
35
|
+
3. Note your environment details (cloud provider and region)
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Import and register the adapter in your code:
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from cognee_community_vector_adapter_pinecone import register
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Example
|
|
46
|
+
|
|
47
|
+
See example in `example.py` file.
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
cognee_community_vector_adapter_pinecone/__init__.py,sha256=To8bCX7WdMDbAtjdF0bf4FVMR2NUxYvm6JAqAVFLUag,77
|
|
2
|
+
cognee_community_vector_adapter_pinecone/pinecone_adapter.py,sha256=pzstAY_9uOGpWt6xL61h9yz4wT81dAPqUZDJ69zSTMQ,13073
|
|
3
|
+
cognee_community_vector_adapter_pinecone/register.py,sha256=TggMmL38QcbWvqWMw7ZtPupwmU2DKL0S3pwxLs1d13k,166
|
|
4
|
+
cognee_community_vector_adapter_pinecone-0.1.0.dist-info/METADATA,sha256=gUMNawhgH3ngwQOyEwqlvTiAHtPrw_VvhyDdMZSZFow,1126
|
|
5
|
+
cognee_community_vector_adapter_pinecone-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
6
|
+
cognee_community_vector_adapter_pinecone-0.1.0.dist-info/RECORD,,
|