vector-inspector 0.2.6__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vector_inspector/config/__init__.py +4 -0
- vector_inspector/config/known_embedding_models.json +432 -0
- vector_inspector/core/cache_manager.py +159 -0
- vector_inspector/core/connection_manager.py +277 -0
- vector_inspector/core/connections/__init__.py +2 -1
- vector_inspector/core/connections/base_connection.py +42 -1
- vector_inspector/core/connections/chroma_connection.py +137 -16
- vector_inspector/core/connections/pinecone_connection.py +768 -0
- vector_inspector/core/connections/qdrant_connection.py +62 -8
- vector_inspector/core/embedding_providers/__init__.py +14 -0
- vector_inspector/core/embedding_providers/base_provider.py +128 -0
- vector_inspector/core/embedding_providers/clip_provider.py +260 -0
- vector_inspector/core/embedding_providers/provider_factory.py +176 -0
- vector_inspector/core/embedding_providers/sentence_transformer_provider.py +203 -0
- vector_inspector/core/embedding_utils.py +167 -0
- vector_inspector/core/model_registry.py +205 -0
- vector_inspector/services/backup_restore_service.py +19 -29
- vector_inspector/services/credential_service.py +130 -0
- vector_inspector/services/filter_service.py +1 -1
- vector_inspector/services/profile_service.py +409 -0
- vector_inspector/services/settings_service.py +136 -1
- vector_inspector/ui/components/connection_manager_panel.py +327 -0
- vector_inspector/ui/components/profile_manager_panel.py +565 -0
- vector_inspector/ui/dialogs/__init__.py +6 -0
- vector_inspector/ui/dialogs/cross_db_migration.py +383 -0
- vector_inspector/ui/dialogs/embedding_config_dialog.py +315 -0
- vector_inspector/ui/dialogs/provider_type_dialog.py +189 -0
- vector_inspector/ui/main_window.py +456 -190
- vector_inspector/ui/views/connection_view.py +55 -10
- vector_inspector/ui/views/info_panel.py +272 -55
- vector_inspector/ui/views/metadata_view.py +71 -3
- vector_inspector/ui/views/search_view.py +44 -4
- vector_inspector/ui/views/visualization_view.py +19 -5
- {vector_inspector-0.2.6.dist-info → vector_inspector-0.3.1.dist-info}/METADATA +3 -1
- vector_inspector-0.3.1.dist-info/RECORD +55 -0
- vector_inspector-0.2.6.dist-info/RECORD +0 -35
- {vector_inspector-0.2.6.dist-info → vector_inspector-0.3.1.dist-info}/WHEEL +0 -0
- {vector_inspector-0.2.6.dist-info → vector_inspector-0.3.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,768 @@
|
|
|
1
|
+
"""Pinecone connection manager."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, List, Dict, Any
|
|
4
|
+
import time
|
|
5
|
+
from pinecone import Pinecone, ServerlessSpec
|
|
6
|
+
from pinecone.exceptions import PineconeException
|
|
7
|
+
|
|
8
|
+
from .base_connection import VectorDBConnection
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PineconeConnection(VectorDBConnection):
    """Manages connection to Pinecone and provides query interface."""

    def __init__(
        self,
        api_key: str,
        environment: Optional[str] = None,
        index_host: Optional[str] = None
    ):
        """
        Initialize Pinecone connection parameters.

        No network traffic happens here; call connect() to open a session.

        Args:
            api_key: Pinecone API key
            environment: Pinecone environment (optional, auto-detected)
            index_host: Specific index host URL (optional)
        """
        # Credentials / endpoint configuration supplied by the caller.
        self.api_key = api_key
        self.environment = environment
        self.index_host = index_host
        # Lazily populated: connect() sets the client, _get_index() the cache.
        self._client: Optional[Pinecone] = None
        self._current_index = None
        self._current_index_name: Optional[str] = None
|
35
|
+
def connect(self) -> bool:
|
|
36
|
+
"""
|
|
37
|
+
Establish connection to Pinecone.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
True if connection successful, False otherwise
|
|
41
|
+
"""
|
|
42
|
+
try:
|
|
43
|
+
# Initialize Pinecone client
|
|
44
|
+
self._client = Pinecone(api_key=self.api_key)
|
|
45
|
+
|
|
46
|
+
# Test connection by listing indexes
|
|
47
|
+
self._client.list_indexes()
|
|
48
|
+
return True
|
|
49
|
+
except Exception as e:
|
|
50
|
+
print(f"Connection failed: {e}")
|
|
51
|
+
self._client = None # Reset client on failure
|
|
52
|
+
return False
|
|
53
|
+
|
|
54
|
+
def disconnect(self):
|
|
55
|
+
"""Close connection to Pinecone."""
|
|
56
|
+
self._client = None
|
|
57
|
+
self._current_index = None
|
|
58
|
+
self._current_index_name = None
|
|
59
|
+
|
|
60
|
+
@property
|
|
61
|
+
def is_connected(self) -> bool:
|
|
62
|
+
"""Check if connected to Pinecone."""
|
|
63
|
+
return self._client is not None
|
|
64
|
+
|
|
65
|
+
def list_collections(self) -> List[str]:
|
|
66
|
+
"""
|
|
67
|
+
Get list of all indexes (collections in Pinecone terminology).
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
List of index names
|
|
71
|
+
"""
|
|
72
|
+
if not self._client:
|
|
73
|
+
return []
|
|
74
|
+
try:
|
|
75
|
+
indexes = self._client.list_indexes()
|
|
76
|
+
return [str(idx.name) for idx in indexes] # type: ignore
|
|
77
|
+
except Exception as e:
|
|
78
|
+
print(f"Failed to list indexes: {e}")
|
|
79
|
+
return []
|
|
80
|
+
|
|
81
|
+
def _get_index(self, name: str):
|
|
82
|
+
"""Get or create index reference."""
|
|
83
|
+
if not self._client:
|
|
84
|
+
return None
|
|
85
|
+
|
|
86
|
+
try:
|
|
87
|
+
# Cache the current index to avoid repeated lookups
|
|
88
|
+
if self._current_index_name != name:
|
|
89
|
+
self._current_index = self._client.Index(name)
|
|
90
|
+
self._current_index_name = name
|
|
91
|
+
return self._current_index
|
|
92
|
+
except Exception as e:
|
|
93
|
+
print(f"Failed to get index: {e}")
|
|
94
|
+
return None
|
|
95
|
+
|
|
96
|
+
    def get_collection_info(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get index metadata and statistics.

        Combines describe_index (dimension, metric, host, status, spec) with
        describe_index_stats (vector count) and, when the index is non-empty,
        probes one vector to discover which metadata keys are in use.

        Args:
            name: Index name

        Returns:
            Dictionary with index info, or None when disconnected or on failure
        """
        if not self._client:
            return None

        try:
            # Get index description
            index_description = self._client.describe_index(name)

            # Get index stats
            index = self._get_index(name)
            if not index:
                return None

            stats = index.describe_index_stats()

            # Extract information
            total_vector_count = stats.get('total_vector_count', 0)
            dimension = index_description.dimension
            metric = index_description.metric

            # Get metadata fields from a sample query (if vectors exist)
            metadata_fields = []
            if total_vector_count > 0:
                try:
                    # Query for a small sample to see metadata structure.
                    # NOTE(review): a zero vector is used as the probe; presumably
                    # any match suffices to expose the metadata keys — confirm this
                    # behaves for all metrics.
                    dimension_val = int(dimension) if dimension else 0
                    sample_query = index.query(
                        vector=[0.0] * dimension_val,
                        top_k=1,
                        include_metadata=True
                    )
                    if hasattr(sample_query, 'matches') and sample_query.matches:  # type: ignore
                        metadata = sample_query.matches[0].metadata  # type: ignore
                        if metadata:
                            metadata_fields = list(metadata.keys())
                except Exception:
                    pass  # Metadata fields will remain empty

            # Attribute access is defensively hasattr-guarded because the SDK's
            # description object shape varies between versions.
            return {
                "name": name,
                "count": total_vector_count,
                "metadata_fields": metadata_fields,
                "vector_dimension": dimension,
                "distance_metric": str(metric).upper() if metric else "UNKNOWN",
                "host": str(index_description.host) if hasattr(index_description, 'host') else "N/A",
                "status": index_description.status.get('state', 'unknown') if hasattr(index_description.status, 'get') else str(index_description.status),  # type: ignore
                "spec": str(index_description.spec) if hasattr(index_description, 'spec') else "N/A",
            }
        except Exception as e:
            print(f"Failed to get index info: {e}")
            return None
|
157
|
+
def create_collection(self, name: str, vector_size: int, distance: str = "Cosine") -> bool:
|
|
158
|
+
"""
|
|
159
|
+
Create a new index.
|
|
160
|
+
|
|
161
|
+
Args:
|
|
162
|
+
name: Index name
|
|
163
|
+
vector_size: Dimension of vectors
|
|
164
|
+
distance: Distance metric (Cosine, Euclidean, DotProduct)
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
True if successful, False otherwise
|
|
168
|
+
"""
|
|
169
|
+
if not self._client:
|
|
170
|
+
return False
|
|
171
|
+
|
|
172
|
+
try:
|
|
173
|
+
# Map distance names to Pinecone metrics
|
|
174
|
+
metric_map = {
|
|
175
|
+
"cosine": "cosine",
|
|
176
|
+
"euclidean": "euclidean",
|
|
177
|
+
"dotproduct": "dotproduct",
|
|
178
|
+
"dot": "dotproduct",
|
|
179
|
+
}
|
|
180
|
+
metric = metric_map.get(distance.lower(), "cosine")
|
|
181
|
+
|
|
182
|
+
# Create serverless index (default configuration)
|
|
183
|
+
self._client.create_index(
|
|
184
|
+
name=name,
|
|
185
|
+
dimension=vector_size,
|
|
186
|
+
metric=metric,
|
|
187
|
+
spec=ServerlessSpec(
|
|
188
|
+
cloud='aws',
|
|
189
|
+
region='us-east-1'
|
|
190
|
+
)
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
# Wait for index to be ready
|
|
194
|
+
max_wait = 60 # seconds
|
|
195
|
+
start_time = time.time()
|
|
196
|
+
while time.time() - start_time < max_wait:
|
|
197
|
+
desc = self._client.describe_index(name)
|
|
198
|
+
status = desc.status.get('state', 'unknown') if hasattr(desc.status, 'get') else str(desc.status) # type: ignore
|
|
199
|
+
if status.lower() == 'ready':
|
|
200
|
+
return True
|
|
201
|
+
time.sleep(2)
|
|
202
|
+
|
|
203
|
+
return False
|
|
204
|
+
except Exception as e:
|
|
205
|
+
print(f"Failed to create index: {e}")
|
|
206
|
+
return False
|
|
207
|
+
|
|
208
|
+
def add_items(
|
|
209
|
+
self,
|
|
210
|
+
collection_name: str,
|
|
211
|
+
documents: List[str],
|
|
212
|
+
metadatas: Optional[List[Dict[str, Any]]] = None,
|
|
213
|
+
ids: Optional[List[str]] = None,
|
|
214
|
+
embeddings: Optional[List[List[float]]] = None,
|
|
215
|
+
) -> bool:
|
|
216
|
+
"""
|
|
217
|
+
Add items to an index.
|
|
218
|
+
|
|
219
|
+
Args:
|
|
220
|
+
collection_name: Name of index
|
|
221
|
+
documents: Document texts (stored in metadata)
|
|
222
|
+
metadatas: Metadata for each vector
|
|
223
|
+
ids: IDs for each vector
|
|
224
|
+
embeddings: Pre-computed embeddings (required for Pinecone)
|
|
225
|
+
|
|
226
|
+
Returns:
|
|
227
|
+
True if successful, False otherwise
|
|
228
|
+
"""
|
|
229
|
+
if not embeddings:
|
|
230
|
+
print("Embeddings are required for Pinecone")
|
|
231
|
+
return False
|
|
232
|
+
|
|
233
|
+
index = self._get_index(collection_name)
|
|
234
|
+
if not index:
|
|
235
|
+
return False
|
|
236
|
+
|
|
237
|
+
try:
|
|
238
|
+
# Generate IDs if not provided
|
|
239
|
+
if not ids:
|
|
240
|
+
ids = [f"vec_{i}" for i in range(len(embeddings))]
|
|
241
|
+
|
|
242
|
+
# Prepare vectors for upsert
|
|
243
|
+
vectors = []
|
|
244
|
+
for i, embedding in enumerate(embeddings):
|
|
245
|
+
metadata = {}
|
|
246
|
+
if metadatas and i < len(metadatas):
|
|
247
|
+
metadata = metadatas[i].copy()
|
|
248
|
+
|
|
249
|
+
# Add document text to metadata
|
|
250
|
+
if documents and i < len(documents):
|
|
251
|
+
metadata['document'] = documents[i]
|
|
252
|
+
|
|
253
|
+
vectors.append({
|
|
254
|
+
'id': ids[i],
|
|
255
|
+
'values': embedding,
|
|
256
|
+
'metadata': metadata
|
|
257
|
+
})
|
|
258
|
+
|
|
259
|
+
# Upsert in batches of 100 (Pinecone limit)
|
|
260
|
+
batch_size = 100
|
|
261
|
+
for i in range(0, len(vectors), batch_size):
|
|
262
|
+
batch = vectors[i:i + batch_size]
|
|
263
|
+
index.upsert(vectors=batch)
|
|
264
|
+
|
|
265
|
+
return True
|
|
266
|
+
except Exception as e:
|
|
267
|
+
print(f"Failed to add items: {e}")
|
|
268
|
+
return False
|
|
269
|
+
|
|
270
|
+
def get_items(self, name: str, ids: List[str]) -> Dict[str, Any]:
|
|
271
|
+
"""
|
|
272
|
+
Retrieve items by IDs.
|
|
273
|
+
|
|
274
|
+
Args:
|
|
275
|
+
name: Index name
|
|
276
|
+
ids: List of vector IDs
|
|
277
|
+
|
|
278
|
+
Returns:
|
|
279
|
+
Dictionary with documents and metadatas
|
|
280
|
+
"""
|
|
281
|
+
index = self._get_index(name)
|
|
282
|
+
if not index:
|
|
283
|
+
return {"documents": [], "metadatas": []}
|
|
284
|
+
|
|
285
|
+
try:
|
|
286
|
+
# Fetch vectors
|
|
287
|
+
result = index.fetch(ids=ids)
|
|
288
|
+
|
|
289
|
+
documents = []
|
|
290
|
+
metadatas = []
|
|
291
|
+
|
|
292
|
+
for vid in ids:
|
|
293
|
+
if vid in result.vectors:
|
|
294
|
+
vector_data = result.vectors[vid]
|
|
295
|
+
metadata = vector_data.metadata or {}
|
|
296
|
+
|
|
297
|
+
# Extract document from metadata
|
|
298
|
+
doc = metadata.pop('document', '')
|
|
299
|
+
documents.append(doc)
|
|
300
|
+
metadatas.append(metadata)
|
|
301
|
+
else:
|
|
302
|
+
documents.append('')
|
|
303
|
+
metadatas.append({})
|
|
304
|
+
|
|
305
|
+
return {"documents": documents, "metadatas": metadatas}
|
|
306
|
+
except Exception as e:
|
|
307
|
+
print(f"Failed to get items: {e}")
|
|
308
|
+
return {"documents": [], "metadatas": []}
|
|
309
|
+
|
|
310
|
+
def delete_collection(self, name: str) -> bool:
|
|
311
|
+
"""
|
|
312
|
+
Delete an index.
|
|
313
|
+
|
|
314
|
+
Args:
|
|
315
|
+
name: Index name
|
|
316
|
+
|
|
317
|
+
Returns:
|
|
318
|
+
True if successful, False otherwise
|
|
319
|
+
"""
|
|
320
|
+
if not self._client:
|
|
321
|
+
return False
|
|
322
|
+
|
|
323
|
+
try:
|
|
324
|
+
self._client.delete_index(name)
|
|
325
|
+
if self._current_index_name == name:
|
|
326
|
+
self._current_index = None
|
|
327
|
+
self._current_index_name = None
|
|
328
|
+
return True
|
|
329
|
+
except Exception as e:
|
|
330
|
+
print(f"Failed to delete index: {e}")
|
|
331
|
+
return False
|
|
332
|
+
|
|
333
|
+
def count_collection(self, name: str) -> int:
|
|
334
|
+
"""
|
|
335
|
+
Return the number of vectors in the index.
|
|
336
|
+
|
|
337
|
+
Args:
|
|
338
|
+
name: Index name
|
|
339
|
+
|
|
340
|
+
Returns:
|
|
341
|
+
Number of vectors
|
|
342
|
+
"""
|
|
343
|
+
index = self._get_index(name)
|
|
344
|
+
if not index:
|
|
345
|
+
return 0
|
|
346
|
+
|
|
347
|
+
try:
|
|
348
|
+
stats = index.describe_index_stats()
|
|
349
|
+
return stats.get('total_vector_count', 0)
|
|
350
|
+
except Exception:
|
|
351
|
+
return 0
|
|
352
|
+
|
|
353
|
+
    def _get_embedding_function_for_collection(self, collection_name: str):
        """
        Returns embedding function and model type for a given collection, matching ChromaDB/Qdrant API.

        Resolution order: a user-configured model stored in settings for this
        (connection_id, collection) pair, else a model inferred from the
        index's vector dimension (defaulting to 384, MiniLM's size).
        """
        # Determine the index dimension, tolerating missing/non-numeric values.
        info = self.get_collection_info(collection_name)
        dim = info.get("vector_dimension") if info else None
        try:
            dim_int = int(dim) if dim is not None else None
        except Exception:
            dim_int = None

        # Prefer user-configured model for this collection.
        # Imports are deferred here, presumably to avoid import cycles and the
        # cost of loading embedding machinery on plain DB operations — confirm.
        from vector_inspector.services.settings_service import SettingsService
        model = None
        model_type: str = "sentence-transformer"
        if hasattr(self, "connection_id") and collection_name:
            settings = SettingsService()
            cfg = settings.get_embedding_model(getattr(self, "connection_id", ""), collection_name)
            if cfg and cfg.get("model") and cfg.get("type"):
                from vector_inspector.core.embedding_utils import load_embedding_model
                model = load_embedding_model(cfg["model"], cfg["type"])
                model_type = str(cfg["type"]) or "sentence-transformer"

        # Fallback to dimension-based model if none configured
        if model is None:
            from vector_inspector.core.embedding_utils import get_embedding_model_for_dimension
            if dim_int is None:
                dim_int = 384  # default for MiniLM
            loaded_model, _, inferred_type = get_embedding_model_for_dimension(dim_int)
            model = loaded_model
            model_type = str(inferred_type) or "sentence-transformer"

        from vector_inspector.core.embedding_utils import encode_text
        # Closure over the resolved model so callers only deal with text.
        def embedding_fn(text: str):
            return encode_text(text, model, model_type)

        return embedding_fn, model_type
|
391
|
+
    def query_collection(
        self,
        collection_name: str,
        query_texts: Optional[List[str]] = None,
        query_embeddings: Optional[List[List[float]]] = None,
        n_results: int = 10,
        where: Optional[Dict[str, Any]] = None,
        where_document: Optional[Dict[str, Any]] = None,
    ) -> Optional[Dict[str, Any]]:
        """
        Query an index for similar vectors.

        Results are shaped like ChromaDB's output: one inner list per query
        vector for each of ids/distances/documents/metadatas/embeddings.

        Args:
            collection_name: Name of index
            query_texts: Text queries (will be embedded if provided)
            query_embeddings: Query embedding vectors
            n_results: Number of results to return
            where: Metadata filter
            where_document: Document content filter (not directly supported)
        Returns:
            Query results or None if failed
        """

        # If query_embeddings not provided, but query_texts are, embed them using the embedding function
        if query_embeddings is None and query_texts:
            embedding_fn, _ = self._get_embedding_function_for_collection(collection_name)
            query_embeddings = [embedding_fn(q) for q in query_texts]
            query_texts = None

        if not query_embeddings:
            print("Query embeddings are required for Pinecone")
            return None

        index = self._get_index(collection_name)
        if not index:
            return None

        try:
            # Pinecone queries one vector at a time, so accumulate per-query
            # result lists and return them nested (Chroma-style).
            all_ids = []
            all_distances = []
            all_documents = []
            all_metadatas = []
            all_embeddings = []

            for query_vector in query_embeddings:
                # Build filter if provided
                filter_dict = None
                if where:
                    filter_dict = self._convert_filter(where)

                result = index.query(
                    vector=query_vector,
                    top_k=n_results,
                    include_metadata=True,
                    include_values=True,
                    filter=filter_dict
                )

                # Extract results
                ids = []
                distances = []
                documents = []
                metadatas = []
                embeddings = []

                if hasattr(result, 'matches'):
                    for match in result.matches:  # type: ignore
                        ids.append(match.id)  # type: ignore
                        # Convert similarity to distance for cosine metric.
                        # NOTE(review): 1 - score is only a distance for
                        # similarity-style scores (cosine/dotproduct); for a
                        # euclidean index the score is already a distance —
                        # confirm desired behavior per metric.
                        score = getattr(match, 'score', None)
                        if score is not None:
                            distances.append(1.0 - score)
                        else:
                            distances.append(None)

                        # Document text is stashed in metadata by add_items.
                        metadata = match.metadata or {}  # type: ignore
                        doc = metadata.pop('document', '')
                        documents.append(doc)
                        metadatas.append(metadata)

                        if hasattr(match, 'values') and match.values:  # type: ignore
                            embeddings.append(match.values)  # type: ignore
                        else:
                            embeddings.append([])

                all_ids.append(ids)
                all_distances.append(distances)
                all_documents.append(documents)
                all_metadatas.append(metadatas)
                all_embeddings.append(embeddings)

            return {
                "ids": all_ids,
                "distances": all_distances,
                "documents": all_documents,
                "metadatas": all_metadatas,
                "embeddings": all_embeddings,
            }
        except Exception as e:
            print(f"Query failed: {e}")
            import traceback
            traceback.print_exc()
            return None
|
496
|
+
def _convert_filter(self, where: Dict[str, Any]) -> Dict[str, Any]:
|
|
497
|
+
"""
|
|
498
|
+
Convert generic filter to Pinecone filter format.
|
|
499
|
+
|
|
500
|
+
Pinecone supports: $eq, $ne, $gt, $gte, $lt, $lte, $in, $nin
|
|
501
|
+
"""
|
|
502
|
+
# Simple conversion - map field equality
|
|
503
|
+
# For more complex filters, this would need expansion
|
|
504
|
+
pinecone_filter = {}
|
|
505
|
+
|
|
506
|
+
for key, value in where.items():
|
|
507
|
+
if isinstance(value, dict):
|
|
508
|
+
# Handle operator-based filters
|
|
509
|
+
pinecone_filter[key] = value
|
|
510
|
+
else:
|
|
511
|
+
# Simple equality
|
|
512
|
+
pinecone_filter[key] = {"$eq": value}
|
|
513
|
+
|
|
514
|
+
return pinecone_filter
|
|
515
|
+
|
|
516
|
+
    def get_all_items(
        self,
        collection_name: str,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        where: Optional[Dict[str, Any]] = None,
    ) -> Optional[Dict[str, Any]]:
        """
        Get all items from an index using pagination.

        Note: Uses Pinecone's list() method which returns a generator of ID lists.
        Offset-based pagination is simulated by skipping items.

        Args:
            collection_name: Name of index
            limit: Maximum number of items to return (defaults to 100)
            offset: Number of items to skip
            where: Metadata filter (not supported in list operation)

        Returns:
            Index items, an empty result dict on error, or None if the index
            handle could not be obtained
        """
        index = self._get_index(collection_name)
        if not index:
            return None

        try:
            # Phase 1: walk the ID generator, skipping `offset` IDs and then
            # collecting up to `limit` of them.
            ids_to_fetch = []
            items_collected = 0
            items_skipped = 0
            target_offset = offset or 0
            target_limit = limit or 100

            # list() returns a generator that yields lists of IDs
            for id_list in index.list():  # type: ignore
                if not id_list:
                    continue

                # Handle offset by skipping items
                for vid in id_list:
                    if items_skipped < target_offset:
                        items_skipped += 1
                        continue

                    if items_collected < target_limit:
                        ids_to_fetch.append(vid)
                        items_collected += 1
                    else:
                        break

                # Stop if we have enough
                if items_collected >= target_limit:
                    break

            # If no IDs found, return empty result
            if not ids_to_fetch:
                return {
                    "ids": [],
                    "documents": [],
                    "metadatas": [],
                    "embeddings": []
                }

            # Phase 2: fetch the actual vector data in batches (Pinecone fetch
            # limit is 1000). IDs that vanished between list and fetch are
            # simply omitted from the output.
            batch_size = 1000
            all_ids = []
            all_documents = []
            all_metadatas = []
            all_embeddings = []

            for i in range(0, len(ids_to_fetch), batch_size):
                batch_ids = ids_to_fetch[i:i + batch_size]
                fetch_result = index.fetch(ids=batch_ids)

                for vid in batch_ids:
                    if vid in fetch_result.vectors:
                        vector_data = fetch_result.vectors[vid]
                        all_ids.append(vid)

                        # Document text is stored in metadata by add_items;
                        # copy before pop so the SDK object is not mutated.
                        metadata = vector_data.metadata.copy() if vector_data.metadata else {}
                        doc = metadata.pop('document', '')
                        all_documents.append(doc)
                        all_metadatas.append(metadata)
                        all_embeddings.append(vector_data.values)

            return {
                "ids": all_ids,
                "documents": all_documents,
                "metadatas": all_metadatas,
                "embeddings": all_embeddings
            }

        except Exception as e:
            print(f"Failed to get all items: {e}")
            import traceback
            traceback.print_exc()
            return {
                "ids": [],
                "documents": [],
                "metadatas": [],
                "embeddings": []
            }
|
619
|
+
def update_items(
|
|
620
|
+
self,
|
|
621
|
+
collection_name: str,
|
|
622
|
+
ids: List[str],
|
|
623
|
+
documents: Optional[List[str]] = None,
|
|
624
|
+
metadatas: Optional[List[Dict[str, Any]]] = None,
|
|
625
|
+
embeddings: Optional[List[List[float]]] = None,
|
|
626
|
+
) -> bool:
|
|
627
|
+
"""
|
|
628
|
+
Update items in an index.
|
|
629
|
+
|
|
630
|
+
Note: Pinecone updates via upsert (add_items can be used)
|
|
631
|
+
|
|
632
|
+
Args:
|
|
633
|
+
collection_name: Name of index
|
|
634
|
+
ids: IDs of items to update
|
|
635
|
+
documents: New document texts
|
|
636
|
+
metadatas: New metadata
|
|
637
|
+
embeddings: New embeddings
|
|
638
|
+
|
|
639
|
+
Returns:
|
|
640
|
+
True if successful, False otherwise
|
|
641
|
+
"""
|
|
642
|
+
index = self._get_index(collection_name)
|
|
643
|
+
if not index:
|
|
644
|
+
return False
|
|
645
|
+
|
|
646
|
+
try:
|
|
647
|
+
# Fetch existing vectors to preserve data not being updated
|
|
648
|
+
existing = index.fetch(ids=ids)
|
|
649
|
+
|
|
650
|
+
vectors = []
|
|
651
|
+
for i, vid in enumerate(ids):
|
|
652
|
+
# Start with existing data
|
|
653
|
+
if vid in existing.vectors:
|
|
654
|
+
vector_data = existing.vectors[vid]
|
|
655
|
+
values = vector_data.values if embeddings is None else embeddings[i]
|
|
656
|
+
metadata = vector_data.metadata.copy() if vector_data.metadata else {}
|
|
657
|
+
else:
|
|
658
|
+
# New vector
|
|
659
|
+
if embeddings is None or i >= len(embeddings):
|
|
660
|
+
continue
|
|
661
|
+
values = embeddings[i]
|
|
662
|
+
metadata = {}
|
|
663
|
+
|
|
664
|
+
# Update metadata
|
|
665
|
+
if metadatas and i < len(metadatas):
|
|
666
|
+
metadata.update(metadatas[i])
|
|
667
|
+
|
|
668
|
+
# Update document
|
|
669
|
+
if documents and i < len(documents):
|
|
670
|
+
metadata['document'] = documents[i]
|
|
671
|
+
|
|
672
|
+
vectors.append({
|
|
673
|
+
'id': vid,
|
|
674
|
+
'values': values,
|
|
675
|
+
'metadata': metadata
|
|
676
|
+
})
|
|
677
|
+
|
|
678
|
+
# Upsert in batches
|
|
679
|
+
batch_size = 100
|
|
680
|
+
for i in range(0, len(vectors), batch_size):
|
|
681
|
+
batch = vectors[i:i + batch_size]
|
|
682
|
+
index.upsert(vectors=batch)
|
|
683
|
+
|
|
684
|
+
return True
|
|
685
|
+
except Exception as e:
|
|
686
|
+
print(f"Failed to update items: {e}")
|
|
687
|
+
return False
|
|
688
|
+
|
|
689
|
+
def delete_items(
|
|
690
|
+
self,
|
|
691
|
+
collection_name: str,
|
|
692
|
+
ids: Optional[List[str]] = None,
|
|
693
|
+
where: Optional[Dict[str, Any]] = None,
|
|
694
|
+
) -> bool:
|
|
695
|
+
"""
|
|
696
|
+
Delete items from an index.
|
|
697
|
+
|
|
698
|
+
Args:
|
|
699
|
+
collection_name: Name of index
|
|
700
|
+
ids: IDs of items to delete
|
|
701
|
+
where: Metadata filter for items to delete
|
|
702
|
+
|
|
703
|
+
Returns:
|
|
704
|
+
True if successful, False otherwise
|
|
705
|
+
"""
|
|
706
|
+
index = self._get_index(collection_name)
|
|
707
|
+
if not index:
|
|
708
|
+
return False
|
|
709
|
+
|
|
710
|
+
try:
|
|
711
|
+
if ids:
|
|
712
|
+
# Delete by IDs
|
|
713
|
+
index.delete(ids=ids)
|
|
714
|
+
elif where:
|
|
715
|
+
# Delete by filter
|
|
716
|
+
filter_dict = self._convert_filter(where)
|
|
717
|
+
index.delete(filter=filter_dict)
|
|
718
|
+
else:
|
|
719
|
+
# Delete all (use with caution)
|
|
720
|
+
index.delete(delete_all=True)
|
|
721
|
+
|
|
722
|
+
return True
|
|
723
|
+
except Exception as e:
|
|
724
|
+
print(f"Failed to delete items: {e}")
|
|
725
|
+
return False
|
|
726
|
+
|
|
727
|
+
def get_connection_info(self) -> Dict[str, Any]:
|
|
728
|
+
"""
|
|
729
|
+
Get information about the current connection.
|
|
730
|
+
|
|
731
|
+
Returns:
|
|
732
|
+
Dictionary with connection details
|
|
733
|
+
"""
|
|
734
|
+
info = {
|
|
735
|
+
"provider": "Pinecone",
|
|
736
|
+
"connected": self.is_connected
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
if self.is_connected and self._client:
|
|
740
|
+
try:
|
|
741
|
+
# Get account/environment info if available
|
|
742
|
+
indexes = self._client.list_indexes()
|
|
743
|
+
info["index_count"] = len(indexes)
|
|
744
|
+
except Exception:
|
|
745
|
+
pass
|
|
746
|
+
|
|
747
|
+
return info
|
|
748
|
+
|
|
749
|
+
def get_supported_filter_operators(self) -> List[Dict[str, Any]]:
|
|
750
|
+
"""
|
|
751
|
+
Get filter operators supported by Pinecone.
|
|
752
|
+
|
|
753
|
+
Returns:
|
|
754
|
+
List of operator dictionaries
|
|
755
|
+
"""
|
|
756
|
+
return [
|
|
757
|
+
{"name": "=", "server_side": True},
|
|
758
|
+
{"name": "!=", "server_side": True},
|
|
759
|
+
{"name": ">", "server_side": True},
|
|
760
|
+
{"name": ">=", "server_side": True},
|
|
761
|
+
{"name": "<", "server_side": True},
|
|
762
|
+
{"name": "<=", "server_side": True},
|
|
763
|
+
{"name": "in", "server_side": True},
|
|
764
|
+
{"name": "not in", "server_side": True},
|
|
765
|
+
# Client-side only operators
|
|
766
|
+
{"name": "contains", "server_side": False},
|
|
767
|
+
{"name": "not contains", "server_side": False},
|
|
768
|
+
]
|