matrixone-python-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrixone/__init__.py +155 -0
- matrixone/account.py +723 -0
- matrixone/async_client.py +3913 -0
- matrixone/async_metadata_manager.py +311 -0
- matrixone/async_orm.py +123 -0
- matrixone/async_vector_index_manager.py +633 -0
- matrixone/base_client.py +208 -0
- matrixone/client.py +4672 -0
- matrixone/config.py +452 -0
- matrixone/connection_hooks.py +286 -0
- matrixone/exceptions.py +89 -0
- matrixone/logger.py +782 -0
- matrixone/metadata.py +820 -0
- matrixone/moctl.py +219 -0
- matrixone/orm.py +2277 -0
- matrixone/pitr.py +646 -0
- matrixone/pubsub.py +771 -0
- matrixone/restore.py +411 -0
- matrixone/search_vector_index.py +1176 -0
- matrixone/snapshot.py +550 -0
- matrixone/sql_builder.py +844 -0
- matrixone/sqlalchemy_ext/__init__.py +161 -0
- matrixone/sqlalchemy_ext/adapters.py +163 -0
- matrixone/sqlalchemy_ext/dialect.py +534 -0
- matrixone/sqlalchemy_ext/fulltext_index.py +895 -0
- matrixone/sqlalchemy_ext/fulltext_search.py +1686 -0
- matrixone/sqlalchemy_ext/hnsw_config.py +194 -0
- matrixone/sqlalchemy_ext/ivf_config.py +252 -0
- matrixone/sqlalchemy_ext/table_builder.py +351 -0
- matrixone/sqlalchemy_ext/vector_index.py +1721 -0
- matrixone/sqlalchemy_ext/vector_type.py +948 -0
- matrixone/version.py +580 -0
- matrixone_python_sdk-0.1.0.dist-info/METADATA +706 -0
- matrixone_python_sdk-0.1.0.dist-info/RECORD +122 -0
- matrixone_python_sdk-0.1.0.dist-info/WHEEL +5 -0
- matrixone_python_sdk-0.1.0.dist-info/entry_points.txt +5 -0
- matrixone_python_sdk-0.1.0.dist-info/licenses/LICENSE +200 -0
- matrixone_python_sdk-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +19 -0
- tests/offline/__init__.py +20 -0
- tests/offline/conftest.py +77 -0
- tests/offline/test_account.py +703 -0
- tests/offline/test_async_client_query_comprehensive.py +1218 -0
- tests/offline/test_basic.py +54 -0
- tests/offline/test_case_sensitivity.py +227 -0
- tests/offline/test_connection_hooks_offline.py +287 -0
- tests/offline/test_dialect_schema_handling.py +609 -0
- tests/offline/test_explain_methods.py +346 -0
- tests/offline/test_filter_logical_in.py +237 -0
- tests/offline/test_fulltext_search_comprehensive.py +795 -0
- tests/offline/test_ivf_config.py +249 -0
- tests/offline/test_join_methods.py +281 -0
- tests/offline/test_join_sqlalchemy_compatibility.py +276 -0
- tests/offline/test_logical_in_method.py +237 -0
- tests/offline/test_matrixone_version_parsing.py +264 -0
- tests/offline/test_metadata_offline.py +557 -0
- tests/offline/test_moctl.py +300 -0
- tests/offline/test_moctl_simple.py +251 -0
- tests/offline/test_model_support_offline.py +359 -0
- tests/offline/test_model_support_simple.py +225 -0
- tests/offline/test_pinecone_filter_offline.py +377 -0
- tests/offline/test_pitr.py +585 -0
- tests/offline/test_pubsub.py +712 -0
- tests/offline/test_query_update.py +283 -0
- tests/offline/test_restore.py +445 -0
- tests/offline/test_snapshot_comprehensive.py +384 -0
- tests/offline/test_sql_escaping_edge_cases.py +551 -0
- tests/offline/test_sqlalchemy_integration.py +382 -0
- tests/offline/test_sqlalchemy_vector_integration.py +434 -0
- tests/offline/test_table_builder.py +198 -0
- tests/offline/test_unified_filter.py +398 -0
- tests/offline/test_unified_transaction.py +495 -0
- tests/offline/test_vector_index.py +238 -0
- tests/offline/test_vector_operations.py +688 -0
- tests/offline/test_vector_type.py +174 -0
- tests/offline/test_version_core.py +328 -0
- tests/offline/test_version_management.py +372 -0
- tests/offline/test_version_standalone.py +652 -0
- tests/online/__init__.py +20 -0
- tests/online/conftest.py +216 -0
- tests/online/test_account_management.py +194 -0
- tests/online/test_advanced_features.py +344 -0
- tests/online/test_async_client_interfaces.py +330 -0
- tests/online/test_async_client_online.py +285 -0
- tests/online/test_async_model_insert_online.py +293 -0
- tests/online/test_async_orm_online.py +300 -0
- tests/online/test_async_simple_query_online.py +802 -0
- tests/online/test_async_transaction_simple_query.py +300 -0
- tests/online/test_basic_connection.py +130 -0
- tests/online/test_client_online.py +238 -0
- tests/online/test_config.py +90 -0
- tests/online/test_config_validation.py +123 -0
- tests/online/test_connection_hooks_new_online.py +217 -0
- tests/online/test_dialect_schema_handling_online.py +331 -0
- tests/online/test_filter_logical_in_online.py +374 -0
- tests/online/test_fulltext_comprehensive.py +1773 -0
- tests/online/test_fulltext_label_online.py +433 -0
- tests/online/test_fulltext_search_online.py +842 -0
- tests/online/test_ivf_stats_online.py +506 -0
- tests/online/test_logger_integration.py +311 -0
- tests/online/test_matrixone_query_orm.py +540 -0
- tests/online/test_metadata_online.py +579 -0
- tests/online/test_model_insert_online.py +255 -0
- tests/online/test_mysql_driver_validation.py +213 -0
- tests/online/test_orm_advanced_features.py +2022 -0
- tests/online/test_orm_cte_integration.py +269 -0
- tests/online/test_orm_online.py +270 -0
- tests/online/test_pinecone_filter.py +708 -0
- tests/online/test_pubsub_operations.py +352 -0
- tests/online/test_query_methods.py +225 -0
- tests/online/test_query_update_online.py +433 -0
- tests/online/test_search_vector_index.py +557 -0
- tests/online/test_simple_fulltext_online.py +915 -0
- tests/online/test_snapshot_comprehensive.py +998 -0
- tests/online/test_sqlalchemy_engine_integration.py +336 -0
- tests/online/test_sqlalchemy_integration.py +425 -0
- tests/online/test_transaction_contexts.py +1219 -0
- tests/online/test_transaction_insert_methods.py +356 -0
- tests/online/test_transaction_query_methods.py +288 -0
- tests/online/test_unified_filter_online.py +529 -0
- tests/online/test_vector_comprehensive.py +706 -0
- tests/online/test_version_management.py +291 -0
@@ -0,0 +1,1176 @@
# Copyright 2021 - 2022 Matrix Origin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
SearchVectorIndex - a Pinecone-compatible vector search interface for MatrixOne.

This module provides a high-level interface for vector search operations that is
compatible with Pinecone's API, making it easy to migrate from Pinecone to MatrixOne.

Key Features:
- Pinecone-compatible API for seamless migration
- Support for both IVF and HNSW vector indexes
- Metadata filtering with complex query syntax
- Vector upsert and delete operations (IVF only)
- Synchronous and asynchronous operation support
- Automatic index type detection and configuration

Supported Operations:
- Vector similarity search with multiple distance metrics
- Metadata filtering with Pinecone-compatible syntax
- Vector upsert (insert/update) operations
- Vector deletion by ID
- Index statistics and information

Index Types:
- IVF (Inverted File): supports full CRUD operations; good for frequent updates
- HNSW (Hierarchical Navigable Small World): read-only, optimized for search performance

Usage Example:

    # Get a Pinecone-compatible index
    index = client.get_pinecone_index("my_table", "embedding_column")

    # Query vectors with metadata filtering
    results = index.query(
        vector=[0.1, 0.2, 0.3, ...],
        top_k=10,
        include_metadata=True,
        filter={"category": "technology", "price": {"$gte": 100}}
    )

    # Process results
    for match in results.matches:
        print(f"ID: {match.id}, Score: {match.score}")
        print(f"Metadata: {match.metadata}")

    # Upsert vectors (IVF index only)
    index.upsert([
        {"id": "doc1", "embedding": [0.1, 0.2, ...], "title": "Document 1"},
        {"id": "doc2", "embedding": [0.3, 0.4, ...], "title": "Document 2"}
    ])

    # Delete vectors (IVF index only)
    index.delete(["doc1", "doc2"])
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class VectorMatch:
    """
    Represents a single vector search match result.

    Attributes:

        id: Unique identifier for the vector (primary key value as string)
        score: Similarity score (lower is more similar for L2 distance)
        metadata: Dictionary containing all metadata fields from the table
        values: Optional vector values if include_values=True in the query
    """

    id: str
    score: float
    metadata: Dict[str, Any]
    values: Optional[List[float]] = None


@dataclass
class QueryResponse:
    """
    Represents a query response compatible with the Pinecone API.

    Attributes:

        matches: List of VectorMatch objects containing search results
        namespace: Namespace identifier (empty string for MatrixOne)
        usage: Optional usage statistics (e.g., {"read_units": 10})
    """

    matches: List[VectorMatch]
    namespace: str = ""
    usage: Optional[Dict[str, Any]] = None


class PineconeCompatibleIndex:
    """
    A Pinecone-compatible vector search interface for MatrixOne.

    This class provides a high-level interface for vector search operations
    that is compatible with Pinecone's API, making it easy to migrate from
    Pinecone to MatrixOne.

    Features:
    - Vector similarity search with multiple distance metrics (L2, cosine, inner product)
    - Metadata filtering with Pinecone-compatible filter syntax
    - Vector upsert and delete operations (IVF index only)
    - Support for both synchronous and asynchronous operations
    - Automatic index type detection (IVF/HNSW)
    - Case-insensitive column name handling

    Supported Index Types:
    - IVF (Inverted File): supports upsert/delete operations; good for frequent updates
    - HNSW (Hierarchical Navigable Small World): read-only, optimized for search performance

    Example:

        # Get a Pinecone-compatible index
        index = client.get_pinecone_index("my_table", "embedding_column")

        # Query vectors
        results = index.query(
            vector=[0.1, 0.2, 0.3, ...],
            top_k=10,
            include_metadata=True,
            filter={"category": "technology", "price": {"$gte": 100}}
        )

        # Upsert vectors (IVF index only)
        index.upsert([
            {"id": "doc1", "embedding": [0.1, 0.2, ...], "title": "Document 1"},
            {"id": "doc2", "embedding": [0.3, 0.4, ...], "title": "Document 2"}
        ])

        # Delete vectors (IVF index only)
        index.delete(["doc1", "doc2"])
    """

    def __init__(self, client, table_name: str, vector_column: str):
        """
        Initialize PineconeCompatibleIndex.

        Args:

            client: MatrixOne client instance (Client or AsyncClient)
            table_name: Name of the table containing vectors
            vector_column: Name of the vector column containing embeddings

        Note:

            The table must already exist and contain a vector column.
            The primary key column is detected automatically.
            Metadata columns are all non-primary-key, non-vector columns.
        """
        self.client = client
        self.table_name = table_name
        self.vector_column = vector_column
        self._index_info = None
        self._metadata_columns = None  # Will be auto-detected
        self._id_column = None  # Will be auto-detected as primary key

    @property
    def metadata_columns(self):
        """Get metadata columns (all columns except the id and vector columns)."""
        # Check if this is an async client
        if hasattr(self.client, "execute") and hasattr(self.client.execute, "__call__"):
            import asyncio

            if asyncio.iscoroutinefunction(self.client.execute):
                raise RuntimeError("Use _get_metadata_columns_async() for async clients")
        return self._get_metadata_columns()

    def _get_id_column(self):
        """Get the primary key column name."""
        if self._id_column is not None:
            return self._id_column

        # Check if this is an async client
        if hasattr(self.client, "execute") and hasattr(self.client.execute, "__call__"):
            # Check if execute returns a coroutine (async client)
            import asyncio

            if asyncio.iscoroutinefunction(self.client.execute):
                raise RuntimeError("Use _get_id_column_async() for async clients")

        # Get the table schema to find the primary key
        schema_result = self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            # Fall back to 'id' if the table is not found
            self._id_column = "id"
            return self._id_column

        # Find the primary key column
        for row in schema_result.rows:
            column_name = row[0]
            key_info = row[3] if len(row) > 3 else ""  # Key column
            if "PRI" in key_info.upper():
                self._id_column = column_name
                return self._id_column

        # Fall back to 'id' if no primary key is found
        self._id_column = "id"
        return self._id_column

    async def _get_id_column_async(self):
        """Get the primary key column name - async version."""
        if self._id_column is not None:
            return self._id_column

        # Get the table schema to find the primary key
        schema_result = await self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            # Fall back to 'id' if the table is not found
            self._id_column = "id"
            return self._id_column

        # Find the primary key column
        for row in schema_result.rows:
            column_name = row[0]
            key_info = row[3] if len(row) > 3 else ""  # Key column
            if "PRI" in key_info.upper():
                self._id_column = column_name
                return self._id_column

        # Fall back to 'id' if no primary key is found
        self._id_column = "id"
        return self._id_column

    def _get_metadata_columns(self):
        """Get metadata columns (all columns except the id and vector columns)."""
        if self._metadata_columns is not None:
            return self._metadata_columns

        # Get the table schema
        schema_result = self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            self._metadata_columns = []
            return self._metadata_columns

        # Extract column names, excluding the id and vector columns
        all_columns = [row[0] for row in schema_result.rows]
        id_column = self._get_id_column()
        self._metadata_columns = [
            col for col in all_columns if col.lower() not in [id_column.lower(), self.vector_column.lower()]
        ]
        return self._metadata_columns

    async def _get_metadata_columns_async(self):
        """Get metadata columns (all columns except the id and vector columns) - async version."""
        if self._metadata_columns is not None:
            return self._metadata_columns

        # Get the table schema
        schema_result = await self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            self._metadata_columns = []
            return self._metadata_columns

        # Extract column names, excluding the id and vector columns
        all_columns = [row[0] for row in schema_result.rows]
        id_column = await self._get_id_column_async()
        self._metadata_columns = [
            col for col in all_columns if col.lower() not in [id_column.lower(), self.vector_column.lower()]
        ]
        return self._metadata_columns

    async def _get_index_info_async(self):
        """Get index information for an async client."""
        if self._index_info is not None:
            return self._index_info

        # Get the table schema
        schema_result = await self.client.execute(f"SHOW CREATE TABLE {self.table_name}")
        if not schema_result.rows:
            raise ValueError(f"Table {self.table_name} not found")

        create_sql = schema_result.rows[0][1]  # Second column contains the CREATE statement

        # Parse vector index information from the CREATE statement
        self._index_info = self._parse_index_info(create_sql)
        return self._index_info

    def _get_index_info(self):
        """Get index information for a sync client."""
        if self._index_info is not None:
            return self._index_info

        # Get the table schema
        schema_result = self.client.execute(f"SHOW CREATE TABLE {self.table_name}")
        if not schema_result.rows:
            raise ValueError(f"Table {self.table_name} not found")

        create_sql = schema_result.rows[0][1]  # Second column contains the CREATE statement

        # Parse vector index information from the CREATE statement
        self._index_info = self._parse_index_info(create_sql)
        return self._index_info

    def _parse_index_info(self, create_sql: str) -> Dict[str, Any]:
        """
        Parse vector index information from a CREATE TABLE statement.

        Args:

            create_sql: CREATE TABLE SQL statement

        Returns:

            Dictionary containing index information
        """
        index_info = {
            "algorithm": "ivf",  # default
            "metric": "l2",  # default
            "dimensions": None,
            "parameters": {},
        }

        # Extract the vector column definition
        vector_pattern = rf"`?{self.vector_column}`?\s+vec(?:f32|f64)\s*\(\s*(\d+)\s*\)"
        vector_match = re.search(vector_pattern, create_sql, re.IGNORECASE)
        if vector_match:
            index_info["dimensions"] = int(vector_match.group(1))

        # Extract index creation statements - match both CREATE INDEX and KEY definitions
        index_pattern = (
            r"(?:CREATE\s+(?:INDEX|VECTOR\s+INDEX)\s+(\w+)\s+ON\s+\w+\s*\([^)]+\)\s+USING\s+(\w+)"
            r"(?:\s+WITH\s+\(([^)]+)\))?|KEY\s+`?(\w+)`?\s+USING\s+(\w+)\s+\([^)]+\)\s+([^,\n]+))"
        )
        index_matches = re.findall(index_pattern, create_sql, re.IGNORECASE | re.MULTILINE)

        for match in index_matches:
            # Handle both CREATE INDEX and KEY formats. With multiple groups,
            # re.findall returns a 6-tuple for every match, so pick the
            # alternative whose groups are non-empty.
            if match[1]:  # CREATE INDEX format
                algorithm, params = match[1], match[2]
            else:  # KEY format
                algorithm, params = match[4], match[5]

            if self.vector_column in create_sql:
                index_info["algorithm"] = algorithm.lower()

                # Parse parameters
                if params:
                    # Parse parameters like "m = 16 ef_construction = 200 ef_search = 50 op_type 'vector_l2_ops'"
                    param_pairs = re.findall(r"(\w+)\s*=\s*([^,\s]+)", params)
                    for key, value in param_pairs:
                        # Convert string values to appropriate types
                        value = value.strip().strip("'\"")
                        if value.isdigit():
                            index_info["parameters"][key] = int(value)
                        elif value.replace(".", "").isdigit():
                            index_info["parameters"][key] = float(value)
                        else:
                            index_info["parameters"][key] = value

                    # Parse the op_type parameter
                    op_type_match = re.search(r"op_type\s+'([^']+)'", params)
                    if op_type_match:
                        op_type = op_type_match.group(1)
                        if "cosine" in op_type.lower():
                            index_info["metric"] = "cosine"
                        elif "ip" in op_type.lower():
                            index_info["metric"] = "ip"
                        else:
                            index_info["metric"] = "l2"
                break

        return index_info
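    # A sketch of what _parse_index_info recovers from SHOW CREATE TABLE output.
    # The DDL below is illustrative only; the exact text and index-option syntax
    # depend on the MatrixOne version. Given:
    #
    #   CREATE TABLE `docs` (
    #       `id` INT PRIMARY KEY,
    #       `embedding` vecf32(4),
    #       KEY `idx_emb` USING ivfflat (`embedding`) lists = 16 op_type 'vector_l2_ops'
    #   )
    #
    # the parser would return:
    #
    #   {"algorithm": "ivfflat", "metric": "l2", "dimensions": 4,
    #    "parameters": {"lists": 16}}
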
    def _parse_pinecone_filter(self, filter_dict: Dict[str, Any]) -> tuple:
        """
        Parse a Pinecone-compatible filter into SQL WHERE conditions and parameters.

        Args:

            filter_dict: Pinecone filter dictionary

        Returns:

            Tuple of (where_conditions, where_params)
        """
        if not filter_dict:
            return [], []

        where_conditions = []
        where_params = []

        def parse_condition(key: str, value: Any) -> str:
            """Parse a single filter condition."""
            if isinstance(value, dict):
                # Handle operators like $eq, $in, $gte, etc.
                if "$eq" in value:
                    where_params.append(value["$eq"])
                    return f"{key} = ?"
                elif "$ne" in value:
                    where_params.append(value["$ne"])
                    return f"{key} != ?"
                elif "$in" in value:
                    if not value["$in"]:  # Empty list
                        return "1=0"  # Always-false condition
                    placeholders = ",".join(["?" for _ in value["$in"]])
                    where_params.extend(value["$in"])
                    return f"{key} IN ({placeholders})"
                elif "$nin" in value:
                    if not value["$nin"]:  # Empty list
                        return "1=1"  # Always-true condition
                    placeholders = ",".join(["?" for _ in value["$nin"]])
                    where_params.extend(value["$nin"])
                    return f"{key} NOT IN ({placeholders})"
                elif "$gt" in value:
                    where_params.append(value["$gt"])
                    return f"{key} > ?"
                elif "$gte" in value:
                    where_params.append(value["$gte"])
                    return f"{key} >= ?"
                elif "$lt" in value:
                    where_params.append(value["$lt"])
                    return f"{key} < ?"
                elif "$lte" in value:
                    where_params.append(value["$lte"])
                    return f"{key} <= ?"
                elif "$and" in value:
                    # Handle nested $and conditions
                    and_conditions = []
                    for condition in value["$and"]:
                        for sub_key, sub_value in condition.items():
                            and_conditions.append(parse_condition(sub_key, sub_value))
                    return f"({' AND '.join(and_conditions)})"
                elif "$or" in value:
                    # Handle nested $or conditions
                    or_conditions = []
                    for condition in value["$or"]:
                        for sub_key, sub_value in condition.items():
                            or_conditions.append(parse_condition(sub_key, sub_value))
                    return f"({' OR '.join(or_conditions)})"
                else:
                    raise ValueError(f"Unsupported operator in filter: {list(value.keys())}")
            else:
                # Direct value comparison (equivalent to $eq)
                where_params.append(value)
                return f"{key} = ?"

        def parse_nested_condition(condition_dict: dict) -> str:
            """Parse a nested condition that might contain $and or $or."""
            if "$and" in condition_dict:
                and_conditions = []
                for condition in condition_dict["$and"]:
                    and_conditions.append(parse_nested_condition(condition))
                return f"({' AND '.join(and_conditions)})"
            elif "$or" in condition_dict:
                or_conditions = []
                for condition in condition_dict["$or"]:
                    or_conditions.append(parse_nested_condition(condition))
                return f"({' OR '.join(or_conditions)})"
            else:
                # This is a simple condition; parse it normally
                conditions = []
                for key, value in condition_dict.items():
                    conditions.append(parse_condition(key, value))
                return " AND ".join(conditions)

        # Parse top-level conditions
        for key, value in filter_dict.items():
            if key == "$and":
                # Handle top-level $and
                and_conditions = []
                for condition in value:
                    and_conditions.append(parse_nested_condition(condition))
                where_conditions.append(f"({' AND '.join(and_conditions)})")
            elif key == "$or":
                # Handle top-level $or
                or_conditions = []
                for condition in value:
                    or_conditions.append(parse_nested_condition(condition))
                where_conditions.append(f"({' OR '.join(or_conditions)})")
            else:
                condition = parse_condition(key, value)
                where_conditions.append(condition)

        return where_conditions, where_params
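    # A worked example of the translation above (values are illustrative):
    #
    #   {"category": {"$in": ["tech", "science"]}, "price": {"$gte": 100}}
    #
    # parses to:
    #
    #   where_conditions == ["category IN (?,?)", "price >= ?"]
    #   where_params     == ["tech", "science", 100]
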
    def query(
        self,
        vector: List[float],
        top_k: int = 10,
        include_metadata: bool = True,
        include_values: bool = False,
        filter: Optional[Dict[str, Any]] = None,
        namespace: str = "",
    ) -> QueryResponse:
        """
        Query the vector index using similarity search (Pinecone-compatible API).

        Performs a vector similarity search and returns the most similar vectors
        based on the configured distance metric (L2, cosine, or inner product).

        Args:

            vector: Query vector for similarity search. Must match the dimension
                of vectors in the index.
            top_k: Maximum number of results to return (default: 10)
            include_metadata: Whether to include metadata fields in results (default: True)
            include_values: Whether to include vector values in results (default: False)
            filter: Optional metadata filter using Pinecone-compatible syntax:
                - Equality: {"category": "technology"} or {"category": {"$eq": "technology"}}
                - Not equal: {"status": {"$ne": "inactive"}}
                - Greater than: {"price": {"$gt": 100}}
                - Greater than or equal: {"price": {"$gte": 100}}
                - Less than: {"price": {"$lt": 500}}
                - Less than or equal: {"price": {"$lte": 500}}
                - In: {"status": {"$in": ["active", "pending", "review"]}}
                - Not in: {"category": {"$nin": ["deprecated", "archived"]}}
                - Logical AND: {"$and": [{"category": "tech"}, {"price": {"$gt": 50}}]}
                - Logical OR: {"$or": [{"status": "active"}, {"priority": "high"}]}
                - Nested conditions: {"$and": [{"$or": [{"a": 1}, {"b": 2}]}, {"c": 3}]}
            namespace: Namespace identifier (not used in MatrixOne; kept for compatibility)

        Returns:

            QueryResponse: Object containing:
            - matches: List of VectorMatch objects with id, score, metadata, and optional values
            - namespace: Namespace (empty string for MatrixOne)
            - usage: Dictionary with the read_units count

        Example:

            # Basic similarity search
            results = index.query([0.1, 0.2, 0.3], top_k=5)

            # Simple equality filter
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"category": "technology"}
            )

            # Comparison operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"price": {"$gte": 100, "$lt": 500}}
            )

            # In/Not In operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"status": {"$in": ["active", "pending"]}}
            )

            # Logical AND/OR operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={
                    "$and": [
                        {"category": {"$in": ["tech", "science"]}},
                        {"$or": [{"price": {"$lt": 100}}, {"discount": True}]}
                    ]
                }
            )

            # Complex nested conditions
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={
                    "$and": [
                        {"$or": [{"priority": "high"}, {"urgent": True}]},
                        {"status": {"$ne": "archived"}},
                        {"created_date": {"$gte": "2024-01-01"}}
                    ]
                }
            )

        Raises:

            ValueError: If the vector dimension doesn't match the index dimension
            RuntimeError: If used with an async client (use query_async instead)
        """
        index_info = self._get_index_info()

        # Parse the filter if provided
        where_conditions, where_params = self._parse_pinecone_filter(filter)

        # Build the similarity search query
        id_column = self._get_id_column()
        select_columns = [id_column]
        if include_metadata:
            metadata_columns = self._get_metadata_columns()
            select_columns.extend(metadata_columns)
        if include_values:
            select_columns.append(self.vector_column)

        # Use the appropriate similarity search method based on the algorithm
        if index_info["algorithm"] == "hnsw":
            results = self.client.vector_ops.similarity_search(
                self.table_name,
                vector_column=self.vector_column,
                query_vector=vector,
                limit=top_k,
                distance_type=index_info.get("metric", "l2"),
                select_columns=select_columns,
                where_conditions=where_conditions,
                where_params=where_params,
            )
        else:  # default to IVF
            results = self.client.vector_ops.similarity_search(
                self.table_name,
                vector_column=self.vector_column,
                query_vector=vector,
                limit=top_k,
                distance_type=index_info.get("metric", "l2"),
                select_columns=select_columns,
                where_conditions=where_conditions,
                where_params=where_params,
            )

        # Convert results to MatrixOne format (using the real primary key)
        matches = []
        for row in results:
            # Use the actual primary key value and column name
            pk_value = row[0]  # Primary key value (can be any type)
            score = float(row[-1]) if len(row) > 1 else 0.0  # Last column is usually the score

            # Extract metadata (including the primary key as a field)
            metadata = {}
            if include_metadata:
                metadata_columns = self._get_metadata_columns()
                for i, col in enumerate(metadata_columns):
                    if i + 1 < len(row):
                        metadata[col] = row[i + 1]

            # Add the primary key to metadata under its real column name
            id_column = self._get_id_column()
            metadata[id_column] = pk_value

            # Extract vector values if requested
            values = None
            if include_values and self.vector_column in select_columns:
                # Find the vector column index case-insensitively
                vector_idx = next(i for i, col in enumerate(select_columns) if col.lower() == self.vector_column.lower())
                if vector_idx < len(row):
                    values = row[vector_idx]

            # Use the primary key value as the match ID (converted to string for compatibility)
            matches.append(VectorMatch(id=str(pk_value), score=score, metadata=metadata, values=values))

        return QueryResponse(matches=matches, namespace=namespace, usage={"read_units": len(matches)})
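    # For reference, index.query([0.1, 0.2, 0.3], top_k=5) on an l2-metric index
    # corresponds roughly to SQL of the following shape. This is a sketch only:
    # the actual statement is assembled by vector_ops.similarity_search, and the
    # distance function name follows the configured metric.
    #
    #   SELECT id, <metadata columns>, l2_distance(embedding, '[0.1,0.2,0.3]') AS score
    #   FROM my_table
    #   WHERE <filter conditions>
    #   ORDER BY score ASC
    #   LIMIT 5
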
    async def query_async(
        self,
        vector: List[float],
        top_k: int = 10,
        include_metadata: bool = True,
        include_values: bool = False,
        filter: Optional[Dict[str, Any]] = None,
        namespace: str = "",
    ) -> QueryResponse:
        """
        Async version of the query method.

        Args:

            vector: Query vector
            top_k: Number of results to return
            include_metadata: Whether to include metadata in results
            include_values: Whether to include vector values in results
            filter: Optional metadata filter (Pinecone-compatible)
            namespace: Namespace (not used in MatrixOne)

        Returns:

            QueryResponse object with matches
        """
        index_info = await self._get_index_info_async()

        # Parse the filter if provided
        where_conditions, where_params = self._parse_pinecone_filter(filter)

        # Build the similarity search query
        id_column = await self._get_id_column_async()
        select_columns = [id_column]
        if include_metadata:
            metadata_columns = await self._get_metadata_columns_async()
            select_columns.extend(metadata_columns)
        if include_values:
            select_columns.append(self.vector_column)

        # Use the unified SQL builder for async queries
        from .sql_builder import DistanceFunction, build_vector_similarity_query

        # Convert the metric to a distance function enum
        metric = index_info.get("metric", "l2")
        if metric == "l2":
            distance_func = DistanceFunction.L2
        elif metric == "cosine":
            distance_func = DistanceFunction.COSINE
        elif metric == "ip":
            distance_func = DistanceFunction.INNER_PRODUCT
        else:
            distance_func = DistanceFunction.L2

        # Build the query using the unified SQL builder
        sql = build_vector_similarity_query(
            table_name=self.table_name,
            vector_column=self.vector_column,
            query_vector=vector,
            distance_func=distance_func,
            limit=top_k,
            select_columns=select_columns,
            where_conditions=where_conditions,
            where_params=where_params,
        )

        # Execute the query
        result = await self.client.execute(sql)
        results = result.rows

        # Convert results to MatrixOne format (using the real primary key)
        matches = []
        for row in results:
            # Use the actual primary key value and column name
            pk_value = row[0]  # Primary key value (can be any type)
            score = float(row[-1]) if len(row) > 1 else 0.0  # Last column is usually the score

            # Extract metadata (including the primary key as a field)
            metadata = {}
            if include_metadata:
                metadata_columns = await self._get_metadata_columns_async()
                for i, col in enumerate(metadata_columns):
                    if i + 1 < len(row):
                        metadata[col] = row[i + 1]

            # Add the primary key to metadata under its real column name
            id_column = await self._get_id_column_async()
            metadata[id_column] = pk_value

            # Extract vector values if requested
            values = None
            if include_values and self.vector_column in select_columns:
                # Find the vector column index case-insensitively
                vector_idx = next(i for i, col in enumerate(select_columns) if col.lower() == self.vector_column.lower())
                if vector_idx < len(row):
                    values = row[vector_idx]

            # Use the primary key value as the match ID (converted to string for compatibility)
            matches.append(VectorMatch(id=str(pk_value), score=score, metadata=metadata, values=values))

        return QueryResponse(matches=matches, namespace=namespace, usage={"read_units": len(matches)})

    def delete(self, ids: List[Any], namespace: str = ""):
        """
        Delete vectors by their primary key IDs (IVF index only).

        Removes vectors from the index based on their primary key values.
        This operation is only supported for IVF indexes, not HNSW indexes.

        Args:

            ids: List of primary key values to delete. Can be any type (str, int, etc.)
                that matches the primary key column type.
            namespace: Namespace identifier (not used in MatrixOne; kept for compatibility)

        Returns:

            None

        Example:

            # Delete vectors by ID
            index.delete(["doc1", "doc2", "doc3"])

            # Delete vectors with integer IDs
            index.delete([1, 2, 3, 4, 5])

            # Delete a single vector
            index.delete(["single_doc_id"])

        Raises:

            ValueError: If the index type is HNSW (delete operations are not supported)
            RuntimeError: If used with an async client (use delete_async instead)

        Note:

            - Only IVF indexes support delete operations
            - HNSW indexes are read-only and do not support upsert/delete
            - IDs must match the primary key column type and values
            - Non-existent IDs are silently ignored (no error raised)
        """
        index_info = self._get_index_info()

        # Check whether the index type supports delete operations
        if index_info["algorithm"] == "hnsw":
            raise ValueError(
                "HNSW index does not support delete operations. "
                "Only IVF index supports INSERT/UPDATE/DELETE operations."
            )

        if ids:
            # Use the unified SQL builder for DELETE
            from .sql_builder import build_delete_query

            id_column = self._get_id_column()
            placeholders = ",".join(["?" for _ in ids])
            where_condition = f"{id_column} IN ({placeholders})"

            sql, params = build_delete_query(
                table_name=self.table_name, where_conditions=[where_condition], where_params=ids
            )
            self.client.execute(sql, params)

    async def delete_async(self, ids: List[Any], namespace: str = ""):
        """
        Async version of the delete method.

        Args:

            ids: List of vector IDs to delete (can be any type: str, int, etc.)
            namespace: Namespace (not used in MatrixOne)

        Raises:

            ValueError: If the index type is HNSW (delete operations are not supported)
        """
        index_info = await self._get_index_info_async()

        # Check whether the index type supports delete operations
        if index_info["algorithm"] == "hnsw":
            raise ValueError(
                "HNSW index does not support delete operations. "
                "Only IVF index supports INSERT/UPDATE/DELETE operations."
            )

        if ids:
            # Use the unified SQL builder for DELETE
            from .sql_builder import build_delete_query

            id_column = await self._get_id_column_async()
            placeholders = ",".join(["?" for _ in ids])
            where_condition = f"{id_column} IN ({placeholders})"

            sql, params = build_delete_query(
                table_name=self.table_name, where_conditions=[where_condition], where_params=ids
            )
            await self.client.execute(sql, params)

    def describe_index_stats(self) -> Dict[str, Any]:
        """
        Get comprehensive index statistics (Pinecone-compatible API).

        Returns detailed information about the vector index, including dimensions,
        vector count, and namespace information.

        Returns:

            Dict: Index statistics containing:
            - dimension: Vector dimension size
            - index_fullness: Index fullness ratio (always 0.0 for MatrixOne)
            - total_vector_count: Total number of vectors in the index
            - namespaces: Dictionary with namespace information:
                - "": Default namespace with vector_count

        Example:

            stats = index.describe_index_stats()
            print(f"Index has {stats['total_vector_count']} vectors")
            print(f"Vector dimension: {stats['dimension']}")
            print(f"Namespace vector count: {stats['namespaces']['']['vector_count']}")

        Note:

            - index_fullness is always 0.0, as MatrixOne doesn't use this concept
            - Only the default namespace ("") is supported
            - The vector count is the total number of rows in the table
        """
        # Get the table row count using the unified SQL builder
        from .sql_builder import build_select_query

        sql = build_select_query(table_name=self.table_name, select_columns=["COUNT(*)"])
        count_result = self.client.execute(sql)
        total_vector_count = count_result.rows[0][0] if count_result.rows else 0

        index_info = self._get_index_info()

        return {
            "dimension": index_info.get("dimensions", 0),
            "index_fullness": 0.0,  # Not applicable to MatrixOne
            "total_vector_count": total_vector_count,
            "namespaces": {"": {"vector_count": total_vector_count}},
        }

    async def describe_index_stats_async(self) -> Dict[str, Any]:
        """
        Async version of the describe_index_stats method.

        Returns:

            Dictionary with index statistics
        """
        # Get the table row count using the unified SQL builder
        from .sql_builder import build_select_query

        sql = build_select_query(table_name=self.table_name, select_columns=["COUNT(*)"])
        count_result = await self.client.execute(sql)
        total_vector_count = count_result.rows[0][0] if count_result.rows else 0

        index_info = await self._get_index_info_async()

        return {
            "dimension": index_info.get("dimensions", 0),
            "index_fullness": 0.0,  # Not applicable to MatrixOne
            "total_vector_count": total_vector_count,
            "namespaces": {"": {"vector_count": total_vector_count}},
        }

    def upsert(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Upsert vectors into the index (IVF index only).

        Inserts new vectors or updates existing ones based on the primary key.
        This operation is only supported for IVF indexes, not HNSW indexes.

        Args:

            vectors: List of vector dictionaries to upsert. Each vector dict must contain:
                - Primary key field: value for the primary key column (required)
                - Vector field: vector values as a list of floats (required)
                - Additional fields: any metadata fields to store
            namespace: Namespace identifier (not used in MatrixOne; kept for compatibility)

        Returns:

            Dict: Statistics about the upsert operation:
            - upserted_count: Number of vectors successfully upserted

        Example:

            # Upsert vectors with metadata
            vectors = [
                {
                    "id": "doc1",  # Primary key field
                    "embedding": [0.1, 0.2, 0.3, 0.4],  # Vector field
                    "title": "Document 1",
                    "category": "technology",
                    "price": 99.99
                },
                {
                    "id": "doc2",
                    "embedding": [0.5, 0.6, 0.7, 0.8],
                    "title": "Document 2",
                    "category": "science",
                    "price": 149.99
                }
            ]
            result = index.upsert(vectors)
            print(f"Upserted {result['upserted_count']} vectors")

        Raises:

            ValueError: If the index type is HNSW (upsert operations are not supported)
            ValueError: If a vector is missing required fields (primary key or vector)
            RuntimeError: If used with an async client (use upsert_async instead)

        Note:

            - Only IVF indexes support upsert operations
            - HNSW indexes are read-only and do not support upsert/delete
            - Vector dimensions must match the index configuration
            - Primary key values must be unique within the table
        """
        if not vectors:
            return {"upserted_count": 0}

        # Get the actual primary key column name
        id_column = self._get_id_column()

        # Process each vector individually for proper upsert behavior
        for vector in vectors:
            # Check that the primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check that the vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare the data - use all fields from the vector dict
            data = dict(vector)

            # Build the upsert SQL using INSERT ... ON DUPLICATE KEY UPDATE
            columns = list(data.keys())
            columns_str = ", ".join(columns)

            # Format values - use the proper vector format
            formatted_values = []
            for col in columns:
                value = data[col]
                if isinstance(value, list):
                    # Format the vector as a string with proper escaping
                    vector_str = "[" + ",".join(map(str, value)) + "]"
                    formatted_values.append(f"'{vector_str}'")
                else:
                    formatted_values.append(f"'{value}'")
            values_str = "(" + ", ".join(formatted_values) + ")"

            # Build the ON DUPLICATE KEY UPDATE clause
            update_clauses = []
            for col in columns:
                if col != id_column:  # Don't update the primary key
                    update_clauses.append(f"{col} = VALUES({col})")
            update_str = ", ".join(update_clauses)

            # Execute the upsert SQL
            sql = (
                f"INSERT INTO {self.table_name} ({columns_str}) VALUES {values_str} "
                f"ON DUPLICATE KEY UPDATE {update_str}"
            )
            self.client.execute(sql)

        return {"upserted_count": len(vectors)}
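    # Example of the statement the loop above emits for a single vector
    # (illustrative table, column, and values):
    #
    #   INSERT INTO my_table (id, embedding, title)
    #   VALUES ('doc1', '[0.1,0.2,0.3,0.4]', 'Document 1')
    #   ON DUPLICATE KEY UPDATE embedding = VALUES(embedding), title = VALUES(title)
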
    async def upsert_async(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Async version of the upsert method.

        Args:

            vectors: List of vectors to upsert. Each vector should be a dict with:
                - Primary key field: value for the primary key column (required)
                - Vector field: vector values (required)
                - Other fields: any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with upsert statistics
        """
        if not vectors:
            return {"upserted_count": 0}

        # Get the actual primary key column name
        id_column = await self._get_id_column_async()

        # Process each vector individually for proper upsert behavior
        for vector in vectors:
            # Check that the primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check that the vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare the data - use all fields from the vector dict
            data = dict(vector)

            # Build the upsert SQL using INSERT ... ON DUPLICATE KEY UPDATE
            columns = list(data.keys())
            columns_str = ", ".join(columns)

            # Format values - use the proper vector format
            formatted_values = []
            for col in columns:
                value = data[col]
                if isinstance(value, list):
                    # Format the vector as a string with proper escaping
                    vector_str = "[" + ",".join(map(str, value)) + "]"
                    formatted_values.append(f"'{vector_str}'")
                else:
                    formatted_values.append(f"'{value}'")
            values_str = "(" + ", ".join(formatted_values) + ")"

            # Build the ON DUPLICATE KEY UPDATE clause
            update_clauses = []
            for col in columns:
                if col != id_column:  # Don't update the primary key
                    update_clauses.append(f"{col} = VALUES({col})")
            update_str = ", ".join(update_clauses)

            # Execute the upsert SQL
            sql = (
                f"INSERT INTO {self.table_name} ({columns_str}) VALUES {values_str} "
                f"ON DUPLICATE KEY UPDATE {update_str}"
            )
            await self.client.execute(sql)

        return {"upserted_count": len(vectors)}

    def batch_insert(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Batch insert vectors (Pinecone-compatible API).

        Args:

            vectors: List of vectors to insert. Each vector should be a dict with:
                - Primary key field: value for the primary key column (required)
                - Vector field: vector values (required)
                - Other fields: any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with insert statistics
        """
        if not vectors:
            return {"inserted_count": 0}

        # Get the actual primary key column name
        id_column = self._get_id_column()

        # Prepare data for the batch insert
        batch_data = []
        for vector in vectors:
            # Check that the primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check that the vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare the row data
            row_data = dict(vector)
            batch_data.append(row_data)

        # Use the client's batch_insert method
        self.client.batch_insert(self.table_name, batch_data)

        return {"inserted_count": len(vectors)}

    async def batch_insert_async(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Async version of the batch_insert method.

        Args:

            vectors: List of vectors to insert. Each vector should be a dict with:
                - Primary key field: value for the primary key column (required)
                - Vector field: vector values (required)
                - Other fields: any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with insert statistics
        """
        if not vectors:
            return {"inserted_count": 0}

        # Get the actual primary key column name
        id_column = await self._get_id_column_async()

        # Prepare data for the batch insert
        batch_data = []
        for vector in vectors:
            # Check that the primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check that the vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare the row data
            row_data = dict(vector)
            batch_data.append(row_data)

        # Use the client's batch_insert_async method
        await self.client.batch_insert_async(self.table_name, batch_data)

        return {"inserted_count": len(vectors)}
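Taken together, a minimal end-to-end session with this module looks like the sketch below. It assumes a connected Client (connection options live in matrixone/client.py) and a docs table that already has an integer primary key id, a vecf32(4) column embedding, and an IVF index; the table and column names here are illustrative, not part of the package.

    from matrixone import Client

    client = Client()
    # ... connect the client; see matrixone/client.py for connection options ...

    index = client.get_pinecone_index("docs", "embedding")

    # Insert or refresh two vectors (IVF index only)
    index.upsert([
        {"id": 1, "embedding": [0.1, 0.2, 0.3, 0.4], "title": "Document 1"},
        {"id": 2, "embedding": [0.5, 0.6, 0.7, 0.8], "title": "Document 2"},
    ])

    # Nearest-neighbour query with a metadata filter
    results = index.query(
        vector=[0.1, 0.2, 0.3, 0.4],
        top_k=2,
        include_metadata=True,
        filter={"title": {"$ne": "archived"}},
    )
    for match in results.matches:
        print(match.id, match.score, match.metadata)

    print(index.describe_index_stats())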