matrixone-python-sdk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. matrixone/__init__.py +155 -0
  2. matrixone/account.py +723 -0
  3. matrixone/async_client.py +3913 -0
  4. matrixone/async_metadata_manager.py +311 -0
  5. matrixone/async_orm.py +123 -0
  6. matrixone/async_vector_index_manager.py +633 -0
  7. matrixone/base_client.py +208 -0
  8. matrixone/client.py +4672 -0
  9. matrixone/config.py +452 -0
  10. matrixone/connection_hooks.py +286 -0
  11. matrixone/exceptions.py +89 -0
  12. matrixone/logger.py +782 -0
  13. matrixone/metadata.py +820 -0
  14. matrixone/moctl.py +219 -0
  15. matrixone/orm.py +2277 -0
  16. matrixone/pitr.py +646 -0
  17. matrixone/pubsub.py +771 -0
  18. matrixone/restore.py +411 -0
  19. matrixone/search_vector_index.py +1176 -0
  20. matrixone/snapshot.py +550 -0
  21. matrixone/sql_builder.py +844 -0
  22. matrixone/sqlalchemy_ext/__init__.py +161 -0
  23. matrixone/sqlalchemy_ext/adapters.py +163 -0
  24. matrixone/sqlalchemy_ext/dialect.py +534 -0
  25. matrixone/sqlalchemy_ext/fulltext_index.py +895 -0
  26. matrixone/sqlalchemy_ext/fulltext_search.py +1686 -0
  27. matrixone/sqlalchemy_ext/hnsw_config.py +194 -0
  28. matrixone/sqlalchemy_ext/ivf_config.py +252 -0
  29. matrixone/sqlalchemy_ext/table_builder.py +351 -0
  30. matrixone/sqlalchemy_ext/vector_index.py +1721 -0
  31. matrixone/sqlalchemy_ext/vector_type.py +948 -0
  32. matrixone/version.py +580 -0
  33. matrixone_python_sdk-0.1.0.dist-info/METADATA +706 -0
  34. matrixone_python_sdk-0.1.0.dist-info/RECORD +122 -0
  35. matrixone_python_sdk-0.1.0.dist-info/WHEEL +5 -0
  36. matrixone_python_sdk-0.1.0.dist-info/entry_points.txt +5 -0
  37. matrixone_python_sdk-0.1.0.dist-info/licenses/LICENSE +200 -0
  38. matrixone_python_sdk-0.1.0.dist-info/top_level.txt +2 -0
  39. tests/__init__.py +19 -0
  40. tests/offline/__init__.py +20 -0
  41. tests/offline/conftest.py +77 -0
  42. tests/offline/test_account.py +703 -0
  43. tests/offline/test_async_client_query_comprehensive.py +1218 -0
  44. tests/offline/test_basic.py +54 -0
  45. tests/offline/test_case_sensitivity.py +227 -0
  46. tests/offline/test_connection_hooks_offline.py +287 -0
  47. tests/offline/test_dialect_schema_handling.py +609 -0
  48. tests/offline/test_explain_methods.py +346 -0
  49. tests/offline/test_filter_logical_in.py +237 -0
  50. tests/offline/test_fulltext_search_comprehensive.py +795 -0
  51. tests/offline/test_ivf_config.py +249 -0
  52. tests/offline/test_join_methods.py +281 -0
  53. tests/offline/test_join_sqlalchemy_compatibility.py +276 -0
  54. tests/offline/test_logical_in_method.py +237 -0
  55. tests/offline/test_matrixone_version_parsing.py +264 -0
  56. tests/offline/test_metadata_offline.py +557 -0
  57. tests/offline/test_moctl.py +300 -0
  58. tests/offline/test_moctl_simple.py +251 -0
  59. tests/offline/test_model_support_offline.py +359 -0
  60. tests/offline/test_model_support_simple.py +225 -0
  61. tests/offline/test_pinecone_filter_offline.py +377 -0
  62. tests/offline/test_pitr.py +585 -0
  63. tests/offline/test_pubsub.py +712 -0
  64. tests/offline/test_query_update.py +283 -0
  65. tests/offline/test_restore.py +445 -0
  66. tests/offline/test_snapshot_comprehensive.py +384 -0
  67. tests/offline/test_sql_escaping_edge_cases.py +551 -0
  68. tests/offline/test_sqlalchemy_integration.py +382 -0
  69. tests/offline/test_sqlalchemy_vector_integration.py +434 -0
  70. tests/offline/test_table_builder.py +198 -0
  71. tests/offline/test_unified_filter.py +398 -0
  72. tests/offline/test_unified_transaction.py +495 -0
  73. tests/offline/test_vector_index.py +238 -0
  74. tests/offline/test_vector_operations.py +688 -0
  75. tests/offline/test_vector_type.py +174 -0
  76. tests/offline/test_version_core.py +328 -0
  77. tests/offline/test_version_management.py +372 -0
  78. tests/offline/test_version_standalone.py +652 -0
  79. tests/online/__init__.py +20 -0
  80. tests/online/conftest.py +216 -0
  81. tests/online/test_account_management.py +194 -0
  82. tests/online/test_advanced_features.py +344 -0
  83. tests/online/test_async_client_interfaces.py +330 -0
  84. tests/online/test_async_client_online.py +285 -0
  85. tests/online/test_async_model_insert_online.py +293 -0
  86. tests/online/test_async_orm_online.py +300 -0
  87. tests/online/test_async_simple_query_online.py +802 -0
  88. tests/online/test_async_transaction_simple_query.py +300 -0
  89. tests/online/test_basic_connection.py +130 -0
  90. tests/online/test_client_online.py +238 -0
  91. tests/online/test_config.py +90 -0
  92. tests/online/test_config_validation.py +123 -0
  93. tests/online/test_connection_hooks_new_online.py +217 -0
  94. tests/online/test_dialect_schema_handling_online.py +331 -0
  95. tests/online/test_filter_logical_in_online.py +374 -0
  96. tests/online/test_fulltext_comprehensive.py +1773 -0
  97. tests/online/test_fulltext_label_online.py +433 -0
  98. tests/online/test_fulltext_search_online.py +842 -0
  99. tests/online/test_ivf_stats_online.py +506 -0
  100. tests/online/test_logger_integration.py +311 -0
  101. tests/online/test_matrixone_query_orm.py +540 -0
  102. tests/online/test_metadata_online.py +579 -0
  103. tests/online/test_model_insert_online.py +255 -0
  104. tests/online/test_mysql_driver_validation.py +213 -0
  105. tests/online/test_orm_advanced_features.py +2022 -0
  106. tests/online/test_orm_cte_integration.py +269 -0
  107. tests/online/test_orm_online.py +270 -0
  108. tests/online/test_pinecone_filter.py +708 -0
  109. tests/online/test_pubsub_operations.py +352 -0
  110. tests/online/test_query_methods.py +225 -0
  111. tests/online/test_query_update_online.py +433 -0
  112. tests/online/test_search_vector_index.py +557 -0
  113. tests/online/test_simple_fulltext_online.py +915 -0
  114. tests/online/test_snapshot_comprehensive.py +998 -0
  115. tests/online/test_sqlalchemy_engine_integration.py +336 -0
  116. tests/online/test_sqlalchemy_integration.py +425 -0
  117. tests/online/test_transaction_contexts.py +1219 -0
  118. tests/online/test_transaction_insert_methods.py +356 -0
  119. tests/online/test_transaction_query_methods.py +288 -0
  120. tests/online/test_unified_filter_online.py +529 -0
  121. tests/online/test_vector_comprehensive.py +706 -0
  122. tests/online/test_version_management.py +291 -0
@@ -0,0 +1,1176 @@
# Copyright 2021 - 2022 Matrix Origin
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
SearchVectorIndex - A Pinecone-compatible vector search interface for MatrixOne

This module provides a high-level interface for vector search operations that is
compatible with Pinecone's API, making it easy to migrate from Pinecone to MatrixOne.

Key Features:
- Pinecone-compatible API for seamless migration
- Support for both IVF and HNSW vector indexes
- Metadata filtering with complex query syntax
- Vector upsert and delete operations (IVF only)
- Synchronous and asynchronous operation support
- Automatic index type detection and configuration

Supported Operations:
- Vector similarity search with multiple distance metrics
- Metadata filtering with Pinecone-compatible syntax
- Vector upsert (insert/update) operations
- Vector deletion by ID
- Index statistics and information

Index Types:
- IVF (Inverted File): Supports full CRUD operations, good for frequent updates
- HNSW (Hierarchical Navigable Small World): Read-only, optimized for search performance

Usage Example:

    # Get a Pinecone-compatible index
    index = client.get_pinecone_index("my_table", "embedding_column")

    # Query vectors with metadata filtering
    results = index.query(
        vector=[0.1, 0.2, 0.3, ...],
        top_k=10,
        include_metadata=True,
        filter={"category": "technology", "price": {"$gte": 100}}
    )

    # Process results
    for match in results.matches:
        print(f"ID: {match.id}, Score: {match.score}")
        print(f"Metadata: {match.metadata}")

    # Upsert vectors (IVF index only)
    index.upsert([
        {"id": "doc1", "embedding": [0.1, 0.2, ...], "title": "Document 1"},
        {"id": "doc2", "embedding": [0.3, 0.4, ...], "title": "Document 2"}
    ])

    # Delete vectors (IVF index only)
    index.delete(["doc1", "doc2"])
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class VectorMatch:
    """
    Represents a single vector search match result.

    Attributes:

        id: Unique identifier for the vector (primary key value as string)
        score: Similarity score (lower is more similar for L2 distance)
        metadata: Dictionary containing all metadata fields from the table
        values: Optional vector values if include_values=True in query
    """

    id: str
    score: float
    metadata: Dict[str, Any]
    values: Optional[List[float]] = None


@dataclass
class QueryResponse:
    """
    Represents a query response compatible with the Pinecone API.

    Attributes:

        matches: List of VectorMatch objects containing search results
        namespace: Namespace identifier (empty string for MatrixOne)
        usage: Optional usage statistics (e.g., {"read_units": 10})
    """

    matches: List[VectorMatch]
    namespace: str = ""
    usage: Optional[Dict[str, Any]] = None


class PineconeCompatibleIndex:
    """
    A Pinecone-compatible vector search interface for MatrixOne.

    This class provides a high-level interface for vector search operations
    that is compatible with Pinecone's API, making it easy to migrate from
    Pinecone to MatrixOne.

    Features:
    - Vector similarity search with multiple distance metrics (L2, cosine, inner product)
    - Metadata filtering with Pinecone-compatible filter syntax
    - Vector upsert and delete operations (IVF index only)
    - Support for both synchronous and asynchronous operations
    - Automatic index type detection (IVF/HNSW)
    - Case-insensitive column name handling

    Supported Index Types:
    - IVF (Inverted File): Supports upsert/delete operations, good for frequent updates
    - HNSW (Hierarchical Navigable Small World): Read-only, optimized for search performance

    Example:

        # Get a Pinecone-compatible index
        index = client.get_pinecone_index("my_table", "embedding_column")

        # Query vectors
        results = index.query(
            vector=[0.1, 0.2, 0.3, ...],
            top_k=10,
            include_metadata=True,
            filter={"category": "technology", "price": {"$gte": 100}}
        )

        # Upsert vectors (IVF index only)
        index.upsert([
            {"id": "doc1", "embedding": [0.1, 0.2, ...], "title": "Document 1"},
            {"id": "doc2", "embedding": [0.3, 0.4, ...], "title": "Document 2"}
        ])

        # Delete vectors (IVF index only)
        index.delete(["doc1", "doc2"])
    """

    def __init__(self, client, table_name: str, vector_column: str):
        """
        Initialize PineconeCompatibleIndex.

        Args:

            client: MatrixOne client instance (Client or AsyncClient)
            table_name: Name of the table containing vectors
            vector_column: Name of the vector column containing embeddings

        Note:

            The table must already exist and contain a vector column.
            The primary key column will be automatically detected.
            Metadata columns are all non-primary-key, non-vector columns.
        """
        self.client = client
        self.table_name = table_name
        self.vector_column = vector_column
        self._index_info = None
        self._metadata_columns = None  # Will be auto-detected
        self._id_column = None  # Will be auto-detected as primary key

    @property
    def metadata_columns(self):
        """Get metadata columns (all columns except id and vector columns)"""
        # Check if this is an async client
        if hasattr(self.client, "execute") and hasattr(self.client.execute, "__call__"):
            import asyncio

            if asyncio.iscoroutinefunction(self.client.execute):
                raise RuntimeError("Use _get_metadata_columns_async() for async clients")
        return self._get_metadata_columns()

    def _get_id_column(self):
        """Get the primary key column name"""
        if self._id_column is not None:
            return self._id_column

        # Check if this is an async client
        if hasattr(self.client, "execute") and hasattr(self.client.execute, "__call__"):
            # Check if execute returns a coroutine (async client)
            import asyncio

            if asyncio.iscoroutinefunction(self.client.execute):
                raise RuntimeError("Use _get_id_column_async() for async clients")

        # Get table schema to find primary key
        schema_result = self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            # Fallback to 'id' if table not found
            self._id_column = "id"
            return self._id_column

        # Find primary key column
        for row in schema_result.rows:
            column_name = row[0]
            key_info = row[3] if len(row) > 3 else ""  # Key column
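            # DESCRIBE output on MySQL-compatible engines is (Field, Type,
            # Null, Key, Default, Extra); the Key flag below reads "PRI" for
            # primary key columns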
            if "PRI" in key_info.upper():
                self._id_column = column_name
                return self._id_column

        # Fallback to 'id' if no primary key found
        self._id_column = "id"
        return self._id_column

    async def _get_id_column_async(self):
        """Get the primary key column name - async version"""
        if self._id_column is not None:
            return self._id_column

        # Get table schema to find primary key
        schema_result = await self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            # Fallback to 'id' if table not found
            self._id_column = "id"
            return self._id_column

        # Find primary key column
        for row in schema_result.rows:
            column_name = row[0]
            key_info = row[3] if len(row) > 3 else ""  # Key column
            if "PRI" in key_info.upper():
                self._id_column = column_name
                return self._id_column

        # Fallback to 'id' if no primary key found
        self._id_column = "id"
        return self._id_column

    def _get_metadata_columns(self):
        """Get metadata columns (all columns except id and vector columns)"""
        if self._metadata_columns is not None:
            return self._metadata_columns

        # Get table schema
        schema_result = self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            self._metadata_columns = []
            return self._metadata_columns

        # Extract column names, excluding id and vector columns
        all_columns = [row[0] for row in schema_result.rows]
        id_column = self._get_id_column()
        self._metadata_columns = [
            col for col in all_columns if col.lower() not in [id_column.lower(), self.vector_column.lower()]
        ]
        return self._metadata_columns

    async def _get_metadata_columns_async(self):
        """Get metadata columns (all columns except id and vector columns) - async version"""
        if self._metadata_columns is not None:
            return self._metadata_columns

        # Get table schema
        schema_result = await self.client.execute(f"DESCRIBE {self.table_name}")
        if not schema_result.rows:
            self._metadata_columns = []
            return self._metadata_columns

        # Extract column names, excluding id and vector columns
        all_columns = [row[0] for row in schema_result.rows]
        id_column = await self._get_id_column_async()
        self._metadata_columns = [
            col for col in all_columns if col.lower() not in [id_column.lower(), self.vector_column.lower()]
        ]
        return self._metadata_columns

    async def _get_index_info_async(self):
        """Get index information for async client"""
        if self._index_info is not None:
            return self._index_info

        # Get table schema
        schema_result = await self.client.execute(f"SHOW CREATE TABLE {self.table_name}")
        if not schema_result.rows:
            raise ValueError(f"Table {self.table_name} not found")

        create_sql = schema_result.rows[0][1]  # Second column contains CREATE statement

        # Parse vector index information from CREATE statement
        self._index_info = self._parse_index_info(create_sql)
        return self._index_info

    def _get_index_info(self):
        """Get index information for sync client"""
        if self._index_info is not None:
            return self._index_info

        # Get table schema
        schema_result = self.client.execute(f"SHOW CREATE TABLE {self.table_name}")
        if not schema_result.rows:
            raise ValueError(f"Table {self.table_name} not found")

        create_sql = schema_result.rows[0][1]  # Second column contains CREATE statement

        # Parse vector index information from CREATE statement
        self._index_info = self._parse_index_info(create_sql)
        return self._index_info

    def _parse_index_info(self, create_sql: str) -> Dict[str, Any]:
        """
        Parse vector index information from CREATE TABLE statement.

        Args:

            create_sql: CREATE TABLE SQL statement

        Returns:

            Dictionary containing index information
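
            For example (an illustration consistent with the parsing rules
            below), a table containing

                `embedding` vecf32(128),
                KEY `idx_embedding` USING hnsw (`embedding`) m = 16 op_type 'vector_l2_ops'

            would be parsed as {"algorithm": "hnsw", "metric": "l2",
            "dimensions": 128, "parameters": {"m": 16}}.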
        """
        index_info = {
            "algorithm": "ivf",  # default
            "metric": "l2",  # default
            "dimensions": None,
            "parameters": {},
        }

        # Extract vector column definition
        vector_pattern = rf"`?{self.vector_column}`?\s+vec(?:f32|f64)\s*\(\s*(\d+)\s*\)"
        vector_match = re.search(vector_pattern, create_sql, re.IGNORECASE)
        if vector_match:
            index_info["dimensions"] = int(vector_match.group(1))

        # Extract index creation statements - match both CREATE INDEX and KEY definitions
        index_pattern = (
            r"(?:CREATE\s+(?:INDEX|VECTOR\s+INDEX)\s+(\w+)\s+ON\s+\w+\s*\([^)]+\)\s+USING\s+(\w+)"
            r"(?:\s+WITH\s+\(([^)]+)\))?|KEY\s+`?(\w+)`?\s+USING\s+(\w+)\s+\([^)]+\)\s+([^,\n]+))"
        )
        index_matches = re.findall(index_pattern, create_sql, re.IGNORECASE | re.MULTILINE)

        for match in index_matches:
            # findall returns a 6-tuple here (three groups per alternative); a
            # non-empty second group means the CREATE INDEX alternative matched
            if match[1]:  # CREATE INDEX format
                algorithm, params = match[1], match[2]
            else:  # KEY format
                algorithm, params = match[4], match[5]

            if self.vector_column in create_sql:
                index_info["algorithm"] = algorithm.lower()

                # Parse parameters
                if params:
                    # Parse parameters like "m = 16 ef_construction = 200 ef_search = 50 op_type 'vector_l2_ops'"
                    param_pairs = re.findall(r"(\w+)\s*=\s*([^,\s]+)", params)
                    for key, value in param_pairs:
                        # Convert string values to appropriate types
                        value = value.strip().strip("'\"")
                        if value.isdigit():
                            index_info["parameters"][key] = int(value)
                        elif value.replace(".", "").isdigit():
                            index_info["parameters"][key] = float(value)
                        else:
                            index_info["parameters"][key] = value

                    # Parse op_type parameter
                    op_type_match = re.search(r"op_type\s+'([^']+)'", params)
                    if op_type_match:
                        op_type = op_type_match.group(1)
                        if "cosine" in op_type.lower():
                            index_info["metric"] = "cosine"
                        elif "ip" in op_type.lower():
                            index_info["metric"] = "ip"
                        else:
                            index_info["metric"] = "l2"
            break

        return index_info

    def _parse_pinecone_filter(self, filter_dict: Dict[str, Any]) -> tuple:
        """
        Parse a Pinecone-compatible filter into SQL WHERE conditions and parameters.

        Args:

            filter_dict: Pinecone filter dictionary

        Returns:

            Tuple of (where_conditions, where_params)
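
            For example, the filter {"category": "tech", "price": {"$gte": 100}}
            parses to (["category = ?", "price >= ?"], ["tech", 100]).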
        """
        if not filter_dict:
            return [], []

        where_conditions = []
        where_params = []

        def parse_condition(key: str, value: Any) -> str:
            """Parse a single filter condition"""
            if isinstance(value, dict):
                # Handle operators like $eq, $in, $gte, etc.
                if "$eq" in value:
                    where_params.append(value["$eq"])
                    return f"{key} = ?"
                elif "$ne" in value:
                    where_params.append(value["$ne"])
                    return f"{key} != ?"
                elif "$in" in value:
                    if not value["$in"]:  # Empty list
                        return "1=0"  # Always false condition
                    placeholders = ",".join(["?" for _ in value["$in"]])
                    where_params.extend(value["$in"])
                    return f"{key} IN ({placeholders})"
                elif "$nin" in value:
                    if not value["$nin"]:  # Empty list
                        return "1=1"  # Always true condition
                    placeholders = ",".join(["?" for _ in value["$nin"]])
                    where_params.extend(value["$nin"])
                    return f"{key} NOT IN ({placeholders})"
                elif "$gt" in value:
                    where_params.append(value["$gt"])
                    return f"{key} > ?"
                elif "$gte" in value:
                    where_params.append(value["$gte"])
                    return f"{key} >= ?"
                elif "$lt" in value:
                    where_params.append(value["$lt"])
                    return f"{key} < ?"
                elif "$lte" in value:
                    where_params.append(value["$lte"])
                    return f"{key} <= ?"
                elif "$and" in value:
                    # Handle nested $and conditions
                    and_conditions = []
                    for condition in value["$and"]:
                        for sub_key, sub_value in condition.items():
                            and_conditions.append(parse_condition(sub_key, sub_value))
                    return f"({' AND '.join(and_conditions)})"
                elif "$or" in value:
                    # Handle nested $or conditions
                    or_conditions = []
                    for condition in value["$or"]:
                        for sub_key, sub_value in condition.items():
                            or_conditions.append(parse_condition(sub_key, sub_value))
                    return f"({' OR '.join(or_conditions)})"
                else:
                    raise ValueError(f"Unsupported operator in filter: {list(value.keys())}")
            else:
                # Direct value comparison (equivalent to $eq)
                where_params.append(value)
                return f"{key} = ?"

        def parse_nested_condition(condition_dict: dict) -> str:
            """Parse a nested condition that might contain $and or $or"""
            if "$and" in condition_dict:
                and_conditions = []
                for condition in condition_dict["$and"]:
                    and_conditions.append(parse_nested_condition(condition))
                return f"({' AND '.join(and_conditions)})"
            elif "$or" in condition_dict:
                or_conditions = []
                for condition in condition_dict["$or"]:
                    or_conditions.append(parse_nested_condition(condition))
                return f"({' OR '.join(or_conditions)})"
            else:
                # This is a simple condition, parse it normally
                conditions = []
                for key, value in condition_dict.items():
                    conditions.append(parse_condition(key, value))
                return " AND ".join(conditions)

        # Parse top-level conditions
        for key, value in filter_dict.items():
            if key == "$and":
                # Handle top-level $and
                and_conditions = []
                for condition in value:
                    and_conditions.append(parse_nested_condition(condition))
                where_conditions.append(f"({' AND '.join(and_conditions)})")
            elif key == "$or":
                # Handle top-level $or
                or_conditions = []
                for condition in value:
                    or_conditions.append(parse_nested_condition(condition))
                where_conditions.append(f"({' OR '.join(or_conditions)})")
            else:
                condition = parse_condition(key, value)
                where_conditions.append(condition)

        return where_conditions, where_params

    def query(
        self,
        vector: List[float],
        top_k: int = 10,
        include_metadata: bool = True,
        include_values: bool = False,
        filter: Optional[Dict[str, Any]] = None,
        namespace: str = "",
    ) -> QueryResponse:
        """
        Query the vector index using similarity search (Pinecone-compatible API).

        Performs vector similarity search and returns the most similar vectors
        based on the configured distance metric (L2, cosine, or inner product).

        Args:

            vector: Query vector for similarity search. Must match the dimension
                of vectors in the index.
            top_k: Maximum number of results to return (default: 10)
            include_metadata: Whether to include metadata fields in results (default: True)
            include_values: Whether to include vector values in results (default: False)
            filter: Optional metadata filter using Pinecone-compatible syntax:
                - Equality: {"category": "technology"} or {"category": {"$eq": "technology"}}
                - Not Equal: {"status": {"$ne": "inactive"}}
                - Greater Than: {"price": {"$gt": 100}}
                - Greater Than or Equal: {"price": {"$gte": 100}}
                - Less Than: {"price": {"$lt": 500}}
                - Less Than or Equal: {"price": {"$lte": 500}}
                - In: {"status": {"$in": ["active", "pending", "review"]}}
                - Not In: {"category": {"$nin": ["deprecated", "archived"]}}
                - Logical AND: {"$and": [{"category": "tech"}, {"price": {"$gt": 50}}]}
                - Logical OR: {"$or": [{"status": "active"}, {"priority": "high"}]}
                - Nested conditions: {"$and": [{"$or": [{"a": 1}, {"b": 2}]}, {"c": 3}]}
            namespace: Namespace identifier (not used in MatrixOne, kept for compatibility)

        Returns:

            QueryResponse: Object containing:
                - matches: List of VectorMatch objects with id, score, metadata, and optional values
                - namespace: Namespace (empty string for MatrixOne)
                - usage: Dictionary with read_units count

        Example:

            # Basic similarity search
            results = index.query([0.1, 0.2, 0.3], top_k=5)

            # Simple equality filter
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"category": "technology"}
            )

            # Comparison operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"price": {"$gte": 100, "$lt": 500}}
            )

            # In/Not In operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={"status": {"$in": ["active", "pending"]}}
            )

            # Logical AND/OR operators
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={
                    "$and": [
                        {"category": {"$in": ["tech", "science"]}},
                        {"$or": [{"price": {"$lt": 100}}, {"discount": True}]}
                    ]
                }
            )

            # Complex nested conditions
            results = index.query(
                vector=[0.1, 0.2, 0.3],
                filter={
                    "$and": [
                        {"$or": [{"priority": "high"}, {"urgent": True}]},
                        {"status": {"$ne": "archived"}},
                        {"created_date": {"$gte": "2024-01-01"}}
                    ]
                }
            )

        Raises:

            ValueError: If vector dimension doesn't match index dimension
            RuntimeError: If used with async client (use query_async instead)
        """
        index_info = self._get_index_info()

        # Parse filter if provided
        where_conditions, where_params = self._parse_pinecone_filter(filter)

        # Build similarity search query
        id_column = self._get_id_column()
        select_columns = [id_column]
        if include_metadata:
            metadata_columns = self._get_metadata_columns()
            select_columns.extend(metadata_columns)
        if include_values:
            select_columns.append(self.vector_column)

        # IVF and HNSW indexes currently go through the same similarity search
        # call; the distance metric comes from the parsed index info
        results = self.client.vector_ops.similarity_search(
            self.table_name,
            vector_column=self.vector_column,
            query_vector=vector,
            limit=top_k,
            distance_type=index_info.get("metric", "l2"),
            select_columns=select_columns,
            where_conditions=where_conditions,
            where_params=where_params,
        )

        # Convert results to MatrixOne format (using real primary key)
        matches = []
        for row in results:
            # Use the actual primary key value and column name
            pk_value = row[0]  # Primary key value (can be any type)
            score = float(row[-1]) if len(row) > 1 else 0.0  # Last column is usually score

            # Extract metadata (including primary key as a field)
            metadata = {}
            if include_metadata:
                metadata_columns = self._get_metadata_columns()
                for i, col in enumerate(metadata_columns):
                    if i + 1 < len(row):
                        metadata[col] = row[i + 1]

            # Add primary key to metadata with its real column name
            id_column = self._get_id_column()
            metadata[id_column] = pk_value

            # Extract vector values if requested
            values = None
            if include_values and self.vector_column in select_columns:
                # Find vector column index case-insensitively
                vector_idx = next(i for i, col in enumerate(select_columns) if col.lower() == self.vector_column.lower())
                if vector_idx < len(row):
                    values = row[vector_idx]

            # Use primary key value as the match ID (convert to string for compatibility)
            matches.append(VectorMatch(id=str(pk_value), score=score, metadata=metadata, values=values))

        return QueryResponse(matches=matches, namespace=namespace, usage={"read_units": len(matches)})

    async def query_async(
        self,
        vector: List[float],
        top_k: int = 10,
        include_metadata: bool = True,
        include_values: bool = False,
        filter: Optional[Dict[str, Any]] = None,
        namespace: str = "",
    ) -> QueryResponse:
        """
        Async version of query method.

        Args:

            vector: Query vector
            top_k: Number of results to return
            include_metadata: Whether to include metadata in results
            include_values: Whether to include vector values in results
            filter: Optional metadata filter (Pinecone-compatible)
            namespace: Namespace (not used in MatrixOne)

        Returns:

            QueryResponse object with matches
        """
        index_info = await self._get_index_info_async()

        # Parse filter if provided
        where_conditions, where_params = self._parse_pinecone_filter(filter)

        # Build similarity search query
        id_column = await self._get_id_column_async()
        select_columns = [id_column]
        if include_metadata:
            metadata_columns = await self._get_metadata_columns_async()
            select_columns.extend(metadata_columns)
        if include_values:
            select_columns.append(self.vector_column)

        # Use unified SQL builder for async queries
        from .sql_builder import DistanceFunction, build_vector_similarity_query

        # Convert metric to distance function enum
        metric = index_info.get("metric", "l2")
        if metric == "l2":
            distance_func = DistanceFunction.L2
        elif metric == "cosine":
            distance_func = DistanceFunction.COSINE
        elif metric == "ip":
            distance_func = DistanceFunction.INNER_PRODUCT
        else:
            distance_func = DistanceFunction.L2

        # Build query using unified SQL builder
        sql = build_vector_similarity_query(
            table_name=self.table_name,
            vector_column=self.vector_column,
            query_vector=vector,
            distance_func=distance_func,
            limit=top_k,
            select_columns=select_columns,
            where_conditions=where_conditions,
            where_params=where_params,
        )

        # Execute query
        result = await self.client.execute(sql)
        results = result.rows

        # Convert results to MatrixOne format (using real primary key)
        matches = []
        for row in results:
            # Use the actual primary key value and column name
            pk_value = row[0]  # Primary key value (can be any type)
            score = float(row[-1]) if len(row) > 1 else 0.0  # Last column is usually score

            # Extract metadata (including primary key as a field)
            metadata = {}
            if include_metadata:
                metadata_columns = await self._get_metadata_columns_async()
                for i, col in enumerate(metadata_columns):
                    if i + 1 < len(row):
                        metadata[col] = row[i + 1]

            # Add primary key to metadata with its real column name
            id_column = await self._get_id_column_async()
            metadata[id_column] = pk_value

            # Extract vector values if requested
            values = None
            if include_values and self.vector_column in select_columns:
                # Find vector column index case-insensitively
                vector_idx = next(i for i, col in enumerate(select_columns) if col.lower() == self.vector_column.lower())
                if vector_idx < len(row):
                    values = row[vector_idx]

            # Use primary key value as the match ID (convert to string for compatibility)
            matches.append(VectorMatch(id=str(pk_value), score=score, metadata=metadata, values=values))

        return QueryResponse(matches=matches, namespace=namespace, usage={"read_units": len(matches)})

    def delete(self, ids: List[Any], namespace: str = ""):
        """
        Delete vectors by their primary key IDs (IVF index only).

        Removes vectors from the index based on their primary key values.
        This operation is only supported for IVF indexes, not HNSW indexes.

        Args:

            ids: List of primary key values to delete. Can be any type (str, int, etc.)
                that matches the primary key column type.
            namespace: Namespace identifier (not used in MatrixOne, kept for compatibility)

        Returns:

            None

        Example:

            # Delete vectors by ID
            index.delete(["doc1", "doc2", "doc3"])

            # Delete vectors with integer IDs
            index.delete([1, 2, 3, 4, 5])

            # Delete a single vector
            index.delete(["single_doc_id"])

        Raises:

            ValueError: If the index type is HNSW (not supported for delete operations)
            RuntimeError: If used with async client (use delete_async instead)

        Note:

            - Only IVF indexes support delete operations
            - HNSW indexes are read-only and do not support upsert/delete
            - IDs must match the primary key column type and values
            - Non-existent IDs are silently ignored (no error raised)
        """
        index_info = self._get_index_info()

        # Check if index type supports delete operations
        if index_info["algorithm"] == "hnsw":
            raise ValueError(
                "HNSW index does not support delete operations. "
                "Only IVF index supports INSERT/UPDATE/DELETE operations."
            )

        if ids:
            # Use unified SQL builder for DELETE
            from .sql_builder import build_delete_query

            id_column = self._get_id_column()
            placeholders = ",".join(["?" for _ in ids])
            where_condition = f"{id_column} IN ({placeholders})"

            sql, params = build_delete_query(
                table_name=self.table_name, where_conditions=[where_condition], where_params=ids
            )
            self.client.execute(sql, params)

    async def delete_async(self, ids: List[Any], namespace: str = ""):
        """
        Async version of delete method.

        Args:

            ids: List of vector IDs to delete (can be any type: str, int, etc.)
            namespace: Namespace (not used in MatrixOne)

        Raises:

            ValueError: If the index type is HNSW (not supported for delete operations)
        """
        index_info = await self._get_index_info_async()

        # Check if index type supports delete operations
        if index_info["algorithm"] == "hnsw":
            raise ValueError(
                "HNSW index does not support delete operations. "
                "Only IVF index supports INSERT/UPDATE/DELETE operations."
            )

        if ids:
            # Use unified SQL builder for DELETE
            from .sql_builder import build_delete_query

            id_column = await self._get_id_column_async()
            placeholders = ",".join(["?" for _ in ids])
            where_condition = f"{id_column} IN ({placeholders})"

            sql, params = build_delete_query(
                table_name=self.table_name, where_conditions=[where_condition], where_params=ids
            )
            await self.client.execute(sql, params)

    def describe_index_stats(self) -> Dict[str, Any]:
        """
        Get comprehensive index statistics (Pinecone-compatible API).

        Returns detailed information about the vector index including dimensions,
        vector count, and namespace information.

        Returns:

            Dict: Index statistics containing:
                - dimension: Vector dimension size
                - index_fullness: Index fullness ratio (always 0.0 for MatrixOne)
                - total_vector_count: Total number of vectors in the index
                - namespaces: Dictionary with namespace information:
                    - "": Default namespace with vector_count

        Example:

            stats = index.describe_index_stats()
            print(f"Index has {stats['total_vector_count']} vectors")
            print(f"Vector dimension: {stats['dimension']}")
            print(f"Namespace vector count: {stats['namespaces']['']['vector_count']}")

        Note:

            - index_fullness is always 0.0 as MatrixOne doesn't use this concept
            - Only the default namespace ("") is supported
            - Vector count is the total number of rows in the table
        """
        # Get table row count using unified SQL builder
        from .sql_builder import build_select_query

        sql = build_select_query(table_name=self.table_name, select_columns=["COUNT(*)"])
        count_result = self.client.execute(sql)
        total_vector_count = count_result.rows[0][0] if count_result.rows else 0

        index_info = self._get_index_info()

        return {
            "dimension": index_info.get("dimensions", 0),
            "index_fullness": 0.0,  # Not applicable to MatrixOne
            "total_vector_count": total_vector_count,
            "namespaces": {"": {"vector_count": total_vector_count}},
        }

    async def describe_index_stats_async(self) -> Dict[str, Any]:
        """
        Async version of describe_index_stats method.

        Returns:

            Dictionary with index statistics
        """
        # Get table row count using unified SQL builder
        from .sql_builder import build_select_query

        sql = build_select_query(table_name=self.table_name, select_columns=["COUNT(*)"])
        count_result = await self.client.execute(sql)
        total_vector_count = count_result.rows[0][0] if count_result.rows else 0

        index_info = await self._get_index_info_async()

        return {
            "dimension": index_info.get("dimensions", 0),
            "index_fullness": 0.0,  # Not applicable to MatrixOne
            "total_vector_count": total_vector_count,
            "namespaces": {"": {"vector_count": total_vector_count}},
        }

    def upsert(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Upsert vectors into the index (IVF index only).

        Inserts new vectors or updates existing ones based on the primary key.
        This operation is only supported for IVF indexes, not HNSW indexes.

        Args:

            vectors: List of vector dictionaries to upsert. Each vector dict must contain:
                - Primary key field: Value for the primary key column (required)
                - Vector field: Vector values as a list of floats (required)
                - Additional fields: Any metadata fields to store
            namespace: Namespace identifier (not used in MatrixOne, kept for compatibility)

        Returns:

            Dict: Statistics about the upsert operation:
                - upserted_count: Number of vectors successfully upserted

        Example:

            # Upsert vectors with metadata
            vectors = [
                {
                    "id": "doc1",  # Primary key field
                    "embedding": [0.1, 0.2, 0.3, 0.4],  # Vector field
                    "title": "Document 1",
                    "category": "technology",
                    "price": 99.99
                },
                {
                    "id": "doc2",
                    "embedding": [0.5, 0.6, 0.7, 0.8],
                    "title": "Document 2",
                    "category": "science",
                    "price": 149.99
                }
            ]
            result = index.upsert(vectors)
            print(f"Upserted {result['upserted_count']} vectors")

        Raises:

            ValueError: If the index type is HNSW (not supported for upsert operations)
            ValueError: If a vector is missing required fields (primary key or vector)
            RuntimeError: If used with async client (use upsert_async instead)

        Note:

            - Only IVF indexes support upsert operations
            - HNSW indexes are read-only and do not support upsert/delete
            - Vector dimensions must match the index configuration
            - Primary key values must be unique within the table
        """
        if not vectors:
            return {"upserted_count": 0}

        # Get the actual primary key column name
        id_column = self._get_id_column()

        # Process each vector individually for proper upsert behavior
        for vector in vectors:
            # Check if primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check if vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare data - use all fields from the vector dict
            data = dict(vector)

            # Build upsert SQL using INSERT ... ON DUPLICATE KEY UPDATE
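            # For example (table and field names illustrative), the row
            # {"id": "doc1", "embedding": [0.1, 0.2], "title": "Document 1"}
            # produces:
            #   INSERT INTO my_table (id, embedding, title)
            #   VALUES ('doc1', '[0.1,0.2]', 'Document 1')
            #   ON DUPLICATE KEY UPDATE embedding = VALUES(embedding), title = VALUES(title)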
            columns = list(data.keys())
            columns_str = ", ".join(columns)

            # Format values - use proper vector format
            formatted_values = []
            for col in columns:
                value = data[col]
                if isinstance(value, list):
                    # Format vector as string with proper escaping
                    vector_str = "[" + ",".join(map(str, value)) + "]"
                    formatted_values.append(f"'{vector_str}'")
                else:
                    formatted_values.append(f"'{value}'")
            values_str = "(" + ", ".join(formatted_values) + ")"

            # Build ON DUPLICATE KEY UPDATE clause
            update_clauses = []
            for col in columns:
                if col != id_column:  # Don't update the primary key
                    update_clauses.append(f"{col} = VALUES({col})")
            update_str = ", ".join(update_clauses)

            # Execute upsert SQL
            sql = (
                f"INSERT INTO {self.table_name} ({columns_str}) VALUES {values_str} "
                f"ON DUPLICATE KEY UPDATE {update_str}"
            )
            self.client.execute(sql)

        return {"upserted_count": len(vectors)}

    async def upsert_async(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Async version of upsert method.

        Args:

            vectors: List of vectors to upsert. Each vector should be a dict with:
                - Primary key field: Value for the primary key column (required)
                - Vector field: Vector values (required)
                - Other fields: Any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with upsert statistics
        """
        if not vectors:
            return {"upserted_count": 0}

        # Get the actual primary key column name
        id_column = await self._get_id_column_async()

        # Process each vector individually for proper upsert behavior
        for vector in vectors:
            # Check if primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check if vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare data - use all fields from the vector dict
            data = dict(vector)

            # Build upsert SQL using INSERT ... ON DUPLICATE KEY UPDATE
            columns = list(data.keys())
            columns_str = ", ".join(columns)

            # Format values - use proper vector format
            formatted_values = []
            for col in columns:
                value = data[col]
                if isinstance(value, list):
                    # Format vector as string with proper escaping
                    vector_str = "[" + ",".join(map(str, value)) + "]"
                    formatted_values.append(f"'{vector_str}'")
                else:
                    formatted_values.append(f"'{value}'")
            values_str = "(" + ", ".join(formatted_values) + ")"

            # Build ON DUPLICATE KEY UPDATE clause
            update_clauses = []
            for col in columns:
                if col != id_column:  # Don't update the primary key
                    update_clauses.append(f"{col} = VALUES({col})")
            update_str = ", ".join(update_clauses)

            # Execute upsert SQL
            sql = (
                f"INSERT INTO {self.table_name} ({columns_str}) VALUES {values_str} "
                f"ON DUPLICATE KEY UPDATE {update_str}"
            )
            await self.client.execute(sql)

        return {"upserted_count": len(vectors)}

    def batch_insert(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Batch insert vectors (Pinecone-compatible API).

        Args:

            vectors: List of vectors to insert. Each vector should be a dict with:
                - Primary key field: Value for the primary key column (required)
                - Vector field: Vector values (required)
                - Other fields: Any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with insert statistics
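
            Example (illustrative field names; the primary key and vector
            columns must match the actual table schema):

                index.batch_insert([
                    {"id": 1, "embedding": [0.1, 0.2, 0.3], "title": "Doc 1"},
                    {"id": 2, "embedding": [0.4, 0.5, 0.6], "title": "Doc 2"},
                ])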
        """
        if not vectors:
            return {"inserted_count": 0}

        # Get the actual primary key column name
        id_column = self._get_id_column()

        # Prepare data for batch insert
        batch_data = []
        for vector in vectors:
            # Check if primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check if vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare row data
            row_data = dict(vector)
            batch_data.append(row_data)

        # Use client's batch_insert method
        self.client.batch_insert(self.table_name, batch_data)

        return {"inserted_count": len(vectors)}

    async def batch_insert_async(self, vectors: List[Dict[str, Any]], namespace: str = ""):
        """
        Async version of batch_insert method.

        Args:

            vectors: List of vectors to insert. Each vector should be a dict with:
                - Primary key field: Value for the primary key column (required)
                - Vector field: Vector values (required)
                - Other fields: Any additional metadata fields
            namespace: Namespace (not used in MatrixOne)

        Returns:

            Dict with insert statistics
        """
        if not vectors:
            return {"inserted_count": 0}

        # Get the actual primary key column name
        id_column = await self._get_id_column_async()

        # Prepare data for batch insert
        batch_data = []
        for vector in vectors:
            # Check if primary key field exists
            if id_column not in vector:
                raise ValueError(f"Each vector must have '{id_column}' field (primary key)")

            # Check if vector field exists
            if self.vector_column not in vector:
                raise ValueError(f"Each vector must have '{self.vector_column}' field (vector values)")

            # Prepare row data
            row_data = dict(vector)
            batch_data.append(row_data)

        # Use client's batch_insert_async method
        await self.client.batch_insert_async(self.table_name, batch_data)

        return {"inserted_count": len(vectors)}
+ return {"inserted_count": len(vectors)}