pyobvector 0.2.15__tar.gz → 0.2.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {pyobvector-0.2.15 → pyobvector-0.2.17}/PKG-INFO +65 -7
  2. {pyobvector-0.2.15 → pyobvector-0.2.17}/README.md +59 -3
  3. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/__init__.py +3 -0
  4. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/collection_schema.py +6 -6
  5. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/exceptions.py +4 -4
  6. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/fts_index_param.py +2 -3
  7. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/index_param.py +21 -8
  8. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/milvus_like_client.py +126 -90
  9. pyobvector-0.2.17/pyobvector/client/ob_client.py +459 -0
  10. pyobvector-0.2.17/pyobvector/client/ob_vec_client.py +522 -0
  11. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/schema_type.py +4 -2
  12. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/__init__.py +3 -0
  13. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/dialect.py +3 -0
  14. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/reflection.py +1 -1
  15. pyobvector-0.2.17/pyobvector/schema/sparse_vector.py +35 -0
  16. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/vector_index.py +1 -1
  17. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/util/__init__.py +3 -1
  18. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/util/ob_version.py +1 -1
  19. pyobvector-0.2.17/pyobvector/util/sparse_vector.py +48 -0
  20. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/util/vector.py +10 -4
  21. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyproject.toml +3 -3
  22. pyobvector-0.2.15/pyobvector/client/ob_vec_client.py +0 -862
  23. {pyobvector-0.2.15 → pyobvector-0.2.17}/LICENSE +0 -0
  24. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/__init__.py +0 -0
  25. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/enum.py +0 -0
  26. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/ob_vec_json_table_client.py +0 -0
  27. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/client/partitions.py +0 -0
  28. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/json_table/__init__.py +0 -0
  29. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/json_table/json_value_returning_func.py +0 -0
  30. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/json_table/oceanbase_dialect.py +0 -0
  31. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/json_table/virtual_data_type.py +0 -0
  32. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/array.py +0 -0
  33. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/full_text_index.py +0 -0
  34. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/geo_srid_point.py +0 -0
  35. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/gis_func.py +0 -0
  36. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/match_against_func.py +0 -0
  37. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/ob_table.py +0 -0
  38. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/replace_stmt.py +0 -0
  39. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/vec_dist_func.py +0 -0
  40. {pyobvector-0.2.15 → pyobvector-0.2.17}/pyobvector/schema/vector.py +0 -0
@@ -1,7 +1,8 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: pyobvector
3
- Version: 0.2.15
3
+ Version: 0.2.17
4
4
  Summary: A python SDK for OceanBase Vector Store, based on SQLAlchemy, compatible with Milvus API.
5
+ License-File: LICENSE
5
6
  Author: shanhaikang.shk
6
7
  Author-email: shanhaikang.shk@oceanbase.com
7
8
  Requires-Python: >=3.9,<4.0
@@ -11,12 +12,13 @@ Classifier: Programming Language :: Python :: 3.10
11
12
  Classifier: Programming Language :: Python :: 3.11
12
13
  Classifier: Programming Language :: Python :: 3.12
13
14
  Classifier: Programming Language :: Python :: 3.13
14
- Requires-Dist: aiomysql (>=0.2.0,<0.3.0)
15
+ Classifier: Programming Language :: Python :: 3.14
16
+ Requires-Dist: aiomysql (>=0.3.2,<0.4.0)
15
17
  Requires-Dist: numpy (>=1.17.0,<2.0.0)
16
18
  Requires-Dist: pydantic (>=2.7.0,<3)
17
19
  Requires-Dist: pymysql (>=1.1.1,<2.0.0)
18
20
  Requires-Dist: sqlalchemy (>=1.4,<=3)
19
- Requires-Dist: sqlglot (>=26.0.1,<27.0.0)
21
+ Requires-Dist: sqlglot (>=26.0.1)
20
22
  Description-Content-Type: text/markdown
21
23
 
22
24
  # pyobvector
@@ -36,7 +38,7 @@ poetry install
36
38
  - install with pip:
37
39
 
38
40
  ```shell
39
- pip install pyobvector==0.2.14
41
+ pip install pyobvector==0.2.17
40
42
  ```
41
43
 
42
44
  ## Build Doc
@@ -174,19 +176,75 @@ client.insert(test_collection_name, data=data1)
174
176
  - do ann search:
175
177
 
176
178
  ```python
177
- # perform ann search
179
+ # perform ann search with basic column selection
178
180
  res = self.client.ann_search(
179
181
  test_collection_name,
180
182
  vec_data=[0,0,0],
181
183
  vec_column_name='embedding',
182
184
  distance_func=l2_distance,
183
185
  topk=5,
184
- output_column_names=['id']
186
+ output_column_names=['id'] # Legacy parameter
185
187
  )
186
188
  # For example, the result will be:
187
189
  # [(112,), (111,), (10,), (11,), (12,)]
190
+
191
+ # perform ann search with SQLAlchemy expressions (recommended)
192
+ from sqlalchemy import Table, text, func
193
+
194
+ table = Table(test_collection_name, client.metadata_obj, autoload_with=client.engine)
195
+ res = self.client.ann_search(
196
+ test_collection_name,
197
+ vec_data=[0,0,0],
198
+ vec_column_name='embedding',
199
+ distance_func=l2_distance,
200
+ topk=5,
201
+ output_columns=[
202
+ table.c.id,
203
+ table.c.meta,
204
+ (table.c.id + 1000).label('id_plus_1000'),
205
+ text("JSON_EXTRACT(meta, '$.key') as extracted_key")
206
+ ]
207
+ )
208
+ # For example, the result will be:
209
+ # [(112, '{"key": "value"}', 1112, 'value'), ...]
210
+
211
+ # perform ann search with distance threshold (filter results by distance)
212
+ res = self.client.ann_search(
213
+ test_collection_name,
214
+ vec_data=[0,0,0],
215
+ vec_column_name='embedding',
216
+ distance_func=l2_distance,
217
+ with_dist=True,
218
+ topk=10,
219
+ output_column_names=['id'],
220
+ distance_threshold=0.5 # Only return results where distance <= 0.5
221
+ )
222
+ # Only returns results with distance <= 0.5
223
+ # For example, the result will be:
224
+ # [(10, 0.0), (11, 0.0), ...] # Only includes results with distance <= 0.5
188
225
  ```
189
226
 
227
+ #### ann_search Parameters
228
+
229
+ The `ann_search` method supports flexible output column selection through the `output_columns` parameter:
230
+
231
+ - **`output_columns`** (recommended): Accepts SQLAlchemy Column objects, expressions, or a mix of both
232
+ - Column objects: `table.c.id`, `table.c.name`
233
+ - Expressions: `(table.c.age + 10).label('age_plus_10')`
234
+ - JSON queries: `text("JSON_EXTRACT(meta, '$.key') as extracted_key")`
235
+ - String functions: `func.concat(table.c.name, ' (', table.c.age, ')').label('name_age')`
236
+
237
+ - **`output_column_names`** (legacy): Accepts list of column name strings
238
+ - Example: `['id', 'name', 'meta']`
239
+
240
+ - **Parameter Priority**: `output_columns` takes precedence over `output_column_names` when both are provided
241
+
242
+ - **`distance_threshold`** (optional): Filter results by distance threshold
243
+ - Type: `Optional[float]`
244
+ - Only returns results where `distance <= threshold`
245
+ - Example: `distance_threshold=0.5` returns only results with distance <= 0.5
246
+ - Use case: Quality control for similarity search, only return highly similar results
247
+
190
248
  - If you want to use the pure `SQLAlchemy` API with the `OceanBase` dialect, you can get an `SQLAlchemy.engine` via `client.engine`. The engine can also be created as follows:
191
249
 
192
250
  ```python
@@ -15,7 +15,7 @@ poetry install
15
15
  - install with pip:
16
16
 
17
17
  ```shell
18
- pip install pyobvector==0.2.14
18
+ pip install pyobvector==0.2.17
19
19
  ```
20
20
 
21
21
  ## Build Doc
@@ -153,19 +153,75 @@ client.insert(test_collection_name, data=data1)
153
153
  - do ann search:
154
154
 
155
155
  ```python
156
- # perform ann search
156
+ # perform ann search with basic column selection
157
157
  res = self.client.ann_search(
158
158
  test_collection_name,
159
159
  vec_data=[0,0,0],
160
160
  vec_column_name='embedding',
161
161
  distance_func=l2_distance,
162
162
  topk=5,
163
- output_column_names=['id']
163
+ output_column_names=['id'] # Legacy parameter
164
164
  )
165
165
  # For example, the result will be:
166
166
  # [(112,), (111,), (10,), (11,), (12,)]
167
+
168
+ # perform ann search with SQLAlchemy expressions (recommended)
169
+ from sqlalchemy import Table, text, func
170
+
171
+ table = Table(test_collection_name, client.metadata_obj, autoload_with=client.engine)
172
+ res = self.client.ann_search(
173
+ test_collection_name,
174
+ vec_data=[0,0,0],
175
+ vec_column_name='embedding',
176
+ distance_func=l2_distance,
177
+ topk=5,
178
+ output_columns=[
179
+ table.c.id,
180
+ table.c.meta,
181
+ (table.c.id + 1000).label('id_plus_1000'),
182
+ text("JSON_EXTRACT(meta, '$.key') as extracted_key")
183
+ ]
184
+ )
185
+ # For example, the result will be:
186
+ # [(112, '{"key": "value"}', 1112, 'value'), ...]
187
+
188
+ # perform ann search with distance threshold (filter results by distance)
189
+ res = self.client.ann_search(
190
+ test_collection_name,
191
+ vec_data=[0,0,0],
192
+ vec_column_name='embedding',
193
+ distance_func=l2_distance,
194
+ with_dist=True,
195
+ topk=10,
196
+ output_column_names=['id'],
197
+ distance_threshold=0.5 # Only return results where distance <= 0.5
198
+ )
199
+ # Only returns results with distance <= 0.5
200
+ # For example, the result will be:
201
+ # [(10, 0.0), (11, 0.0), ...] # Only includes results with distance <= 0.5
167
202
  ```
168
203
 
204
+ #### ann_search Parameters
205
+
206
+ The `ann_search` method supports flexible output column selection through the `output_columns` parameter:
207
+
208
+ - **`output_columns`** (recommended): Accepts SQLAlchemy Column objects, expressions, or a mix of both
209
+ - Column objects: `table.c.id`, `table.c.name`
210
+ - Expressions: `(table.c.age + 10).label('age_plus_10')`
211
+ - JSON queries: `text("JSON_EXTRACT(meta, '$.key') as extracted_key")`
212
+ - String functions: `func.concat(table.c.name, ' (', table.c.age, ')').label('name_age')`
213
+
214
+ - **`output_column_names`** (legacy): Accepts list of column name strings
215
+ - Example: `['id', 'name', 'meta']`
216
+
217
+ - **Parameter Priority**: `output_columns` takes precedence over `output_column_names` when both are provided
218
+
219
+ - **`distance_threshold`** (optional): Filter results by distance threshold
220
+ - Type: `Optional[float]`
221
+ - Only returns results where `distance <= threshold`
222
+ - Example: `distance_threshold=0.5` returns only results with distance <= 0.5
223
+ - Use case: Quality control for similarity search, only return highly similar results
224
+
169
225
  - If you want to use the pure `SQLAlchemy` API with the `OceanBase` dialect, you can get an `SQLAlchemy.engine` via `client.engine`. The engine can also be created as follows:
170
226
 
171
227
  ```python
@@ -14,6 +14,7 @@ In this mode, you can regard `pyobvector` as an extension of SQLAlchemy.
14
14
  * IndexParams A list of IndexParam to create vector index in batch
15
15
  * DataType Specify field type in collection schema for MilvusLikeClient
16
16
  * VECTOR An extended data type in SQLAlchemy for ObVecClient
17
+ * SPARSE_VECTOR An extended data type in SQLAlchemy for ObVecClient
17
18
  * VectorIndex An extended index type in SQLAlchemy for ObVecClient
18
19
  * FtsIndex Full Text Search Index
19
20
  * FieldSchema Class to define field schema in collection for MilvusLikeClient
@@ -43,6 +44,7 @@ from .client import *
43
44
  from .schema import (
44
45
  ARRAY,
45
46
  VECTOR,
47
+ SPARSE_VECTOR,
46
48
  POINT,
47
49
  VectorIndex,
48
50
  OceanBaseDialect,
@@ -70,6 +72,7 @@ __all__ = [
70
72
  "DataType",
71
73
  "ARRAY",
72
74
  "VECTOR",
75
+ "SPARSE_VECTOR",
73
76
  "POINT",
74
77
  "VectorIndex",
75
78
  "FtsIndex",
@@ -79,14 +79,14 @@ class FieldSchema:
79
79
  if "max_length" not in self.kwargs:
80
80
  raise VarcharFieldParamException(
81
81
  code=ErrorCode.INVALID_ARGUMENT,
82
- message=ExceptionsMessage.VarcharFieldMissinglengthParam,
82
+ message=ExceptionsMessage.VarcharFieldMissingLengthParam,
83
83
  )
84
84
  self.type_params["length"] = self.kwargs["max_length"]
85
85
  elif self.dtype == DataType.ARRAY:
86
86
  if "element_type" not in self.kwargs:
87
87
  raise ArrayFieldParamException(
88
88
  code=ErrorCode.INVALID_ARGUMENT,
89
- message=ExceptionsMessage.ArrayFiledMissingElementType,
89
+ message=ExceptionsMessage.ArrayFieldMissingElementType,
90
90
  )
91
91
  if self.kwargs["element_type"] in (
92
92
  DataType.ARRAY,
@@ -95,7 +95,7 @@ class FieldSchema:
95
95
  ):
96
96
  raise ArrayFieldParamException(
97
97
  code=ErrorCode.INVALID_ARGUMENT,
98
- message=ExceptionsMessage.ArrayFiledInvalidElementType,
98
+ message=ExceptionsMessage.ArrayFieldInvalidElementType,
99
99
  )
100
100
 
101
101
  self.type_params["item_type"] = convert_datatype_to_sqltype(
@@ -147,9 +147,9 @@ class CollectionSchema:
147
147
  """Add field to collection.
148
148
 
149
149
  Args:
150
- :param field_name (string) : new field name
151
- :param datatype (DataType) : field data type
152
- :param kwargs : parameters for data type
150
+ field_name (string): new field name
151
+ datatype (DataType): field data type
152
+ **kwargs: parameters for data type
153
153
  """
154
154
  field = FieldSchema(field_name, datatype, **kwargs)
155
155
  cur_idx = len(self.fields)
@@ -101,14 +101,14 @@ class ExceptionsMessage:
101
101
  )
102
102
  PrimaryFieldType = "Param primary_field must be int or str type."
103
103
  VectorFieldMissingDimParam = "Param 'dim' must be set for vector field."
104
- VarcharFieldMissinglengthParam = "Param 'max_length' must be set for varchar field."
105
- ArrayFiledMissingElementType = "Param 'element_type' must be set for array field."
106
- ArrayFiledInvalidElementType = (
104
+ VarcharFieldMissingLengthParam = "Param 'max_length' must be set for varchar field."
105
+ ArrayFieldMissingElementType = "Param 'element_type' must be set for array field."
106
+ ArrayFieldInvalidElementType = (
107
107
  "Param 'element_type' can not be array/vector/varchar."
108
108
  )
109
109
  CollectionNotExists = "Collection does not exist."
110
110
  MetricTypeParamTypeInvalid = "MetricType param type should be string."
111
- MetricTypeValueInvalid = "MetricType should be 'l2'/'ip' in ann search."
111
+ MetricTypeValueInvalid = "MetricType should be 'l2'/'ip'/'neg_ip'/'cosine' in ann search."
112
112
  UsingInIDsWhenMultiPrimaryKey = "Using 'ids' when table has multi primary key."
113
113
  ClusterVersionIsLow = (
114
114
  "OceanBase Vector Store is not supported because cluster version is below 4.3.3.0."
@@ -18,13 +18,12 @@ class FtsIndexParam:
18
18
  self.field_names = field_names
19
19
  self.parser_type = parser_type
20
20
 
21
- def param_str(self) -> str:
22
- if self.parser_type is None:
23
- return None
21
+ def param_str(self) -> str | None:
24
22
  if self.parser_type == FtsParser.IK:
25
23
  return "ik"
26
24
  if self.parser_type == FtsParser.NGRAM:
27
25
  return "ngram"
26
+ return None
28
27
 
29
28
  def __iter__(self):
30
29
  yield "index_name", self.index_name
@@ -9,7 +9,7 @@ class VecIndexType(Enum):
9
9
  IVFFLAT = 2
10
10
  IVFSQ = 3
11
11
  IVFPQ = 4
12
-
12
+ DAAT = 5
13
13
 
14
14
  class IndexParam:
15
15
  """Vector index parameters.
@@ -31,6 +31,7 @@ class IndexParam:
31
31
  IVFFLAT_ALGO_NAME = "ivf_flat"
32
32
  IVFSQ_ALGO_NAME = "ivf_sq8"
33
33
  IVFPQ_ALGO_NAME = "ivf_pq"
34
+ DAAT_ALGO_NAME = "daat"
34
35
 
35
36
  def __init__(
36
37
  self, index_name: str, field_name: str, index_type: Union[VecIndexType, str], **kwargs
@@ -57,6 +58,11 @@ class IndexParam:
57
58
  return self.index_type in [
58
59
  IndexParam.IVFPQ_ALGO_NAME,
59
60
  ]
61
+
62
+ def is_index_type_sparse_vector(self):
63
+ return self.index_type in [
64
+ IndexParam.DAAT_ALGO_NAME,
65
+ ]
60
66
 
61
67
  def _get_vector_index_type_str(self):
62
68
  """Parse vector index type to string."""
@@ -71,6 +77,8 @@ class IndexParam:
71
77
  return IndexParam.IVFSQ_ALGO_NAME
72
78
  elif self.index_type == VecIndexType.IVFPQ:
73
79
  return IndexParam.IVFPQ_ALGO_NAME
80
+ elif self.index_type == VecIndexType.DAAT:
81
+ return IndexParam.DAAT_ALGO_NAME
74
82
  raise ValueError(f"unsupported vector index type: {self.index_type}")
75
83
  assert isinstance(self.index_type, str)
76
84
  index_type = self.index_type.lower()
@@ -80,6 +88,7 @@ class IndexParam:
80
88
  IndexParam.IVFFLAT_ALGO_NAME,
81
89
  IndexParam.IVFSQ_ALGO_NAME,
82
90
  IndexParam.IVFPQ_ALGO_NAME,
91
+ IndexParam.DAAT_ALGO_NAME,
83
92
  ]:
84
93
  raise ValueError(f"unsupported vector index type: {self.index_type}")
85
94
  return index_type
@@ -124,15 +133,19 @@ class IndexParam:
124
133
  ob_params['ef_construction'] = params['efConstruction']
125
134
  if 'efSearch' in params:
126
135
  ob_params['ef_search'] = params['efSearch']
136
+
137
+ if self.is_index_type_sparse_vector() and ob_params['distance'] != 'inner_product':
138
+ raise ValueError("Metric type should be 'inner_product' for sparse vector index.")
127
139
  return ob_params
128
140
 
129
141
  def param_str(self):
130
142
  """Parse vector index parameters to string."""
131
143
  ob_param = self._parse_kwargs()
132
144
  partial_str = ",".join([f"{k}={v}" for k, v in ob_param.items()])
133
- if len(partial_str) > 0:
134
- partial_str += ","
135
- partial_str += f"type={self.index_type}"
145
+ if not self.is_index_type_sparse_vector():
146
+ if len(partial_str) > 0:
147
+ partial_str += ","
148
+ partial_str += f"type={self.index_type}"
136
149
  return partial_str
137
150
 
138
151
  def __iter__(self):
@@ -165,10 +178,10 @@ class IndexParams:
165
178
  """Add `IndexParam` to `IndexParams`
166
179
 
167
180
  Args:
168
- :param field_name (string) : vector index built on which field
169
- :param index_type (VecIndexType) :
170
- vector index algorithms (Only HNSW supported)
171
- :param index_name (string) : vector index name
181
+ field_name (string): vector index built on which field
182
+ index_type (VecIndexType): vector index algorithm (e.g. HNSW, IVFFLAT, IVFSQ, IVFPQ, DAAT)
183
+ index_name (string): vector index name
184
+ **kwargs: additional parameters for different index types
172
185
  """
173
186
  index_param = IndexParam(index_name, field_name, index_type, **kwargs)
174
187
  pair_key = (field_name, index_name)