vectordb-bench 0.0.19__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in that registry.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +55 -45
- vectordb_bench/backend/clients/__init__.py +75 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +111 -70
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
- vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
- vectordb_bench/backend/clients/alloydb/cli.py +51 -34
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +5 -9
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +38 -36
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
- vectordb_bench/backend/clients/milvus/cli.py +41 -83
- vectordb_bench/backend/clients/milvus/config.py +18 -8
- vectordb_bench/backend/clients/milvus/milvus.py +18 -19
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +94 -58
- vectordb_bench/backend/clients/test/cli.py +1 -2
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +4 -5
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +30 -18
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +85 -34
- vectordb_bench/backend/runner/rate_runner.py +30 -19
- vectordb_bench/backend/runner/read_write_runner.py +51 -23
- vectordb_bench/backend/runner/serial_runner.py +91 -48
- vectordb_bench/backend/runner/util.py +4 -3
- vectordb_bench/backend/task_runner.py +92 -72
- vectordb_bench/backend/utils.py +17 -10
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +56 -26
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +22 -15
- vectordb_bench-0.0.20.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.19.dist-info/RECORD +0 -135
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0

vectordb_bench/backend/clients/__init__.py

@@ -1,12 +1,12 @@
 from enum import Enum
-
+
 from .api import (
-    VectorDB,
-    DBConfig,
     DBCaseConfig,
+    DBConfig,
     EmptyDBCaseConfig,
     IndexType,
     MetricType,
+    VectorDB,
 )
 
 
@@ -41,200 +41,255 @@ class DB(Enum):
     Test = "test"
     AliyunOpenSearch = "AliyunOpenSearch"
 
-
     @property
-    def init_cls(self) ->
+    def init_cls(self) -> type[VectorDB]:  # noqa: PLR0911, PLR0912
        """Import while in use"""
        if self == DB.Milvus:
            from .milvus.milvus import Milvus
+
            return Milvus
 
        if self == DB.ZillizCloud:
            from .zilliz_cloud.zilliz_cloud import ZillizCloud
+
            return ZillizCloud
 
        if self == DB.Pinecone:
            from .pinecone.pinecone import Pinecone
+
            return Pinecone
 
        if self == DB.ElasticCloud:
            from .elastic_cloud.elastic_cloud import ElasticCloud
+
            return ElasticCloud
 
        if self == DB.QdrantCloud:
            from .qdrant_cloud.qdrant_cloud import QdrantCloud
+
            return QdrantCloud
 
        if self == DB.WeaviateCloud:
            from .weaviate_cloud.weaviate_cloud import WeaviateCloud
+
            return WeaviateCloud
 
        if self == DB.PgVector:
            from .pgvector.pgvector import PgVector
+
            return PgVector
 
        if self == DB.PgVectoRS:
            from .pgvecto_rs.pgvecto_rs import PgVectoRS
+
            return PgVectoRS
-
+
        if self == DB.PgVectorScale:
            from .pgvectorscale.pgvectorscale import PgVectorScale
+
            return PgVectorScale
 
        if self == DB.PgDiskANN:
            from .pgdiskann.pgdiskann import PgDiskANN
+
            return PgDiskANN
 
        if self == DB.Redis:
            from .redis.redis import Redis
+
            return Redis
-
+
        if self == DB.MemoryDB:
            from .memorydb.memorydb import MemoryDB
+
            return MemoryDB
 
        if self == DB.Chroma:
            from .chroma.chroma import ChromaClient
+
            return ChromaClient
 
        if self == DB.AWSOpenSearch:
            from .aws_opensearch.aws_opensearch import AWSOpenSearch
+
            return AWSOpenSearch
-
+
        if self == DB.AlloyDB:
            from .alloydb.alloydb import AlloyDB
+
            return AlloyDB
 
        if self == DB.AliyunElasticsearch:
            from .aliyun_elasticsearch.aliyun_elasticsearch import AliyunElasticsearch
+
            return AliyunElasticsearch
 
        if self == DB.AliyunOpenSearch:
            from .aliyun_opensearch.aliyun_opensearch import AliyunOpenSearch
+
            return AliyunOpenSearch
 
+        msg = f"Unknown DB: {self.name}"
+        raise ValueError(msg)
+
    @property
-    def config_cls(self) ->
+    def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912
        """Import while in use"""
        if self == DB.Milvus:
            from .milvus.config import MilvusConfig
+
            return MilvusConfig
 
        if self == DB.ZillizCloud:
            from .zilliz_cloud.config import ZillizCloudConfig
+
            return ZillizCloudConfig
 
        if self == DB.Pinecone:
            from .pinecone.config import PineconeConfig
+
            return PineconeConfig
 
        if self == DB.ElasticCloud:
            from .elastic_cloud.config import ElasticCloudConfig
+
            return ElasticCloudConfig
 
        if self == DB.QdrantCloud:
            from .qdrant_cloud.config import QdrantConfig
+
            return QdrantConfig
 
        if self == DB.WeaviateCloud:
            from .weaviate_cloud.config import WeaviateConfig
+
            return WeaviateConfig
 
        if self == DB.PgVector:
            from .pgvector.config import PgVectorConfig
+
            return PgVectorConfig
 
        if self == DB.PgVectoRS:
            from .pgvecto_rs.config import PgVectoRSConfig
+
            return PgVectoRSConfig
 
        if self == DB.PgVectorScale:
            from .pgvectorscale.config import PgVectorScaleConfig
+
            return PgVectorScaleConfig
 
        if self == DB.PgDiskANN:
            from .pgdiskann.config import PgDiskANNConfig
+
            return PgDiskANNConfig
 
        if self == DB.Redis:
            from .redis.config import RedisConfig
+
            return RedisConfig
-
+
        if self == DB.MemoryDB:
            from .memorydb.config import MemoryDBConfig
+
            return MemoryDBConfig
 
        if self == DB.Chroma:
            from .chroma.config import ChromaConfig
+
            return ChromaConfig
 
        if self == DB.AWSOpenSearch:
            from .aws_opensearch.config import AWSOpenSearchConfig
+
            return AWSOpenSearchConfig
-
+
        if self == DB.AlloyDB:
            from .alloydb.config import AlloyDBConfig
+
            return AlloyDBConfig
 
        if self == DB.AliyunElasticsearch:
            from .aliyun_elasticsearch.config import AliyunElasticsearchConfig
+
            return AliyunElasticsearchConfig
 
        if self == DB.AliyunOpenSearch:
            from .aliyun_opensearch.config import AliyunOpenSearchConfig
+
            return AliyunOpenSearchConfig
 
-
+        msg = f"Unknown DB: {self.name}"
+        raise ValueError(msg)
+
+    def case_config_cls(  # noqa: PLR0911
+        self,
+        index_type: IndexType | None = None,
+    ) -> type[DBCaseConfig]:
        if self == DB.Milvus:
            from .milvus.config import _milvus_case_config
+
            return _milvus_case_config.get(index_type)
 
        if self == DB.ZillizCloud:
            from .zilliz_cloud.config import AutoIndexConfig
+
            return AutoIndexConfig
 
        if self == DB.ElasticCloud:
            from .elastic_cloud.config import ElasticCloudIndexConfig
+
            return ElasticCloudIndexConfig
 
        if self == DB.QdrantCloud:
            from .qdrant_cloud.config import QdrantIndexConfig
+
            return QdrantIndexConfig
 
        if self == DB.WeaviateCloud:
            from .weaviate_cloud.config import WeaviateIndexConfig
+
            return WeaviateIndexConfig
 
        if self == DB.PgVector:
            from .pgvector.config import _pgvector_case_config
+
            return _pgvector_case_config.get(index_type)
 
        if self == DB.PgVectoRS:
            from .pgvecto_rs.config import _pgvecto_rs_case_config
+
            return _pgvecto_rs_case_config.get(index_type)
 
        if self == DB.AWSOpenSearch:
            from .aws_opensearch.config import AWSOpenSearchIndexConfig
+
            return AWSOpenSearchIndexConfig
 
        if self == DB.PgVectorScale:
            from .pgvectorscale.config import _pgvectorscale_case_config
+
            return _pgvectorscale_case_config.get(index_type)
 
        if self == DB.PgDiskANN:
            from .pgdiskann.config import _pgdiskann_case_config
+
            return _pgdiskann_case_config.get(index_type)
-
+
        if self == DB.AlloyDB:
            from .alloydb.config import _alloydb_case_config
+
            return _alloydb_case_config.get(index_type)
 
        if self == DB.AliyunElasticsearch:
            from .elastic_cloud.config import ElasticCloudIndexConfig
+
            return ElasticCloudIndexConfig
 
        if self == DB.AliyunOpenSearch:
            from .aliyun_opensearch.config import AliyunOpenSearchIndexConfig
+
            return AliyunOpenSearchIndexConfig
 
        # DB.Pinecone, DB.Chroma, DB.Redis
@@ -242,5 +297,11 @@ class DB(Enum):
 
 
 __all__ = [
-    "DB",
+    "DB",
+    "DBCaseConfig",
+    "DBConfig",
+    "EmptyDBCaseConfig",
+    "IndexType",
+    "MetricType",
+    "VectorDB",
 ]
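
The lazy-import pattern above means a caller only pays the import cost of the backend it actually selects, and 0.0.20 now raises a ValueError instead of silently falling through when an enum member is unhandled. A minimal usage sketch (illustrative only, not part of the diff; it assumes the package is installed as vectordb_bench):

from vectordb_bench.backend.clients import DB

db = DB.ZillizCloud
client_cls = db.init_cls        # triggers the deferred `from .zilliz_cloud.zilliz_cloud import ZillizCloud`
config_cls = db.config_cls      # triggers the deferred `from .zilliz_cloud.config import ZillizCloudConfig`
case_cfg_cls = db.case_config_cls()  # AutoIndexConfig for ZillizCloud; index_type defaults to None

print(client_cls.__name__, config_cls.__name__, case_cfg_cls.__name__)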

vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py

@@ -1,5 +1,5 @@
-from ..elastic_cloud.elastic_cloud import ElasticCloud
 from ..elastic_cloud.config import ElasticCloudIndexConfig
+from ..elastic_cloud.elastic_cloud import ElasticCloud
 
 
 class AliyunElasticsearch(ElasticCloud):
@@ -24,4 +24,3 @@ class AliyunElasticsearch(ElasticCloud):
            drop_old=drop_old,
            **kwargs,
        )
-

vectordb_bench/backend/clients/aliyun_elasticsearch/config.py

@@ -1,7 +1,6 @@
-from
-from pydantic import SecretStr, BaseModel
+from pydantic import BaseModel, SecretStr
 
-from ..api import DBConfig
+from ..api import DBConfig
 
 
 class AliyunElasticsearchConfig(DBConfig, BaseModel):
@@ -14,6 +13,6 @@ class AliyunElasticsearchConfig(DBConfig, BaseModel):
 
    def to_dict(self) -> dict:
        return {
-            "hosts": [{
+            "hosts": [{"scheme": self.scheme, "host": self.host, "port": self.port}],
            "basic_auth": (self.user, self.password.get_secret_value()),
        }
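
For clarity, a stand-alone sketch of the dict shape that AliyunElasticsearchConfig.to_dict() now builds with the single-line "hosts" entry. It uses a hypothetical stand-in model with example values, not the package's own class:

from pydantic import BaseModel, SecretStr


class EsConfigSketch(BaseModel):
    # Example values only; the real config reads these from user input.
    scheme: str = "http"
    host: str = "es.example.internal"
    port: int = 9200
    user: str = "elastic"
    password: SecretStr = SecretStr("secret")

    def to_dict(self) -> dict:
        # Same shape as AliyunElasticsearchConfig.to_dict() after the change above.
        return {
            "hosts": [{"scheme": self.scheme, "host": self.host, "port": self.port}],
            "basic_auth": (self.user, self.password.get_secret_value()),
        }


print(EsConfigSketch().to_dict())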

vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py

@@ -1,32 +1,32 @@
 import json
 import logging
-from contextlib import contextmanager
 import time
+from contextlib import contextmanager
 
+from alibabacloud_ha3engine_vector import client, models
 from alibabacloud_ha3engine_vector.models import QueryRequest
-
-from ..api import VectorDB, MetricType
-from .config import AliyunOpenSearchIndexConfig
-
-from alibabacloud_searchengine20211025.client import Client as searchengineClient
 from alibabacloud_searchengine20211025 import models as searchengine_models
+from alibabacloud_searchengine20211025.client import Client as searchengineClient
 from alibabacloud_tea_openapi import models as open_api_models
-
+
+from ..api import MetricType, VectorDB
+from .config import AliyunOpenSearchIndexConfig
 
 log = logging.getLogger(__name__)
 
 ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH = 2 * 1024 * 1024  # 2MB
 ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH = 100
 
+
 class AliyunOpenSearch(VectorDB):
    def __init__(
-
-
-
-
-
-
-
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: AliyunOpenSearchIndexConfig,
+        collection_name: str = "VectorDBBenchCollection",
+        drop_old: bool = False,
+        **kwargs,
    ):
        self.control_client = None
        self.dim = dim
@@ -41,14 +41,17 @@ class AliyunOpenSearch(VectorDB):
        self._index_name = "vector_idx"
 
        self.batch_size = int(
-            min(
+            min(
+                ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH / (dim * 25),
+                ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH,
+            ),
        )
 
        log.info(f"Aliyun_OpenSearch client config: {self.db_config}")
        control_config = open_api_models.Config(
            access_key_id=self.db_config["ak"],
            access_key_secret=self.db_config["sk"],
-            endpoint=self.db_config["control_host"]
+            endpoint=self.db_config["control_host"],
        )
        self.control_client = searchengineClient(control_config)
 
@@ -67,7 +70,7 @@ class AliyunOpenSearch(VectorDB):
        create_table_request.field_schema = {
            self._primary_field: "INT64",
            self._vector_field: "MULTI_FLOAT",
-            self._scalar_field: "INT64"
+            self._scalar_field: "INT64",
        }
        vector_index = searchengine_models.ModifyTableRequestVectorIndex()
        vector_index.index_name = self._index_name
@@ -77,8 +80,25 @@ class AliyunOpenSearch(VectorDB):
        vector_index.vector_index_type = "HNSW"
 
        advance_params = searchengine_models.ModifyTableRequestVectorIndexAdvanceParams()
-
-
+        str_max_neighbor_count = f'"proxima.hnsw.builder.max_neighbor_count":{self.case_config.M}'
+        str_efc = f'"proxima.hnsw.builder.efconstruction":{self.case_config.ef_construction}'
+        str_enable_adsampling = '"proxima.hnsw.builder.enable_adsampling":true'
+        str_slack_pruning_factor = '"proxima.hnsw.builder.slack_pruning_factor":1.1'
+        str_thread_count = '"proxima.hnsw.builder.thread_count":16'
+
+        params = ",".join(
+            [
+                str_max_neighbor_count,
+                str_efc,
+                str_enable_adsampling,
+                str_slack_pruning_factor,
+                str_thread_count,
+            ],
+        )
+        advance_params.build_index_params = params
+        advance_params.search_index_params = (
+            '{"proxima.hnsw.searcher.ef":400,"proxima.hnsw.searcher.dynamic_termination.prob_threshold":0.7}'
+        )
        vector_index.advance_params = advance_params
        create_table_request.vector_index = [vector_index]
 
@@ -88,7 +108,7 @@ class AliyunOpenSearch(VectorDB):
        except Exception as error:
            log.info(error.message)
            log.info(error.data.get("Recommend"))
-            log.info(f"Failed to create index: error: {
+            log.info(f"Failed to create index: error: {error!s}")
            raise error from None
 
        # check if index create success
@@ -102,22 +122,22 @@ class AliyunOpenSearch(VectorDB):
            log.info(f"begin to {retry_times} times get table")
            retry_times += 1
            response = client.get_table(self.instance_id, self.collection_name)
-            if response.body.result.status ==
+            if response.body.result.status == "IN_USE":
                log.info(f"{self.collection_name} table begin to use.")
                return
 
    def _index_exists(self, client: searchengineClient) -> bool:
        try:
            client.get_table(self.instance_id, self.collection_name)
-
-
-            log.info(f'get table from searchengine error')
-            log.info(error.message)
+        except Exception as err:
+            log.warning(f"get table from searchengine error, err={err}")
            return False
+        else:
+            return True
 
    # check if index build success, Insert the embeddings to the vector database after index build success
    def _index_build_success(self, client: searchengineClient) -> None:
-        log.info(
+        log.info("begin to check if table build success.")
        time.sleep(50)
 
        retry_times = 0
@@ -139,9 +159,9 @@ class AliyunOpenSearch(VectorDB):
                    cur_fsm = fsm
                    break
            if cur_fsm is None:
-
+                log.warning("no build index fsm")
                return
-            if "
+            if cur_fsm["status"] == "success":
                return
 
    def _modify_index(self, client: searchengineClient) -> None:
@@ -154,7 +174,7 @@ class AliyunOpenSearch(VectorDB):
        modify_table_request.field_schema = {
            self._primary_field: "INT64",
            self._vector_field: "MULTI_FLOAT",
-            self._scalar_field: "INT64"
+            self._scalar_field: "INT64",
        }
        vector_index = searchengine_models.ModifyTableRequestVectorIndex()
        vector_index.index_name = self._index_name
@@ -163,19 +183,41 @@ class AliyunOpenSearch(VectorDB):
        vector_index.vector_field = self._vector_field
        vector_index.vector_index_type = "HNSW"
        advance_params = searchengine_models.ModifyTableRequestVectorIndexAdvanceParams()
-
-
+
+        str_max_neighbor_count = f'"proxima.hnsw.builder.max_neighbor_count":{self.case_config.M}'
+        str_efc = f'"proxima.hnsw.builder.efconstruction":{self.case_config.ef_construction}'
+        str_enable_adsampling = '"proxima.hnsw.builder.enable_adsampling":true'
+        str_slack_pruning_factor = '"proxima.hnsw.builder.slack_pruning_factor":1.1'
+        str_thread_count = '"proxima.hnsw.builder.thread_count":16'
+
+        params = ",".join(
+            [
+                str_max_neighbor_count,
+                str_efc,
+                str_enable_adsampling,
+                str_slack_pruning_factor,
+                str_thread_count,
+            ],
+        )
+        advance_params.build_index_params = params
+        advance_params.search_index_params = (
+            '{"proxima.hnsw.searcher.ef":400,"proxima.hnsw.searcher.dynamic_termination.prob_threshold":0.7}'
+        )
        vector_index.advance_params = advance_params
 
        modify_table_request.vector_index = [vector_index]
 
        try:
-            response = client.modify_table(
+            response = client.modify_table(
+                self.instance_id,
+                self.collection_name,
+                modify_table_request,
+            )
            log.info(f"modify table success: {response.body}")
        except Exception as error:
            log.info(error.message)
            log.info(error.data.get("Recommend"))
-            log.info(f"Failed to modify index: error: {
+            log.info(f"Failed to modify index: error: {error!s}")
            raise error from None
 
    # check if modify index & delete data fsm success
@@ -185,15 +227,14 @@ class AliyunOpenSearch(VectorDB):
    def _get_total_count(self):
        try:
            response = self.client.stats(self.collection_name)
+        except Exception as e:
+            log.warning(f"Error querying index: {e}")
+        else:
            body = json.loads(response.body)
            log.info(f"stats info: {response.body}")
 
            if "result" in body and "totalDocCount" in body.get("result"):
                return body.get("result").get("totalDocCount")
-            else:
-                return 0
-        except Exception as e:
-            print(f"Error querying index: {e}")
            return 0
 
    @contextmanager
@@ -203,21 +244,20 @@ class AliyunOpenSearch(VectorDB):
            endpoint=self.db_config["host"],
            protocol="http",
            access_user_name=self.db_config["user"],
-            access_pass_word=self.db_config["password"]
+            access_pass_word=self.db_config["password"],
        )
 
        self.client = client.Client(config)
 
        yield
-        # self.client.transport.close()
        self.client = None
        del self.client
 
    def insert_embeddings(
-
-
-
-
+        self,
+        embeddings: list[list[float]],
+        metadata: list[int],
+        **kwargs,
    ) -> tuple[int, Exception]:
        """Insert the embeddings to the opensearch."""
        assert self.client is not None, "should self.init() first"
@@ -226,25 +266,24 @@ class AliyunOpenSearch(VectorDB):
 
        try:
            for batch_start_offset in range(0, len(embeddings), self.batch_size):
-                batch_end_offset = min(
-                    batch_start_offset + self.batch_size, len(embeddings)
-                )
+                batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
                documents = []
                for i in range(batch_start_offset, batch_end_offset):
-
+                    document_fields = {
                        self._primary_field: metadata[i],
                        self._vector_field: embeddings[i],
                        self._scalar_field: metadata[i],
-                        "ops_build_channel": "inc"
-                    }
-                    document = {
-                        "fields": documentFields,
-                        "cmd": "add"
+                        "ops_build_channel": "inc",
                    }
+                    document = {"fields": document_fields, "cmd": "add"}
                    documents.append(document)
 
-
-                self.client.push_documents(
+                push_doc_req = models.PushDocumentsRequest({}, documents)
+                self.client.push_documents(
+                    self.collection_name,
+                    self._primary_field,
+                    push_doc_req,
+                )
                insert_count += batch_end_offset - batch_start_offset
        except Exception as e:
            log.info(f"Failed to insert data: {e}")
@@ -252,33 +291,36 @@ class AliyunOpenSearch(VectorDB):
        return (insert_count, None)
 
    def search_embedding(
-
-
-
-
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
    ) -> list[int]:
        assert self.client is not None, "should self.init() first"
-        search_params =
+        search_params = '{"proxima.hnsw.searcher.ef":' + str(self.case_config.ef_search) + "}"
 
        os_filter = f"{self._scalar_field} {filters.get('metadata')}" if filters else ""
 
        try:
-            request = QueryRequest(
-
-
-
+            request = QueryRequest(
+                table_name=self.collection_name,
+                vector=query,
+                top_k=k,
+                search_params=search_params,
+                filter=os_filter,
+            )
            result = self.client.query(request)
        except Exception as e:
            log.info(f"Error querying index: {e}")
-            raise e
-
-
-
+            raise e from e
+        else:
+            res = json.loads(result.body)
+            return [one_res["id"] for one_res in res["result"]]
 
    def need_normalize_cosine(self) -> bool:
        """Wheather this database need to normalize dataset to support COSINE"""
        if self.case_config.metric_type == MetricType.COSINE:
-            log.info(
+            log.info("cosine dataset need normalize.")
            return True
 
        return False
@@ -296,9 +338,8 @@ class AliyunOpenSearch(VectorDB):
            total_count = self._get_total_count()
            # check if the data is inserted
            if total_count == data_size:
-                log.info(
+                log.info("optimize table finish.")
                return
 
    def ready_to_load(self):
        """ready_to_load will be called before load in load cases."""
-        pass