vectordb-bench 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +56 -46
- vectordb_bench/backend/clients/__init__.py +101 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +26 -0
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +18 -0
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +345 -0
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +47 -0
- vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
- vectordb_bench/backend/clients/alloydb/cli.py +52 -35
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +8 -9
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +38 -36
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
- vectordb_bench/backend/clients/milvus/cli.py +62 -80
- vectordb_bench/backend/clients/milvus/config.py +31 -7
- vectordb_bench/backend/clients/milvus/milvus.py +23 -26
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +94 -58
- vectordb_bench/backend/clients/test/cli.py +1 -2
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +4 -5
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +30 -18
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +85 -34
- vectordb_bench/backend/runner/rate_runner.py +51 -23
- vectordb_bench/backend/runner/read_write_runner.py +140 -46
- vectordb_bench/backend/runner/serial_runner.py +99 -50
- vectordb_bench/backend/runner/util.py +4 -19
- vectordb_bench/backend/task_runner.py +95 -74
- vectordb_bench/backend/utils.py +17 -9
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +108 -83
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +56 -26
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +34 -42
- vectordb_bench-0.0.20.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.18.dist-info/RECORD +0 -131
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,12 @@
|
|
1
1
|
from enum import Enum
|
2
|
-
|
2
|
+
|
3
3
|
from .api import (
|
4
|
-
VectorDB,
|
5
|
-
DBConfig,
|
6
4
|
DBCaseConfig,
|
5
|
+
DBConfig,
|
7
6
|
EmptyDBCaseConfig,
|
8
7
|
IndexType,
|
9
8
|
MetricType,
|
9
|
+
VectorDB,
|
10
10
|
)
|
11
11
|
|
12
12
|
|
@@ -37,184 +37,271 @@ class DB(Enum):
|
|
37
37
|
MemoryDB = "MemoryDB"
|
38
38
|
Chroma = "Chroma"
|
39
39
|
AWSOpenSearch = "OpenSearch"
|
40
|
+
AliyunElasticsearch = "AliyunElasticsearch"
|
40
41
|
Test = "test"
|
41
|
-
|
42
|
+
AliyunOpenSearch = "AliyunOpenSearch"
|
42
43
|
|
43
44
|
@property
|
44
|
-
def init_cls(self) ->
|
45
|
+
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912
|
45
46
|
"""Import while in use"""
|
46
47
|
if self == DB.Milvus:
|
47
48
|
from .milvus.milvus import Milvus
|
49
|
+
|
48
50
|
return Milvus
|
49
51
|
|
50
52
|
if self == DB.ZillizCloud:
|
51
53
|
from .zilliz_cloud.zilliz_cloud import ZillizCloud
|
54
|
+
|
52
55
|
return ZillizCloud
|
53
56
|
|
54
57
|
if self == DB.Pinecone:
|
55
58
|
from .pinecone.pinecone import Pinecone
|
59
|
+
|
56
60
|
return Pinecone
|
57
61
|
|
58
62
|
if self == DB.ElasticCloud:
|
59
63
|
from .elastic_cloud.elastic_cloud import ElasticCloud
|
64
|
+
|
60
65
|
return ElasticCloud
|
61
66
|
|
62
67
|
if self == DB.QdrantCloud:
|
63
68
|
from .qdrant_cloud.qdrant_cloud import QdrantCloud
|
69
|
+
|
64
70
|
return QdrantCloud
|
65
71
|
|
66
72
|
if self == DB.WeaviateCloud:
|
67
73
|
from .weaviate_cloud.weaviate_cloud import WeaviateCloud
|
74
|
+
|
68
75
|
return WeaviateCloud
|
69
76
|
|
70
77
|
if self == DB.PgVector:
|
71
78
|
from .pgvector.pgvector import PgVector
|
79
|
+
|
72
80
|
return PgVector
|
73
81
|
|
74
82
|
if self == DB.PgVectoRS:
|
75
83
|
from .pgvecto_rs.pgvecto_rs import PgVectoRS
|
84
|
+
|
76
85
|
return PgVectoRS
|
77
|
-
|
86
|
+
|
78
87
|
if self == DB.PgVectorScale:
|
79
88
|
from .pgvectorscale.pgvectorscale import PgVectorScale
|
89
|
+
|
80
90
|
return PgVectorScale
|
81
91
|
|
82
92
|
if self == DB.PgDiskANN:
|
83
93
|
from .pgdiskann.pgdiskann import PgDiskANN
|
94
|
+
|
84
95
|
return PgDiskANN
|
85
96
|
|
86
97
|
if self == DB.Redis:
|
87
98
|
from .redis.redis import Redis
|
99
|
+
|
88
100
|
return Redis
|
89
|
-
|
101
|
+
|
90
102
|
if self == DB.MemoryDB:
|
91
103
|
from .memorydb.memorydb import MemoryDB
|
104
|
+
|
92
105
|
return MemoryDB
|
93
106
|
|
94
107
|
if self == DB.Chroma:
|
95
108
|
from .chroma.chroma import ChromaClient
|
109
|
+
|
96
110
|
return ChromaClient
|
97
111
|
|
98
112
|
if self == DB.AWSOpenSearch:
|
99
113
|
from .aws_opensearch.aws_opensearch import AWSOpenSearch
|
114
|
+
|
100
115
|
return AWSOpenSearch
|
101
|
-
|
116
|
+
|
102
117
|
if self == DB.AlloyDB:
|
103
118
|
from .alloydb.alloydb import AlloyDB
|
119
|
+
|
104
120
|
return AlloyDB
|
105
121
|
|
122
|
+
if self == DB.AliyunElasticsearch:
|
123
|
+
from .aliyun_elasticsearch.aliyun_elasticsearch import AliyunElasticsearch
|
124
|
+
|
125
|
+
return AliyunElasticsearch
|
126
|
+
|
127
|
+
if self == DB.AliyunOpenSearch:
|
128
|
+
from .aliyun_opensearch.aliyun_opensearch import AliyunOpenSearch
|
129
|
+
|
130
|
+
return AliyunOpenSearch
|
131
|
+
|
132
|
+
msg = f"Unknown DB: {self.name}"
|
133
|
+
raise ValueError(msg)
|
134
|
+
|
106
135
|
@property
|
107
|
-
def config_cls(self) ->
|
136
|
+
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912
|
108
137
|
"""Import while in use"""
|
109
138
|
if self == DB.Milvus:
|
110
139
|
from .milvus.config import MilvusConfig
|
140
|
+
|
111
141
|
return MilvusConfig
|
112
142
|
|
113
143
|
if self == DB.ZillizCloud:
|
114
144
|
from .zilliz_cloud.config import ZillizCloudConfig
|
145
|
+
|
115
146
|
return ZillizCloudConfig
|
116
147
|
|
117
148
|
if self == DB.Pinecone:
|
118
149
|
from .pinecone.config import PineconeConfig
|
150
|
+
|
119
151
|
return PineconeConfig
|
120
152
|
|
121
153
|
if self == DB.ElasticCloud:
|
122
154
|
from .elastic_cloud.config import ElasticCloudConfig
|
155
|
+
|
123
156
|
return ElasticCloudConfig
|
124
157
|
|
125
158
|
if self == DB.QdrantCloud:
|
126
159
|
from .qdrant_cloud.config import QdrantConfig
|
160
|
+
|
127
161
|
return QdrantConfig
|
128
162
|
|
129
163
|
if self == DB.WeaviateCloud:
|
130
164
|
from .weaviate_cloud.config import WeaviateConfig
|
165
|
+
|
131
166
|
return WeaviateConfig
|
132
167
|
|
133
168
|
if self == DB.PgVector:
|
134
169
|
from .pgvector.config import PgVectorConfig
|
170
|
+
|
135
171
|
return PgVectorConfig
|
136
172
|
|
137
173
|
if self == DB.PgVectoRS:
|
138
174
|
from .pgvecto_rs.config import PgVectoRSConfig
|
175
|
+
|
139
176
|
return PgVectoRSConfig
|
140
177
|
|
141
178
|
if self == DB.PgVectorScale:
|
142
179
|
from .pgvectorscale.config import PgVectorScaleConfig
|
180
|
+
|
143
181
|
return PgVectorScaleConfig
|
144
182
|
|
145
183
|
if self == DB.PgDiskANN:
|
146
184
|
from .pgdiskann.config import PgDiskANNConfig
|
185
|
+
|
147
186
|
return PgDiskANNConfig
|
148
187
|
|
149
188
|
if self == DB.Redis:
|
150
189
|
from .redis.config import RedisConfig
|
190
|
+
|
151
191
|
return RedisConfig
|
152
|
-
|
192
|
+
|
153
193
|
if self == DB.MemoryDB:
|
154
194
|
from .memorydb.config import MemoryDBConfig
|
195
|
+
|
155
196
|
return MemoryDBConfig
|
156
197
|
|
157
198
|
if self == DB.Chroma:
|
158
199
|
from .chroma.config import ChromaConfig
|
200
|
+
|
159
201
|
return ChromaConfig
|
160
202
|
|
161
203
|
if self == DB.AWSOpenSearch:
|
162
204
|
from .aws_opensearch.config import AWSOpenSearchConfig
|
205
|
+
|
163
206
|
return AWSOpenSearchConfig
|
164
|
-
|
207
|
+
|
165
208
|
if self == DB.AlloyDB:
|
166
209
|
from .alloydb.config import AlloyDBConfig
|
210
|
+
|
167
211
|
return AlloyDBConfig
|
168
212
|
|
169
|
-
|
213
|
+
if self == DB.AliyunElasticsearch:
|
214
|
+
from .aliyun_elasticsearch.config import AliyunElasticsearchConfig
|
215
|
+
|
216
|
+
return AliyunElasticsearchConfig
|
217
|
+
|
218
|
+
if self == DB.AliyunOpenSearch:
|
219
|
+
from .aliyun_opensearch.config import AliyunOpenSearchConfig
|
220
|
+
|
221
|
+
return AliyunOpenSearchConfig
|
222
|
+
|
223
|
+
msg = f"Unknown DB: {self.name}"
|
224
|
+
raise ValueError(msg)
|
225
|
+
|
226
|
+
def case_config_cls( # noqa: PLR0911
|
227
|
+
self,
|
228
|
+
index_type: IndexType | None = None,
|
229
|
+
) -> type[DBCaseConfig]:
|
170
230
|
if self == DB.Milvus:
|
171
231
|
from .milvus.config import _milvus_case_config
|
232
|
+
|
172
233
|
return _milvus_case_config.get(index_type)
|
173
234
|
|
174
235
|
if self == DB.ZillizCloud:
|
175
236
|
from .zilliz_cloud.config import AutoIndexConfig
|
237
|
+
|
176
238
|
return AutoIndexConfig
|
177
239
|
|
178
240
|
if self == DB.ElasticCloud:
|
179
241
|
from .elastic_cloud.config import ElasticCloudIndexConfig
|
242
|
+
|
180
243
|
return ElasticCloudIndexConfig
|
181
244
|
|
182
245
|
if self == DB.QdrantCloud:
|
183
246
|
from .qdrant_cloud.config import QdrantIndexConfig
|
247
|
+
|
184
248
|
return QdrantIndexConfig
|
185
249
|
|
186
250
|
if self == DB.WeaviateCloud:
|
187
251
|
from .weaviate_cloud.config import WeaviateIndexConfig
|
252
|
+
|
188
253
|
return WeaviateIndexConfig
|
189
254
|
|
190
255
|
if self == DB.PgVector:
|
191
256
|
from .pgvector.config import _pgvector_case_config
|
257
|
+
|
192
258
|
return _pgvector_case_config.get(index_type)
|
193
259
|
|
194
260
|
if self == DB.PgVectoRS:
|
195
261
|
from .pgvecto_rs.config import _pgvecto_rs_case_config
|
262
|
+
|
196
263
|
return _pgvecto_rs_case_config.get(index_type)
|
197
264
|
|
198
265
|
if self == DB.AWSOpenSearch:
|
199
266
|
from .aws_opensearch.config import AWSOpenSearchIndexConfig
|
267
|
+
|
200
268
|
return AWSOpenSearchIndexConfig
|
201
269
|
|
202
270
|
if self == DB.PgVectorScale:
|
203
271
|
from .pgvectorscale.config import _pgvectorscale_case_config
|
272
|
+
|
204
273
|
return _pgvectorscale_case_config.get(index_type)
|
205
274
|
|
206
275
|
if self == DB.PgDiskANN:
|
207
276
|
from .pgdiskann.config import _pgdiskann_case_config
|
277
|
+
|
208
278
|
return _pgdiskann_case_config.get(index_type)
|
209
|
-
|
279
|
+
|
210
280
|
if self == DB.AlloyDB:
|
211
281
|
from .alloydb.config import _alloydb_case_config
|
282
|
+
|
212
283
|
return _alloydb_case_config.get(index_type)
|
213
284
|
|
285
|
+
if self == DB.AliyunElasticsearch:
|
286
|
+
from .elastic_cloud.config import ElasticCloudIndexConfig
|
287
|
+
|
288
|
+
return ElasticCloudIndexConfig
|
289
|
+
|
290
|
+
if self == DB.AliyunOpenSearch:
|
291
|
+
from .aliyun_opensearch.config import AliyunOpenSearchIndexConfig
|
292
|
+
|
293
|
+
return AliyunOpenSearchIndexConfig
|
294
|
+
|
214
295
|
# DB.Pinecone, DB.Chroma, DB.Redis
|
215
296
|
return EmptyDBCaseConfig
|
216
297
|
|
217
298
|
|
218
299
|
# Names exported by this package: the DB enum plus the names imported from .api.
__all__ = [
    "DB",
    "DBCaseConfig",
    "DBConfig",
    "EmptyDBCaseConfig",
    "IndexType",
    "MetricType",
    "VectorDB",
]
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from ..elastic_cloud.config import ElasticCloudIndexConfig
|
2
|
+
from ..elastic_cloud.elastic_cloud import ElasticCloud
|
3
|
+
|
4
|
+
|
5
|
+
class AliyunElasticsearch(ElasticCloud):
    """Aliyun Elasticsearch client.

    Subclass of ``ElasticCloud`` that adds no behaviour of its own: the
    constructor forwards every argument unchanged to the parent class.
    """

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: ElasticCloudIndexConfig,
        indice: str = "vdb_bench_indice",  # must be lowercase
        id_col_name: str = "id",
        vector_col_name: str = "vector",
        drop_old: bool = False,
        **kwargs,
    ):
        """Forward all arguments, by keyword, to ``ElasticCloud.__init__``."""
        super().__init__(
            dim=dim,
            db_config=db_config,
            db_case_config=db_case_config,
            indice=indice,
            id_col_name=id_col_name,
            vector_col_name=vector_col_name,
            drop_old=drop_old,
            **kwargs,
        )
|
@@ -0,0 +1,18 @@
|
|
1
|
+
from pydantic import BaseModel, SecretStr
|
2
|
+
|
3
|
+
from ..api import DBConfig
|
4
|
+
|
5
|
+
|
6
|
+
class AliyunElasticsearchConfig(DBConfig, BaseModel):
    """Connection settings for an Aliyun Elasticsearch node."""

    #: Protocol in use to connect to the node
    scheme: str = "http"
    host: str = ""
    port: int = 9200
    user: str = "elastic"
    password: SecretStr

    def to_dict(self) -> dict:
        """Return the settings as a dict with ``hosts`` and ``basic_auth`` keys.

        ``hosts`` is a one-element list describing the node; ``basic_auth`` is
        a ``(user, password)`` tuple with the password unwrapped to plain text.
        """
        node = {"scheme": self.scheme, "host": self.host, "port": self.port}
        credentials = (self.user, self.password.get_secret_value())
        return {"hosts": [node], "basic_auth": credentials}
|
@@ -0,0 +1,345 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
import time
|
4
|
+
from contextlib import contextmanager
|
5
|
+
|
6
|
+
from alibabacloud_ha3engine_vector import client, models
|
7
|
+
from alibabacloud_ha3engine_vector.models import QueryRequest
|
8
|
+
from alibabacloud_searchengine20211025 import models as searchengine_models
|
9
|
+
from alibabacloud_searchengine20211025.client import Client as searchengineClient
|
10
|
+
from alibabacloud_tea_openapi import models as open_api_models
|
11
|
+
|
12
|
+
from ..api import MetricType, VectorDB
|
13
|
+
from .config import AliyunOpenSearchIndexConfig
|
14
|
+
|
15
|
+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Upper bounds used by AliyunOpenSearch.__init__ to derive its insert batch size:
# a byte budget per batch and a maximum number of documents per batch.
ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH = 2 * 1024 * 1024  # 2MB
ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH = 100
|
19
|
+
|
20
|
+
|
21
|
+
class AliyunOpenSearch(VectorDB):
    """VectorDB client for Aliyun OpenSearch.

    Two SDK clients are involved:

    * a control client (``searchengineClient``), built in ``__init__``, that
      creates/modifies the table and polls its state;
    * a data client (``alibabacloud_ha3engine_vector`` ``Client``), built by
      :meth:`init`, that pushes documents, runs queries and reads stats.
    """

    # db_config keys whose values are credentials and must not reach the log.
    _SENSITIVE_CONFIG_KEYS = frozenset({"ak", "sk", "password"})

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: AliyunOpenSearchIndexConfig,
        collection_name: str = "VectorDBBenchCollection",
        drop_old: bool = False,
        **kwargs,
    ):
        """Build the control client and, when ``drop_old`` is set, (re)build the table.

        Args:
            dim: vector dimension.
            db_config: connection settings; the keys read by this class are
                ``host``, ``ak``, ``sk``, ``control_host``, ``user`` and ``password``.
            db_case_config: index parameters (``M``, ``ef_construction``,
                ``ef_search``, ``metric_type``, ``distance_type()``).
            collection_name: table name used for all requests.
            drop_old: if true, modify the table when it already exists,
                otherwise create it. If false, the table is left untouched.
            **kwargs: accepted for interface compatibility; unused.
        """
        self.control_client = None
        # Data-plane client; only valid inside the ``init()`` context.
        self.client = None
        self.dim = dim
        self.db_config = db_config
        self.case_config = db_case_config
        self.collection_name = collection_name
        # The instance id is the first dot-separated label of the host,
        # with any URL scheme stripped.
        self.instance_id = db_config["host"].split(".")[0].replace("http://", "").replace("https://", "")

        self._primary_field = "id"
        self._scalar_field = "int_id"
        self._vector_field = "vector"
        self._index_name = "vector_idx"

        # Never let the batch size reach 0 (it is used as a range() step),
        # which the size-based bound would do for very large dimensions.
        self.batch_size = max(
            1,
            int(
                min(
                    ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH / (dim * 25),
                    ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH,
                ),
            ),
        )

        # Mask credentials before logging the configuration.
        safe_config = {
            key: ("***" if key in self._SENSITIVE_CONFIG_KEYS else value) for key, value in self.db_config.items()
        }
        log.info(f"Aliyun_OpenSearch client config: {safe_config}")
        control_config = open_api_models.Config(
            access_key_id=self.db_config["ak"],
            access_key_secret=self.db_config["sk"],
            endpoint=self.db_config["control_host"],
        )
        self.control_client = searchengineClient(control_config)

        if drop_old:
            log.info(f"aliyun_OpenSearch client drop old index: {self.collection_name}")
            if self._index_exists(self.control_client):
                self._modify_index(self.control_client)
            else:
                self._create_index(self.control_client)

    def _field_schema(self) -> dict:
        """Return the table field schema shared by create and modify requests."""
        return {
            self._primary_field: "INT64",
            self._vector_field: "MULTI_FLOAT",
            self._scalar_field: "INT64",
        }

    def _build_vector_index(self):
        """Return the HNSW vector-index definition shared by create and modify requests."""
        vector_index = searchengine_models.ModifyTableRequestVectorIndex()
        vector_index.index_name = self._index_name
        vector_index.dimension = self.dim
        vector_index.distance_type = self.case_config.distance_type()
        vector_index.vector_field = self._vector_field
        vector_index.vector_index_type = "HNSW"

        advance_params = searchengine_models.ModifyTableRequestVectorIndexAdvanceParams()
        # NOTE(review): unlike search_index_params below, this string is not
        # wrapped in braces; kept exactly as before - confirm the API expects it.
        advance_params.build_index_params = ",".join(
            [
                f'"proxima.hnsw.builder.max_neighbor_count":{self.case_config.M}',
                f'"proxima.hnsw.builder.efconstruction":{self.case_config.ef_construction}',
                '"proxima.hnsw.builder.enable_adsampling":true',
                '"proxima.hnsw.builder.slack_pruning_factor":1.1',
                '"proxima.hnsw.builder.thread_count":16',
            ],
        )
        advance_params.search_index_params = (
            '{"proxima.hnsw.searcher.ef":400,"proxima.hnsw.searcher.dynamic_termination.prob_threshold":0.7}'
        )
        vector_index.advance_params = advance_params
        return vector_index

    @staticmethod
    def _log_request_failure(action: str, error: Exception) -> None:
        """Log a failed control-plane request without assuming the exception type.

        ``message`` and ``data`` are read only when the exception carries them,
        so an unrelated exception is not replaced by an AttributeError raised
        from inside the error handler.
        """
        message = getattr(error, "message", None)
        if message is not None:
            log.info(message)
        data = getattr(error, "data", None)
        if isinstance(data, dict):
            log.info(data.get("Recommend"))
        log.info(f"Failed to {action} index: error: {error!s}")

    def _create_index(self, client: searchengineClient):
        """Create the table with its vector index, then wait until it is in use.

        Raises:
            Exception: whatever ``client.create_table`` raised, after logging.
        """
        create_table_request = searchengine_models.CreateTableRequest()
        create_table_request.name = self.collection_name
        create_table_request.primary_key = self._primary_field
        create_table_request.partition_count = 1
        create_table_request.field_schema = self._field_schema()
        create_table_request.vector_index = [self._build_vector_index()]

        try:
            response = client.create_table(self.instance_id, create_table_request)
            log.info(f"create table success: {response.body}")
        except Exception as error:
            self._log_request_failure("create", error)
            raise

        # check if index create success
        self._active_index(client)

    # check if index create success
    def _active_index(self, client: searchengineClient) -> None:
        """Poll the table every 10 seconds until its status is ``IN_USE``.

        NOTE: there is no retry limit; the loop only ends on ``IN_USE`` or
        when ``client.get_table`` raises.
        """
        retry_times = 0
        while True:
            time.sleep(10)
            log.info(f"begin to {retry_times} times get table")
            retry_times += 1
            response = client.get_table(self.instance_id, self.collection_name)
            if response.body.result.status == "IN_USE":
                log.info(f"{self.collection_name} table begin to use.")
                return

    def _index_exists(self, client: searchengineClient) -> bool:
        """Return True if ``get_table`` succeeds; any exception is treated as "not found"."""
        try:
            client.get_table(self.instance_id, self.collection_name)
        except Exception as err:
            log.warning(f"get table from searchengine error, err={err}")
            return False
        return True

    # check if index build success, Insert the embeddings to the vector database after index build success
    def _index_build_success(self, client: searchengineClient) -> None:
        """Wait for the table's ``datasource_flow_fsm`` task to report success.

        Sleeps 50 seconds up front, then every 10 seconds lists the tasks of
        the last hour and looks for the first ``datasource_flow_fsm`` whose
        ``fsmId`` contains the collection name. Returns when that task's status
        is ``success`` or when no such task exists.

        NOTE: there is no retry limit; a task that never reaches ``success``
        keeps this loop running.
        """
        log.info("begin to check if table build success.")
        time.sleep(50)

        retry_times = 0
        while True:
            time.sleep(10)
            log.info(f"begin to {retry_times} times get table fsm")
            retry_times += 1
            request = searchengine_models.ListTasksRequest()
            # Time window in milliseconds: from one hour ago until now.
            request.start = (int(time.time()) - 3600) * 1000
            request.end = int(time.time()) * 1000
            response = client.list_tasks(self.instance_id, request)
            cur_fsm = next(
                (
                    fsm
                    for fsm in response.body.result
                    if fsm["type"] == "datasource_flow_fsm" and self.collection_name in fsm["fsmId"]
                ),
                None,
            )
            if cur_fsm is None:
                log.warning("no build index fsm")
                return
            if cur_fsm["status"] == "success":
                return

    def _modify_index(self, client: searchengineClient) -> None:
        """Re-submit the table definition for an existing table and wait for the rebuild.

        Waits for the table to be ``IN_USE``, sends the modify request, then
        waits for the resulting build task to finish.

        Raises:
            Exception: whatever ``client.modify_table`` raised, after logging.
        """
        # check if index create success
        self._active_index(client)

        modify_table_request = searchengine_models.ModifyTableRequest()
        modify_table_request.partition_count = 1
        modify_table_request.primary_key = self._primary_field
        modify_table_request.field_schema = self._field_schema()
        modify_table_request.vector_index = [self._build_vector_index()]

        try:
            response = client.modify_table(
                self.instance_id,
                self.collection_name,
                modify_table_request,
            )
            log.info(f"modify table success: {response.body}")
        except Exception as error:
            self._log_request_failure("modify", error)
            raise

        # check if modify index & delete data fsm success
        self._index_build_success(client)

    # get collection records total count
    def _get_total_count(self) -> int | None:
        """Return ``result.totalDocCount`` from the table stats.

        Returns:
            The reported document count; 0 when the stats body has no
            ``result.totalDocCount``; ``None`` when the stats request failed.
        """
        try:
            response = self.client.stats(self.collection_name)
        except Exception as e:
            log.warning(f"Error querying index: {e}")
            return None

        body = json.loads(response.body)
        log.info(f"stats info: {response.body}")

        result = body.get("result")
        if isinstance(result, dict) and "totalDocCount" in result:
            return result["totalDocCount"]
        return 0

    @contextmanager
    def init(self) -> None:
        """connect to aliyun opensearch

        Context manager: creates the data client on entry and always clears it
        on exit, including when the ``with`` body raises.
        """
        config = models.Config(
            endpoint=self.db_config["host"],
            protocol="http",
            access_user_name=self.db_config["user"],
            access_pass_word=self.db_config["password"],
        )

        self.client = client.Client(config)
        try:
            yield
        finally:
            # Keep the attribute (as None) so the "should self.init() first"
            # assertions fail with their own message rather than AttributeError.
            self.client = None

    def insert_embeddings(
        self,
        embeddings: list[list[float]],
        metadata: list[int],
        **kwargs,
    ) -> tuple[int, Exception | None]:
        """Insert the embeddings to the opensearch.

        Documents are pushed in batches of ``self.batch_size``; each metadata
        value is written to both the primary-key field and the scalar field.

        Returns:
            ``(inserted_count, None)`` on success, or ``(inserted_count, error)``
            where ``inserted_count`` covers only the batches pushed before the
            failure.
        """
        assert self.client is not None, "should self.init() first"
        assert len(embeddings) == len(metadata)
        insert_count = 0

        try:
            for batch_start in range(0, len(embeddings), self.batch_size):
                batch_end = min(batch_start + self.batch_size, len(embeddings))
                documents = [
                    {
                        "fields": {
                            self._primary_field: metadata[i],
                            self._vector_field: embeddings[i],
                            self._scalar_field: metadata[i],
                            "ops_build_channel": "inc",
                        },
                        "cmd": "add",
                    }
                    for i in range(batch_start, batch_end)
                ]

                push_doc_req = models.PushDocumentsRequest({}, documents)
                self.client.push_documents(
                    self.collection_name,
                    self._primary_field,
                    push_doc_req,
                )
                insert_count += batch_end - batch_start
        except Exception as e:
            log.info(f"Failed to insert data: {e}")
            return (insert_count, e)
        return (insert_count, None)

    def search_embedding(
        self,
        query: list[float],
        k: int = 100,
        filters: dict | None = None,
    ) -> list[int]:
        """Return the ``id`` of each hit for ``query``.

        Args:
            query: query vector.
            k: number of results requested (``top_k``).
            filters: optional dict; when it has a truthy ``metadata`` entry,
                that expression is applied to the scalar field.

        Raises:
            Exception: whatever the query call raised, after logging.
        """
        assert self.client is not None, "should self.init() first"
        # Compact separators keep the payload identical to the hand-built string.
        search_params = json.dumps(
            {"proxima.hnsw.searcher.ef": self.case_config.ef_search},
            separators=(",", ":"),
        )

        # Only build a filter when a metadata expression is actually present,
        # so a filters dict without it does not yield "int_id None".
        metadata_expr = filters.get("metadata") if filters else None
        os_filter = f"{self._scalar_field} {metadata_expr}" if metadata_expr else ""

        try:
            request = QueryRequest(
                table_name=self.collection_name,
                vector=query,
                top_k=k,
                search_params=search_params,
                filter=os_filter,
            )
            result = self.client.query(request)
        except Exception as e:
            log.info(f"Error querying index: {e}")
            raise

        res = json.loads(result.body)
        return [one_res["id"] for one_res in res["result"]]

    def need_normalize_cosine(self) -> bool:
        """Whether this database needs the dataset normalized to support COSINE."""
        if self.case_config.metric_type == MetricType.COSINE:
            log.info("cosine dataset need normalize.")
            return True

        return False

    def optimize(self):
        """No-op; see ``optimize_with_size`` for the count-based wait."""

    def optimize_with_size(self, data_size: int):
        """Poll the stats every 10 seconds until the document count equals ``data_size``.

        NOTE: there is no retry limit; a failed stats request (count ``None``)
        or a count that never matches keeps this loop running.
        """
        log.info(f"optimize count: {data_size}")
        retry_times = 0
        while True:
            time.sleep(10)
            log.info(f"begin to {retry_times} times get optimize table")
            retry_times += 1
            total_count = self._get_total_count()
            # check if the data is inserted
            if total_count == data_size:
                log.info("optimize table finish.")
                return

    def ready_to_load(self):
        """ready_to_load will be called before load in load cases."""
|