vectordb-bench 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. vectordb_bench/__init__.py +49 -24
  2. vectordb_bench/__main__.py +4 -3
  3. vectordb_bench/backend/assembler.py +12 -13
  4. vectordb_bench/backend/cases.py +56 -46
  5. vectordb_bench/backend/clients/__init__.py +101 -14
  6. vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +26 -0
  7. vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +18 -0
  8. vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +345 -0
  9. vectordb_bench/backend/clients/aliyun_opensearch/config.py +47 -0
  10. vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
  11. vectordb_bench/backend/clients/alloydb/cli.py +52 -35
  12. vectordb_bench/backend/clients/alloydb/config.py +30 -30
  13. vectordb_bench/backend/clients/api.py +8 -9
  14. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
  15. vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
  16. vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
  17. vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
  18. vectordb_bench/backend/clients/chroma/chroma.py +38 -36
  19. vectordb_bench/backend/clients/chroma/config.py +4 -2
  20. vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
  21. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
  22. vectordb_bench/backend/clients/memorydb/cli.py +8 -8
  23. vectordb_bench/backend/clients/memorydb/config.py +2 -2
  24. vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
  25. vectordb_bench/backend/clients/milvus/cli.py +62 -80
  26. vectordb_bench/backend/clients/milvus/config.py +31 -7
  27. vectordb_bench/backend/clients/milvus/milvus.py +23 -26
  28. vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
  29. vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
  30. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
  31. vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
  32. vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
  33. vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
  34. vectordb_bench/backend/clients/pgvector/cli.py +40 -31
  35. vectordb_bench/backend/clients/pgvector/config.py +63 -73
  36. vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
  37. vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
  38. vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
  39. vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
  40. vectordb_bench/backend/clients/pinecone/config.py +1 -0
  41. vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
  42. vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
  43. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
  44. vectordb_bench/backend/clients/redis/cli.py +6 -12
  45. vectordb_bench/backend/clients/redis/config.py +7 -5
  46. vectordb_bench/backend/clients/redis/redis.py +94 -58
  47. vectordb_bench/backend/clients/test/cli.py +1 -2
  48. vectordb_bench/backend/clients/test/config.py +2 -2
  49. vectordb_bench/backend/clients/test/test.py +4 -5
  50. vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
  51. vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
  52. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
  53. vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
  54. vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
  55. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
  56. vectordb_bench/backend/data_source.py +30 -18
  57. vectordb_bench/backend/dataset.py +47 -27
  58. vectordb_bench/backend/result_collector.py +2 -3
  59. vectordb_bench/backend/runner/__init__.py +4 -6
  60. vectordb_bench/backend/runner/mp_runner.py +85 -34
  61. vectordb_bench/backend/runner/rate_runner.py +51 -23
  62. vectordb_bench/backend/runner/read_write_runner.py +140 -46
  63. vectordb_bench/backend/runner/serial_runner.py +99 -50
  64. vectordb_bench/backend/runner/util.py +4 -19
  65. vectordb_bench/backend/task_runner.py +95 -74
  66. vectordb_bench/backend/utils.py +17 -9
  67. vectordb_bench/base.py +0 -1
  68. vectordb_bench/cli/cli.py +65 -60
  69. vectordb_bench/cli/vectordbbench.py +6 -7
  70. vectordb_bench/frontend/components/check_results/charts.py +8 -19
  71. vectordb_bench/frontend/components/check_results/data.py +4 -16
  72. vectordb_bench/frontend/components/check_results/filters.py +8 -16
  73. vectordb_bench/frontend/components/check_results/nav.py +4 -4
  74. vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
  75. vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
  76. vectordb_bench/frontend/components/concurrent/charts.py +12 -12
  77. vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
  78. vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
  79. vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
  80. vectordb_bench/frontend/components/custom/initStyle.py +1 -1
  81. vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
  82. vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
  83. vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
  84. vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
  85. vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
  86. vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
  87. vectordb_bench/frontend/components/tables/data.py +3 -6
  88. vectordb_bench/frontend/config/dbCaseConfigs.py +108 -83
  89. vectordb_bench/frontend/pages/concurrent.py +3 -5
  90. vectordb_bench/frontend/pages/custom.py +30 -9
  91. vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
  92. vectordb_bench/frontend/pages/run_test.py +3 -7
  93. vectordb_bench/frontend/utils.py +1 -1
  94. vectordb_bench/frontend/vdb_benchmark.py +4 -6
  95. vectordb_bench/interface.py +56 -26
  96. vectordb_bench/log_util.py +59 -64
  97. vectordb_bench/metric.py +10 -11
  98. vectordb_bench/models.py +26 -43
  99. {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +34 -42
  100. vectordb_bench-0.0.20.dist-info/RECORD +135 -0
  101. {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
  102. vectordb_bench-0.0.18.dist-info/RECORD +0 -131
  103. {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
  104. {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
  105. {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/clients/__init__.py
@@ -1,12 +1,12 @@
 from enum import Enum
-from typing import Type
+
 from .api import (
-    VectorDB,
-    DBConfig,
     DBCaseConfig,
+    DBConfig,
     EmptyDBCaseConfig,
     IndexType,
     MetricType,
+    VectorDB,
 )
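This hunk drops the `typing.Type` import because the annotations below switch to the builtin generic `type[...]`, available since Python 3.9 (PEP 585), and re-sorts the `.api` imports alphabetically. A minimal sketch of the annotation change:

    from typing import Type

    def old_factory() -> Type["VectorDB"]: ...  # pre-3.9 spelling, needs the import

    def new_factory() -> type["VectorDB"]: ...  # builtin generic, no import needed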
 
vectordb_bench/backend/clients/__init__.py
@@ -37,184 +37,271 @@ class DB(Enum):
     MemoryDB = "MemoryDB"
     Chroma = "Chroma"
     AWSOpenSearch = "OpenSearch"
+    AliyunElasticsearch = "AliyunElasticsearch"
     Test = "test"
-
+    AliyunOpenSearch = "AliyunOpenSearch"
 
     @property
-    def init_cls(self) -> Type[VectorDB]:
+    def init_cls(self) -> type[VectorDB]:  # noqa: PLR0911, PLR0912
         """Import while in use"""
         if self == DB.Milvus:
             from .milvus.milvus import Milvus
+
             return Milvus
 
         if self == DB.ZillizCloud:
             from .zilliz_cloud.zilliz_cloud import ZillizCloud
+
             return ZillizCloud
 
         if self == DB.Pinecone:
             from .pinecone.pinecone import Pinecone
+
             return Pinecone
 
         if self == DB.ElasticCloud:
             from .elastic_cloud.elastic_cloud import ElasticCloud
+
             return ElasticCloud
 
         if self == DB.QdrantCloud:
             from .qdrant_cloud.qdrant_cloud import QdrantCloud
+
             return QdrantCloud
 
         if self == DB.WeaviateCloud:
             from .weaviate_cloud.weaviate_cloud import WeaviateCloud
+
             return WeaviateCloud
 
         if self == DB.PgVector:
             from .pgvector.pgvector import PgVector
+
             return PgVector
 
         if self == DB.PgVectoRS:
             from .pgvecto_rs.pgvecto_rs import PgVectoRS
+
             return PgVectoRS
-
+
         if self == DB.PgVectorScale:
             from .pgvectorscale.pgvectorscale import PgVectorScale
+
             return PgVectorScale
 
         if self == DB.PgDiskANN:
             from .pgdiskann.pgdiskann import PgDiskANN
+
             return PgDiskANN
 
         if self == DB.Redis:
             from .redis.redis import Redis
+
             return Redis
-
+
         if self == DB.MemoryDB:
             from .memorydb.memorydb import MemoryDB
+
             return MemoryDB
 
         if self == DB.Chroma:
             from .chroma.chroma import ChromaClient
+
             return ChromaClient
 
         if self == DB.AWSOpenSearch:
             from .aws_opensearch.aws_opensearch import AWSOpenSearch
+
             return AWSOpenSearch
-
+
         if self == DB.AlloyDB:
             from .alloydb.alloydb import AlloyDB
+
             return AlloyDB
 
+        if self == DB.AliyunElasticsearch:
+            from .aliyun_elasticsearch.aliyun_elasticsearch import AliyunElasticsearch
+
+            return AliyunElasticsearch
+
+        if self == DB.AliyunOpenSearch:
+            from .aliyun_opensearch.aliyun_opensearch import AliyunOpenSearch
+
+            return AliyunOpenSearch
+
+        msg = f"Unknown DB: {self.name}"
+        raise ValueError(msg)
+
     @property
-    def config_cls(self) -> Type[DBConfig]:
+    def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912
         """Import while in use"""
         if self == DB.Milvus:
             from .milvus.config import MilvusConfig
+
             return MilvusConfig
 
         if self == DB.ZillizCloud:
             from .zilliz_cloud.config import ZillizCloudConfig
+
             return ZillizCloudConfig
 
         if self == DB.Pinecone:
             from .pinecone.config import PineconeConfig
+
             return PineconeConfig
 
         if self == DB.ElasticCloud:
             from .elastic_cloud.config import ElasticCloudConfig
+
             return ElasticCloudConfig
 
         if self == DB.QdrantCloud:
             from .qdrant_cloud.config import QdrantConfig
+
             return QdrantConfig
 
         if self == DB.WeaviateCloud:
             from .weaviate_cloud.config import WeaviateConfig
+
             return WeaviateConfig
 
         if self == DB.PgVector:
             from .pgvector.config import PgVectorConfig
+
             return PgVectorConfig
 
         if self == DB.PgVectoRS:
             from .pgvecto_rs.config import PgVectoRSConfig
+
             return PgVectoRSConfig
 
         if self == DB.PgVectorScale:
             from .pgvectorscale.config import PgVectorScaleConfig
+
             return PgVectorScaleConfig
 
         if self == DB.PgDiskANN:
             from .pgdiskann.config import PgDiskANNConfig
+
             return PgDiskANNConfig
 
         if self == DB.Redis:
             from .redis.config import RedisConfig
+
             return RedisConfig
-
+
         if self == DB.MemoryDB:
             from .memorydb.config import MemoryDBConfig
+
             return MemoryDBConfig
 
         if self == DB.Chroma:
             from .chroma.config import ChromaConfig
+
             return ChromaConfig
 
         if self == DB.AWSOpenSearch:
             from .aws_opensearch.config import AWSOpenSearchConfig
+
             return AWSOpenSearchConfig
-
+
         if self == DB.AlloyDB:
             from .alloydb.config import AlloyDBConfig
+
             return AlloyDBConfig
 
-    def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
+        if self == DB.AliyunElasticsearch:
+            from .aliyun_elasticsearch.config import AliyunElasticsearchConfig
+
+            return AliyunElasticsearchConfig
+
+        if self == DB.AliyunOpenSearch:
+            from .aliyun_opensearch.config import AliyunOpenSearchConfig
+
+            return AliyunOpenSearchConfig
+
+        msg = f"Unknown DB: {self.name}"
+        raise ValueError(msg)
+
+    def case_config_cls(  # noqa: PLR0911
+        self,
+        index_type: IndexType | None = None,
+    ) -> type[DBCaseConfig]:
         if self == DB.Milvus:
             from .milvus.config import _milvus_case_config
+
             return _milvus_case_config.get(index_type)
 
         if self == DB.ZillizCloud:
             from .zilliz_cloud.config import AutoIndexConfig
+
             return AutoIndexConfig
 
         if self == DB.ElasticCloud:
             from .elastic_cloud.config import ElasticCloudIndexConfig
+
             return ElasticCloudIndexConfig
 
         if self == DB.QdrantCloud:
             from .qdrant_cloud.config import QdrantIndexConfig
+
             return QdrantIndexConfig
 
         if self == DB.WeaviateCloud:
             from .weaviate_cloud.config import WeaviateIndexConfig
+
             return WeaviateIndexConfig
 
         if self == DB.PgVector:
             from .pgvector.config import _pgvector_case_config
+
             return _pgvector_case_config.get(index_type)
 
         if self == DB.PgVectoRS:
             from .pgvecto_rs.config import _pgvecto_rs_case_config
+
             return _pgvecto_rs_case_config.get(index_type)
 
         if self == DB.AWSOpenSearch:
             from .aws_opensearch.config import AWSOpenSearchIndexConfig
+
             return AWSOpenSearchIndexConfig
 
         if self == DB.PgVectorScale:
             from .pgvectorscale.config import _pgvectorscale_case_config
+
             return _pgvectorscale_case_config.get(index_type)
 
         if self == DB.PgDiskANN:
             from .pgdiskann.config import _pgdiskann_case_config
+
             return _pgdiskann_case_config.get(index_type)
-
+
         if self == DB.AlloyDB:
             from .alloydb.config import _alloydb_case_config
+
             return _alloydb_case_config.get(index_type)
 
+        if self == DB.AliyunElasticsearch:
+            from .elastic_cloud.config import ElasticCloudIndexConfig
+
+            return ElasticCloudIndexConfig
+
+        if self == DB.AliyunOpenSearch:
+            from .aliyun_opensearch.config import AliyunOpenSearchIndexConfig
+
+            return AliyunOpenSearchIndexConfig
+
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig
 
 
 __all__ = [
-    "DB", "VectorDB", "DBConfig", "DBCaseConfig", "IndexType", "MetricType", "EmptyDBCaseConfig",
+    "DB",
+    "DBCaseConfig",
+    "DBConfig",
+    "EmptyDBCaseConfig",
+    "IndexType",
+    "MetricType",
+    "VectorDB",
 ]
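Both factory properties now end with an explicit `raise ValueError` for unmapped members instead of implicitly returning `None`. A hedged sketch of how a caller resolves the new Aliyun backends through this enum (attribute names exactly as defined above):

    from vectordb_bench.backend.clients import DB

    db = DB.AliyunOpenSearch
    client_cls = db.init_cls             # lazily imports AliyunOpenSearch
    config_cls = db.config_cls           # AliyunOpenSearchConfig
    case_cfg_cls = db.case_config_cls()  # AliyunOpenSearchIndexConfig; index_type is optional

Note that `DB.AliyunElasticsearch` deliberately reuses `ElasticCloudIndexConfig` as its case config, since the client itself subclasses `ElasticCloud` (see the new file below).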
vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py (new file)
@@ -0,0 +1,26 @@
+from ..elastic_cloud.config import ElasticCloudIndexConfig
+from ..elastic_cloud.elastic_cloud import ElasticCloud
+
+
+class AliyunElasticsearch(ElasticCloud):
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: ElasticCloudIndexConfig,
+        indice: str = "vdb_bench_indice",  # must be lowercase
+        id_col_name: str = "id",
+        vector_col_name: str = "vector",
+        drop_old: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            dim=dim,
+            db_config=db_config,
+            db_case_config=db_case_config,
+            indice=indice,
+            id_col_name=id_col_name,
+            vector_col_name=vector_col_name,
+            drop_old=drop_old,
+            **kwargs,
+        )
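`AliyunElasticsearch` adds no behavior of its own; it exists so the Aliyun service gets its own `DB` enum entry and config class while delegating all index and query logic to `ElasticCloud`. A hypothetical instantiation (placeholder values; `case_cfg` stands for whatever `ElasticCloudIndexConfig` the case supplies):

    es = AliyunElasticsearch(
        dim=768,
        db_config=cfg.to_dict(),    # cfg: an AliyunElasticsearchConfig, defined below
        db_case_config=case_cfg,
        indice="vdb_bench_indice",  # must stay lowercase, per the comment above
        drop_old=True,
    )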
vectordb_bench/backend/clients/aliyun_elasticsearch/config.py (new file)
@@ -0,0 +1,18 @@
+from pydantic import BaseModel, SecretStr
+
+from ..api import DBConfig
+
+
+class AliyunElasticsearchConfig(DBConfig, BaseModel):
+    #: Protocol in use to connect to the node
+    scheme: str = "http"
+    host: str = ""
+    port: int = 9200
+    user: str = "elastic"
+    password: SecretStr
+
+    def to_dict(self) -> dict:
+        return {
+            "hosts": [{"scheme": self.scheme, "host": self.host, "port": self.port}],
+            "basic_auth": (self.user, self.password.get_secret_value()),
+        }
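`to_dict()` yields the keyword arguments the Elasticsearch client constructor expects, and `SecretStr` keeps the password out of reprs and logs until `get_secret_value()` is called. A small sketch of the output (placeholder values; assumes `DBConfig` requires no additional fields):

    from pydantic import SecretStr

    cfg = AliyunElasticsearchConfig(host="10.0.0.1", password=SecretStr("placeholder"))
    cfg.to_dict()
    # {"hosts": [{"scheme": "http", "host": "10.0.0.1", "port": 9200}],
    #  "basic_auth": ("elastic", "placeholder")}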
vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py (new file)
@@ -0,0 +1,345 @@
+import json
+import logging
+import time
+from contextlib import contextmanager
+
+from alibabacloud_ha3engine_vector import client, models
+from alibabacloud_ha3engine_vector.models import QueryRequest
+from alibabacloud_searchengine20211025 import models as searchengine_models
+from alibabacloud_searchengine20211025.client import Client as searchengineClient
+from alibabacloud_tea_openapi import models as open_api_models
+
+from ..api import MetricType, VectorDB
+from .config import AliyunOpenSearchIndexConfig
+
+log = logging.getLogger(__name__)
+
+ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH = 2 * 1024 * 1024  # 2MB
+ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH = 100
+
+
+class AliyunOpenSearch(VectorDB):
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: AliyunOpenSearchIndexConfig,
+        collection_name: str = "VectorDBBenchCollection",
+        drop_old: bool = False,
+        **kwargs,
+    ):
+        self.control_client = None
+        self.dim = dim
+        self.db_config = db_config
+        self.case_config = db_case_config
+        self.collection_name = collection_name
+        self.instance_id = db_config["host"].split(".")[0].replace("http://", "").replace("https://", "")
+
+        self._primary_field = "id"
+        self._scalar_field = "int_id"
+        self._vector_field = "vector"
+        self._index_name = "vector_idx"
+
+        self.batch_size = int(
+            min(
+                ALIYUN_OPENSEARCH_MAX_SIZE_PER_BATCH / (dim * 25),
+                ALIYUN_OPENSEARCH_MAX_NUM_PER_BATCH,
+            ),
+        )
+
+        log.info(f"Aliyun_OpenSearch client config: {self.db_config}")
+        control_config = open_api_models.Config(
+            access_key_id=self.db_config["ak"],
+            access_key_secret=self.db_config["sk"],
+            endpoint=self.db_config["control_host"],
+        )
+        self.control_client = searchengineClient(control_config)
+
+        if drop_old:
+            log.info(f"aliyun_OpenSearch client drop old index: {self.collection_name}")
+            if self._index_exists(self.control_client):
+                self._modify_index(self.control_client)
+            else:
+                self._create_index(self.control_client)
+
+    def _create_index(self, client: searchengineClient):
+        create_table_request = searchengine_models.CreateTableRequest()
+        create_table_request.name = self.collection_name
+        create_table_request.primary_key = self._primary_field
+        create_table_request.partition_count = 1
+        create_table_request.field_schema = {
+            self._primary_field: "INT64",
+            self._vector_field: "MULTI_FLOAT",
+            self._scalar_field: "INT64",
+        }
+        vector_index = searchengine_models.ModifyTableRequestVectorIndex()
+        vector_index.index_name = self._index_name
+        vector_index.dimension = self.dim
+        vector_index.distance_type = self.case_config.distance_type()
+        vector_index.vector_field = self._vector_field
+        vector_index.vector_index_type = "HNSW"
+
+        advance_params = searchengine_models.ModifyTableRequestVectorIndexAdvanceParams()
+        str_max_neighbor_count = f'"proxima.hnsw.builder.max_neighbor_count":{self.case_config.M}'
+        str_efc = f'"proxima.hnsw.builder.efconstruction":{self.case_config.ef_construction}'
+        str_enable_adsampling = '"proxima.hnsw.builder.enable_adsampling":true'
+        str_slack_pruning_factor = '"proxima.hnsw.builder.slack_pruning_factor":1.1'
+        str_thread_count = '"proxima.hnsw.builder.thread_count":16'
+
+        params = ",".join(
+            [
+                str_max_neighbor_count,
+                str_efc,
+                str_enable_adsampling,
+                str_slack_pruning_factor,
+                str_thread_count,
+            ],
+        )
+        advance_params.build_index_params = params
+        advance_params.search_index_params = (
+            '{"proxima.hnsw.searcher.ef":400,"proxima.hnsw.searcher.dynamic_termination.prob_threshold":0.7}'
+        )
+        vector_index.advance_params = advance_params
+        create_table_request.vector_index = [vector_index]
+
+        try:
+            response = client.create_table(self.instance_id, create_table_request)
+            log.info(f"create table success: {response.body}")
+        except Exception as error:
+            log.info(error.message)
+            log.info(error.data.get("Recommend"))
+            log.info(f"Failed to create index: error: {error!s}")
+            raise error from None
+
+        # check if index create success
+        self._active_index(client)
+
+    # check if index create success
+    def _active_index(self, client: searchengineClient) -> None:
+        retry_times = 0
+        while True:
+            time.sleep(10)
+            log.info(f"begin to {retry_times} times get table")
+            retry_times += 1
+            response = client.get_table(self.instance_id, self.collection_name)
+            if response.body.result.status == "IN_USE":
+                log.info(f"{self.collection_name} table begin to use.")
+                return
+
+    def _index_exists(self, client: searchengineClient) -> bool:
+        try:
+            client.get_table(self.instance_id, self.collection_name)
+        except Exception as err:
+            log.warning(f"get table from searchengine error, err={err}")
+            return False
+        else:
+            return True
+
+    # check that the index build succeeded; embeddings are inserted only after the build succeeds
+    def _index_build_success(self, client: searchengineClient) -> None:
+        log.info("begin to check if table build success.")
+        time.sleep(50)
+
+        retry_times = 0
+        while True:
+            time.sleep(10)
+            log.info(f"begin to {retry_times} times get table fsm")
+            retry_times += 1
+            request = searchengine_models.ListTasksRequest()
+            request.start = (int(time.time()) - 3600) * 1000
+            request.end = int(time.time()) * 1000
+            response = client.list_tasks(self.instance_id, request)
+            fsms = response.body.result
+            cur_fsm = None
+            for fsm in fsms:
+                if fsm["type"] != "datasource_flow_fsm":
+                    continue
+                if self.collection_name not in fsm["fsmId"]:
+                    continue
+                cur_fsm = fsm
+                break
+            if cur_fsm is None:
+                log.warning("no build index fsm")
+                return
+            if cur_fsm["status"] == "success":
+                return
+
+    def _modify_index(self, client: searchengineClient) -> None:
+        # check if index create success
+        self._active_index(client)
+
+        modify_table_request = searchengine_models.ModifyTableRequest()
+        modify_table_request.partition_count = 1
+        modify_table_request.primary_key = self._primary_field
+        modify_table_request.field_schema = {
+            self._primary_field: "INT64",
+            self._vector_field: "MULTI_FLOAT",
+            self._scalar_field: "INT64",
+        }
+        vector_index = searchengine_models.ModifyTableRequestVectorIndex()
+        vector_index.index_name = self._index_name
+        vector_index.dimension = self.dim
+        vector_index.distance_type = self.case_config.distance_type()
+        vector_index.vector_field = self._vector_field
+        vector_index.vector_index_type = "HNSW"
+        advance_params = searchengine_models.ModifyTableRequestVectorIndexAdvanceParams()
+
+        str_max_neighbor_count = f'"proxima.hnsw.builder.max_neighbor_count":{self.case_config.M}'
+        str_efc = f'"proxima.hnsw.builder.efconstruction":{self.case_config.ef_construction}'
+        str_enable_adsampling = '"proxima.hnsw.builder.enable_adsampling":true'
+        str_slack_pruning_factor = '"proxima.hnsw.builder.slack_pruning_factor":1.1'
+        str_thread_count = '"proxima.hnsw.builder.thread_count":16'
+
+        params = ",".join(
+            [
+                str_max_neighbor_count,
+                str_efc,
+                str_enable_adsampling,
+                str_slack_pruning_factor,
+                str_thread_count,
+            ],
+        )
+        advance_params.build_index_params = params
+        advance_params.search_index_params = (
+            '{"proxima.hnsw.searcher.ef":400,"proxima.hnsw.searcher.dynamic_termination.prob_threshold":0.7}'
+        )
+        vector_index.advance_params = advance_params
+
+        modify_table_request.vector_index = [vector_index]
+
+        try:
+            response = client.modify_table(
+                self.instance_id,
+                self.collection_name,
+                modify_table_request,
+            )
+            log.info(f"modify table success: {response.body}")
+        except Exception as error:
+            log.info(error.message)
+            log.info(error.data.get("Recommend"))
+            log.info(f"Failed to modify index: error: {error!s}")
+            raise error from None
+
+        # check if modify index & delete data fsm success
+        self._index_build_success(client)
+
+    # get collection records total count
+    def _get_total_count(self):
+        try:
+            response = self.client.stats(self.collection_name)
+        except Exception as e:
+            log.warning(f"Error querying index: {e}")
+        else:
+            body = json.loads(response.body)
+            log.info(f"stats info: {response.body}")
+
+            if "result" in body and "totalDocCount" in body.get("result"):
+                return body.get("result").get("totalDocCount")
+            return 0
+
+    @contextmanager
+    def init(self) -> None:
+        """connect to aliyun opensearch"""
+        config = models.Config(
+            endpoint=self.db_config["host"],
+            protocol="http",
+            access_user_name=self.db_config["user"],
+            access_pass_word=self.db_config["password"],
+        )
+
+        self.client = client.Client(config)
+
+        yield
+        self.client = None
+        del self.client
+
+    def insert_embeddings(
+        self,
+        embeddings: list[list[float]],
+        metadata: list[int],
+        **kwargs,
+    ) -> tuple[int, Exception]:
+        """Insert the embeddings to the opensearch."""
+        assert self.client is not None, "should self.init() first"
+        assert len(embeddings) == len(metadata)
+        insert_count = 0
+
+        try:
+            for batch_start_offset in range(0, len(embeddings), self.batch_size):
+                batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
+                documents = []
+                for i in range(batch_start_offset, batch_end_offset):
+                    document_fields = {
+                        self._primary_field: metadata[i],
+                        self._vector_field: embeddings[i],
+                        self._scalar_field: metadata[i],
+                        "ops_build_channel": "inc",
+                    }
+                    document = {"fields": document_fields, "cmd": "add"}
+                    documents.append(document)
+
+                push_doc_req = models.PushDocumentsRequest({}, documents)
+                self.client.push_documents(
+                    self.collection_name,
+                    self._primary_field,
+                    push_doc_req,
+                )
+                insert_count += batch_end_offset - batch_start_offset
+        except Exception as e:
+            log.info(f"Failed to insert data: {e}")
+            return (insert_count, e)
+        return (insert_count, None)
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+    ) -> list[int]:
+        assert self.client is not None, "should self.init() first"
+        search_params = '{"proxima.hnsw.searcher.ef":' + str(self.case_config.ef_search) + "}"
+
+        os_filter = f"{self._scalar_field} {filters.get('metadata')}" if filters else ""
+
+        try:
+            request = QueryRequest(
+                table_name=self.collection_name,
+                vector=query,
+                top_k=k,
+                search_params=search_params,
+                filter=os_filter,
+            )
+            result = self.client.query(request)
+        except Exception as e:
+            log.info(f"Error querying index: {e}")
+            raise e from e
+        else:
+            res = json.loads(result.body)
+            return [one_res["id"] for one_res in res["result"]]
+
+    def need_normalize_cosine(self) -> bool:
+        """Whether this database needs to normalize the dataset to support COSINE"""
+        if self.case_config.metric_type == MetricType.COSINE:
+            log.info("cosine dataset need normalize.")
+            return True
+
+        return False
+
+    def optimize(self):
+        pass
+
+    def optimize_with_size(self, data_size: int):
+        log.info(f"optimize count: {data_size}")
+        retry_times = 0
+        while True:
+            time.sleep(10)
+            log.info(f"begin to {retry_times} times get optimize table")
+            retry_times += 1
+            total_count = self._get_total_count()
+            # check if the data is inserted
+            if total_count == data_size:
+                log.info("optimize table finish.")
+                return
+
+    def ready_to_load(self):
+        """ready_to_load will be called before load in load cases."""