vectordb-bench 0.0.29__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -27
- vectordb_bench/backend/assembler.py +19 -6
- vectordb_bench/backend/cases.py +186 -23
- vectordb_bench/backend/clients/__init__.py +32 -0
- vectordb_bench/backend/clients/api.py +22 -1
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +249 -43
- vectordb_bench/backend/clients/aws_opensearch/cli.py +51 -21
- vectordb_bench/backend/clients/aws_opensearch/config.py +58 -16
- vectordb_bench/backend/clients/chroma/chroma.py +6 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +19 -1
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +133 -45
- vectordb_bench/backend/clients/lancedb/cli.py +62 -8
- vectordb_bench/backend/clients/lancedb/config.py +14 -1
- vectordb_bench/backend/clients/lancedb/lancedb.py +21 -9
- vectordb_bench/backend/clients/memorydb/memorydb.py +2 -2
- vectordb_bench/backend/clients/milvus/cli.py +30 -9
- vectordb_bench/backend/clients/milvus/config.py +3 -0
- vectordb_bench/backend/clients/milvus/milvus.py +81 -23
- vectordb_bench/backend/clients/oceanbase/cli.py +100 -0
- vectordb_bench/backend/clients/oceanbase/config.py +125 -0
- vectordb_bench/backend/clients/oceanbase/oceanbase.py +215 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +39 -25
- vectordb_bench/backend/clients/qdrant_cloud/config.py +59 -3
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +100 -33
- vectordb_bench/backend/clients/qdrant_local/cli.py +60 -0
- vectordb_bench/backend/clients/qdrant_local/config.py +47 -0
- vectordb_bench/backend/clients/qdrant_local/qdrant_local.py +232 -0
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +29 -3
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -0
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +5 -0
- vectordb_bench/backend/dataset.py +143 -27
- vectordb_bench/backend/filter.py +76 -0
- vectordb_bench/backend/runner/__init__.py +3 -3
- vectordb_bench/backend/runner/mp_runner.py +52 -39
- vectordb_bench/backend/runner/rate_runner.py +68 -52
- vectordb_bench/backend/runner/read_write_runner.py +125 -68
- vectordb_bench/backend/runner/serial_runner.py +56 -23
- vectordb_bench/backend/task_runner.py +48 -20
- vectordb_bench/cli/batch_cli.py +121 -0
- vectordb_bench/cli/cli.py +59 -1
- vectordb_bench/cli/vectordbbench.py +7 -0
- vectordb_bench/config-files/batch_sample_config.yml +17 -0
- vectordb_bench/frontend/components/check_results/data.py +16 -11
- vectordb_bench/frontend/components/check_results/filters.py +53 -25
- vectordb_bench/frontend/components/check_results/headerIcon.py +16 -13
- vectordb_bench/frontend/components/check_results/nav.py +20 -0
- vectordb_bench/frontend/components/custom/displayCustomCase.py +43 -8
- vectordb_bench/frontend/components/custom/displaypPrams.py +10 -5
- vectordb_bench/frontend/components/custom/getCustomConfig.py +10 -0
- vectordb_bench/frontend/components/label_filter/charts.py +60 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +48 -52
- vectordb_bench/frontend/components/run_test/dbSelector.py +9 -5
- vectordb_bench/frontend/components/run_test/inputWidget.py +48 -0
- vectordb_bench/frontend/components/run_test/submitTask.py +3 -1
- vectordb_bench/frontend/components/streaming/charts.py +253 -0
- vectordb_bench/frontend/components/streaming/data.py +62 -0
- vectordb_bench/frontend/components/tables/data.py +1 -1
- vectordb_bench/frontend/components/welcome/explainPrams.py +66 -0
- vectordb_bench/frontend/components/welcome/pagestyle.py +106 -0
- vectordb_bench/frontend/components/welcome/welcomePrams.py +147 -0
- vectordb_bench/frontend/config/dbCaseConfigs.py +420 -41
- vectordb_bench/frontend/config/styles.py +32 -2
- vectordb_bench/frontend/pages/concurrent.py +5 -1
- vectordb_bench/frontend/pages/custom.py +4 -0
- vectordb_bench/frontend/pages/label_filter.py +56 -0
- vectordb_bench/frontend/pages/quries_per_dollar.py +5 -1
- vectordb_bench/frontend/pages/results.py +60 -0
- vectordb_bench/frontend/pages/run_test.py +3 -3
- vectordb_bench/frontend/pages/streaming.py +135 -0
- vectordb_bench/frontend/pages/tables.py +4 -0
- vectordb_bench/frontend/vdb_benchmark.py +16 -41
- vectordb_bench/interface.py +6 -2
- vectordb_bench/metric.py +15 -1
- vectordb_bench/models.py +38 -11
- vectordb_bench/results/ElasticCloud/result_20250318_standard_elasticcloud.json +5890 -0
- vectordb_bench/results/Milvus/result_20250509_standard_milvus.json +6138 -0
- vectordb_bench/results/OpenSearch/result_20250224_standard_opensearch.json +7319 -0
- vectordb_bench/results/Pinecone/result_20250124_standard_pinecone.json +2365 -0
- vectordb_bench/results/QdrantCloud/result_20250602_standard_qdrantcloud.json +3556 -0
- vectordb_bench/results/ZillizCloud/result_20250613_standard_zillizcloud.json +6290 -0
- vectordb_bench/results/dbPrices.json +12 -4
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/METADATA +131 -32
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/RECORD +87 -65
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/WHEEL +1 -1
- vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -791
- vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -679
- vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -1352
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/dataset.py

```diff
@@ -20,6 +20,7 @@ from vectordb_bench.base import BaseModel
 from . import utils
 from .clients import MetricType
 from .data_source import DatasetReader, DatasetSource
+from .filter import Filter, FilterOp, non_filter
 
 log = logging.getLogger(__name__)
 
@@ -39,6 +40,21 @@ class BaseDataset(BaseModel):
     with_gt: bool = False
     _size_label: dict[int, SizeLabel] = PrivateAttr()
     is_custom: bool = False
+    with_remote_resource: bool = True
+    # for label filter cases
+    with_scalar_labels: bool = False
+    # if True, scalar_labels will be retrieved from a separate parquet file;
+    # otherwise, they will be obtained from train.parquet.
+    scalar_labels_file_separated: bool = True
+    scalar_labels_file: str = "scalar_labels.parquet"
+    scalar_label_percentages: list[float] = []
+    train_id_field: str = "id"
+    train_vector_field: str = "emb"
+    test_file: str = "test.parquet"
+    test_id_field: str = "id"
+    test_vector_field: str = "emb"
+    gt_id_field: str = "id"
+    gt_neighbors_field: str = "neighbors_id"
 
     @validator("size")
     def verify_size(cls, v: int):
@@ -51,6 +67,10 @@ class BaseDataset(BaseModel):
     def label(self) -> str:
         return self._size_label.get(self.size).label
 
+    @property
+    def full_name(self) -> str:
+        return f"{self.name.capitalize()} ({self.label.capitalize()})"
+
     @property
     def dir_name(self) -> str:
         return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
@@ -59,11 +79,27 @@ class BaseDataset(BaseModel):
     def file_count(self) -> int:
         return self._size_label.get(self.size).file_count
 
+    @property
+    def train_files(self) -> list[str]:
+        return utils.compose_train_files(self.file_count, self.use_shuffled)
+
 
 class CustomDataset(BaseDataset):
     dir: str
     file_num: int
     is_custom: bool = True
+    with_remote_resource: bool = False
+    train_file: str = "train"
+    train_id_field: str = "id"
+    train_vector_field: str = "emb"
+    test_file: str = "test.parquet"
+    gt_file: str = "neighbors.parquet"
+    test_vector_field: str = "emb"
+    gt_neighbors_field: str = "neighbors_id"
+    with_scalar_labels: bool = True
+    scalar_labels_file_separated: bool = True
+    scalar_labels_file: str = "scalar_labels.parquet"
+    label_percentages: list[float] = []
 
     @validator("size")
     def verify_size(cls, v: int):
@@ -81,6 +117,17 @@ class CustomDataset(BaseDataset):
     def file_count(self) -> int:
         return self.file_num
 
+    @property
+    def train_files(self) -> list[str]:
+        train_file = self.train_file
+        prefix = f"{train_file}"
+        train_files = []
+        prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
+        for i in range(len(prefix_s)):
+            sub_file = f"{prefix_s[i]}.parquet"
+            train_files.append(sub_file)
+        return train_files
+
 
 class LAION(BaseDataset):
     name: str = "LAION"
@@ -109,12 +156,28 @@ class Cohere(BaseDataset):
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool =
+    with_gt: bool = True
     _size_label: dict = {
         100_000: SizeLabel(100_000, "SMALL", 1),
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
     }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+
+
+class Bioasq(BaseDataset):
+    name: str = "Bioasq"
+    dim: int = 1024
+    metric_type: MetricType = MetricType.COSINE
+    use_shuffled: bool = config.USE_SHUFFLED_DATA
+    with_gt: bool = True
+    _size_label: dict = {
+        1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
+        10_000_000: SizeLabel(10_000_000, "LARGE", 10),
+    }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
 
 
 class Glove(BaseDataset):
@@ -146,12 +209,14 @@ class OpenAI(BaseDataset):
     dim: int = 1536
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool =
+    with_gt: bool = True
     _size_label: dict = {
         50_000: SizeLabel(50_000, "SMALL", 1),
         500_000: SizeLabel(500_000, "MEDIUM", 1),
         5_000_000: SizeLabel(5_000_000, "LARGE", 10),
     }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
 
 
 class DatasetManager(BaseModel):
@@ -166,8 +231,9 @@ class DatasetManager(BaseModel):
     """
 
     data: BaseDataset
-    test_data:
-    gt_data:
+    test_data: list[list[float]] | None = None
+    gt_data: list[list[int]] | None = None
+    scalar_labels: pl.DataFrame | None = None
     train_files: list[str] = []
     reader: DatasetReader | None = None
 
@@ -191,7 +257,7 @@ class DatasetManager(BaseModel):
         return pathlib.Path(
             config.DATASET_LOCAL_DIR,
             self.data.name.lower(),
-            self.data.dir_name
+            self.data.dir_name,
         )
 
     def __iter__(self):
@@ -201,58 +267,59 @@ class DatasetManager(BaseModel):
     def prepare(
         self,
         source: DatasetSource = DatasetSource.S3,
-        filters:
+        filters: Filter = non_filter,
     ) -> bool:
         """Download the dataset from DatasetSource
         url = f"{source}/{self.data.dir_name}"
 
         Args:
             source(DatasetSource): S3 or AliyunOSS, default as S3
-            filters(
+            filters(Filter): combined with dataset's with_gt to
                 compose the correct ground_truth file
 
         Returns:
             bool: whether the dataset is successfully prepared
 
         """
-
-
-        train_files = utils.compose_train_files(file_count, use_shuffled)
-        all_files = train_files
-
-        test_file = "test.parquet"
-        all_files.extend([test_file])
-        gt_file = None
+        self.train_files = self.data.train_files
+        gt_file, test_file = None, None
         if self.data.with_gt:
-            gt_file =
-            all_files.extend([gt_file])
+            gt_file, test_file = filters.groundtruth_file, self.data.test_file
 
-        if
+        if self.data.with_remote_resource:
+            download_files = [file for file in self.train_files]
+            download_files.extend([gt_file, test_file])
+            if self.data.with_scalar_labels and self.data.scalar_labels_file_separated:
+                download_files.append(self.data.scalar_labels_file)
             source.reader().read(
                 dataset=self.data.dir_name.lower(),
-                files=
+                files=download_files,
                 local_ds_root=self.data_dir,
             )
 
-
-
+        # read scalar_labels_file if separated
+        if (
+            filters.type == FilterOp.StrEqual
+            and self.data.with_scalar_labels
+            and self.data.scalar_labels_file_separated
+        ):
+            self.scalar_labels = self._read_file(self.data.scalar_labels_file)
 
-        if gt_file is not None:
-            self.
+        if gt_file is not None and test_file is not None:
+            self.test_data = self._read_file(test_file)[self.data.test_vector_field].to_list()
+            self.gt_data = self._read_file(gt_file)[self.data.gt_neighbors_field].to_list()
 
-        prefix = "shuffle_train" if use_shuffled else "train"
-        self.train_files = sorted([f.name for f in self.data_dir.glob(f"{prefix}*.parquet")])
         log.debug(f"{self.data.name}: available train files {self.train_files}")
 
         return True
 
-    def _read_file(self, file_name: str) ->
+    def _read_file(self, file_name: str) -> pl.DataFrame:
         """read one file from disk into memory"""
         log.info(f"Read the entire file into memory: {file_name}")
         p = pathlib.Path(self.data_dir, file_name)
         if not p.exists():
             log.warning(f"No such file: {p}")
-            return
+            return pl.DataFrame()
 
         return pl.read_parquet(p)
 
@@ -308,6 +375,7 @@ class Dataset(Enum):
     LAION = LAION
     GIST = GIST
     COHERE = Cohere
+    BIOASQ = Bioasq
     GLOVE = Glove
     SIFT = SIFT
     OPENAI = OpenAI
@@ -317,3 +385,51 @@ class Dataset(Enum):
 
     def manager(self, size: int) -> DatasetManager:
        return DatasetManager(data=self.get(size))
+
+
+class DatasetWithSizeType(Enum):
+    CohereSmall = "Small Cohere (768dim, 100K)"
+    CohereMedium = "Medium Cohere (768dim, 1M)"
+    CohereLarge = "Large Cohere (768dim, 10M)"
+    BioasqMedium = "Medium Bioasq (1024dim, 1M)"
+    BioasqLarge = "Large Bioasq (1024dim, 10M)"
+    OpenAISmall = "Small OpenAI (1536dim, 50K)"
+    OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
+    OpenAILarge = "Large OpenAI (1536dim, 5M)"
+
+    def get_manager(self) -> DatasetManager:
+        if self not in DatasetWithSizeMap:
+            msg = f"wrong ScalarDatasetWithSizeType: {self.name}"
+            raise ValueError(msg)
+        return DatasetWithSizeMap.get(self)
+
+    def get_load_timeout(self) -> float:
+        if "small" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_100K
+        if "medium" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_1M
+        if "large" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_10M
+        msg = f"No load_timeout for {self.value}"
+        raise KeyError(msg)
+
+    def get_optimize_timeout(self) -> float:
+        if "small" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_100K
+        if "medium" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_1M
+        if "large" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_10M
+        return config.OPTIMIZE_TIMEOUT_DEFAULT
+
+
+DatasetWithSizeMap = {
+    DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
+    DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
+    DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
+    DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
+    DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
+    DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
+    DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),
+    DatasetWithSizeType.OpenAILarge: Dataset.OPENAI.manager(5_000_000),
+}
```
vectordb_bench/backend/filter.py (new file)

```diff
@@ -0,0 +1,76 @@
+from enum import StrEnum
+
+from ..base import BaseModel
+
+
+class FilterOp(StrEnum):
+    NumGE = "NumGE"  # test ">="
+    StrEqual = "Label"  # test "=="
+    NonFilter = "NonFilter"
+
+
+class Filter(BaseModel):
+    type: FilterOp
+    filter_rate: float = 0.0
+
+    @property
+    def groundtruth_file(self) -> str:
+        raise NotImplementedError
+
+
+class NonFilter(Filter):
+    type: FilterOp = FilterOp.NonFilter
+    filter_rate: float = 0.0
+    gt_file_name: str = "neighbors.parquet"
+
+    @property
+    def groundtruth_file(self) -> str:
+        return self.gt_file_name
+
+
+non_filter = NonFilter()
+
+
+class IntFilter(Filter):
+    """
+    compatible with older int-filter cases
+    filter expr: int_field >= int_value (dataset_size * filter_rate)
+    """
+
+    type: FilterOp = FilterOp.NumGE
+    int_field: str = "id"
+    int_value: int
+
+    @property
+    def groundtruth_file(self) -> str:
+        if self.filter_rate == 0.01:
+            return "neighbors_head_1p.parquet"
+        if self.filter_rate == 0.99:
+            return "neighbors_tail_1p.parquet"
+        msg = f"Not Support Int Filter - {self.filter_rate}"
+        raise RuntimeError(msg)
+
+
+class LabelFilter(Filter):
+    """
+    filter expr: label_field == label_value, like `color == "red"`
+    """
+
+    type: FilterOp = FilterOp.StrEqual
+    label_field: str = "labels"
+    label_percentage: float
+
+    @property
+    def label_value(self) -> str:
+        p = self.label_percentage * 100
+        if p >= 1:
+            return f"label_{int(p)}p"  # such as 5p, 20p, 1p, ...
+        return f"label_{p:.1f}p"  # such as 0.1p, 0.5p, ...
+
+    def __init__(self, label_percentage: float, **kwargs):
+        filter_rate = 1.0 - label_percentage
+        super().__init__(filter_rate=filter_rate, label_percentage=label_percentage, **kwargs)
+
+    @property
+    def groundtruth_file(self) -> str:
+        return f"neighbors_{self.label_field}_{self.label_value}.parquet"
```
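The new filter module is what ties a benchmark case to its precomputed ground-truth file. To make the naming rules concrete, here is a small illustrative snippet; the values in the comments follow directly from the properties defined above (the `int_value` passed to `IntFilter` is just an example).

```python
# Illustration of the file-naming rules encoded in the new filter module.
from vectordb_bench.backend.filter import IntFilter, LabelFilter, non_filter

print(non_filter.groundtruth_file)        # neighbors.parquet

head_filter = IntFilter(filter_rate=0.01, int_value=1_000)
print(head_filter.groundtruth_file)       # neighbors_head_1p.parquet

label_filter = LabelFilter(label_percentage=0.05)
print(label_filter.filter_rate)           # 0.95 (1.0 - label_percentage)
print(label_filter.label_value)           # label_5p
print(label_filter.groundtruth_file)      # neighbors_labels_label_5p.parquet

print(LabelFilter(label_percentage=0.001).label_value)  # label_0.1p
```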
vectordb_bench/backend/runner/__init__.py

```diff
@@ -1,10 +1,10 @@
-from .mp_runner import
-
-)
+from .mp_runner import MultiProcessingSearchRunner
+from .read_write_runner import ReadWriteRunner
 from .serial_runner import SerialInsertRunner, SerialSearchRunner
 
 __all__ = [
     "MultiProcessingSearchRunner",
+    "ReadWriteRunner",
     "SerialInsertRunner",
     "SerialSearchRunner",
 ]
```
vectordb_bench/backend/runner/mp_runner.py

```diff
@@ -9,6 +9,8 @@ from multiprocessing.queues import Queue
 
 import numpy as np
 
+from vectordb_bench.backend.filter import Filter, non_filter
+
 from ... import config
 from ...models import ConcurrencySlotTimeoutError
 from ..clients import api
@@ -31,7 +33,7 @@ class MultiProcessingSearchRunner:
         db: api.VectorDB,
         test_data: list[list[float]],
         k: int = config.K_DEFAULT,
-        filters:
+        filters: Filter = non_filter,
         concurrencies: Iterable[int] = config.NUM_CONCURRENCY,
         duration: int = config.CONCURRENCY_DURATION,
         concurrency_timeout: int = config.CONCURRENCY_TIMEOUT,
@@ -58,6 +60,7 @@ class MultiProcessingSearchRunner:
             cond.wait()
 
         with self.db.init():
+            self.db.prepare_filter(self.filters)
             num, idx = len(test_data), random.randint(0, len(test_data) - 1)
 
             start_time = time.perf_counter()
@@ -66,18 +69,12 @@ class MultiProcessingSearchRunner:
             while time.perf_counter() < start_time + self.duration:
                 s = time.perf_counter()
                 try:
-                    self.db.search_embedding(
-
-
-                        self.filters,
-                    )
+                    self.db.search_embedding(test_data[idx], self.k)
+                    count += 1
+                    latencies.append(time.perf_counter() - s)
                 except Exception as e:
                     log.warning(f"VectorDB search_embedding error: {e}")
-                    traceback.print_exc(chain=True)
-                    raise e from None
 
-                latencies.append(time.perf_counter() - s)
-                count += 1
                 # loop through the test data
                 idx = idx + 1 if idx < num - 1 else 0
 
@@ -181,10 +178,20 @@ class MultiProcessingSearchRunner:
     def stop(self) -> None:
         pass
 
-    def run_by_dur(self, duration: int) -> float:
+    def run_by_dur(self, duration: int) -> tuple[float, float]:
+        """
+        Returns:
+            float: largest qps
+            float: failed rate
+        """
         return self._run_by_dur(duration)
 
-    def _run_by_dur(self, duration: int) -> float:
+    def _run_by_dur(self, duration: int) -> tuple[float, float]:
+        """
+        Returns:
+            float: largest qps
+            float: failed rate
+        """
         max_qps = 0
         try:
             for conc in self.concurrencies:
@@ -208,12 +215,17 @@ class MultiProcessingSearchRunner:
                 log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
 
                 start = time.perf_counter()
-
+                res = [r.result() for r in future_iter]
+                all_success_count = sum([r[0] for r in res])
+                all_failed_count = sum([r[1] for r in res])
+                failed_rate = all_failed_count / (all_failed_count + all_success_count)
                 cost = time.perf_counter() - start
 
-                qps = round(
-                log.info(
-
+                qps = round(all_success_count / cost, 4)
+                log.info(
+                    f"End search in concurrency {conc}: dur={cost}s, failed_rate={failed_rate}, "
+                    f"all_success_count={all_success_count}, all_failed_count={all_failed_count}, qps={qps}",
+                )
                 if qps > max_qps:
                     max_qps = qps
                     log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
@@ -230,52 +242,53 @@ class MultiProcessingSearchRunner:
         finally:
             self.stop()
 
-        return max_qps
+        return max_qps, failed_rate
 
-    def search_by_dur(
-
-
-
-
-    ) -> int:
+    def search_by_dur(self, dur: int, test_data: list[list[float]], q: mp.Queue, cond: mp.Condition) -> tuple[int, int]:
+        """
+        Returns:
+            int: successful requests count
+            int: failed requests count
+        """
         # sync all process
         q.put(1)
         with cond:
             cond.wait()
 
         with self.db.init():
+            self.db.prepare_filter(self.filters)
            num, idx = len(test_data), random.randint(0, len(test_data) - 1)
 
             start_time = time.perf_counter()
-
+            success_count = 0
+            failed_cnt = 0
             while time.perf_counter() < start_time + dur:
                 s = time.perf_counter()
                 try:
-                    self.db.search_embedding(
-
-                        self.k,
-                        self.filters,
-                    )
+                    self.db.search_embedding(test_data[idx], self.k)
+                    success_count += 1
                 except Exception as e:
-
-
-
+                    failed_cnt += 1
+                    # reduce log
+                    if failed_cnt <= 3:
+                        log.warning(f"VectorDB search_embedding error: {e}")
+                    else:
+                        log.debug(f"VectorDB search_embedding error: {e}")
 
-                count += 1
                 # loop through the test data
                 idx = idx + 1 if idx < num - 1 else 0
 
-                if
+                if success_count % 500 == 0:
                     log.debug(
-                        f"({mp.current_process().name:16}) search_count: {
-                        f"latest_latency={time.perf_counter()-s}"
+                        f"({mp.current_process().name:16}) search_count: {success_count}, "
+                        f"latest_latency={time.perf_counter()-s}",
                     )
 
             total_dur = round(time.perf_counter() - start_time, 4)
             log.debug(
                 f"{mp.current_process().name:16} search {self.duration}s: "
-                f"actual_dur={total_dur}s, count={
+                f"actual_dur={total_dur}s, count={success_count}, failed_cnt={failed_cnt}, "
+                f"qps (successful) in this process: {round(success_count / total_dur, 4):3}",
             )
 
-            return
+            return success_count, failed_cnt
```