vectordb-bench 0.0.30__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -27
- vectordb_bench/__main__.py +1 -1
- vectordb_bench/backend/assembler.py +19 -6
- vectordb_bench/backend/cases.py +186 -23
- vectordb_bench/backend/clients/__init__.py +16 -0
- vectordb_bench/backend/clients/api.py +22 -1
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +82 -41
- vectordb_bench/backend/clients/aws_opensearch/config.py +37 -4
- vectordb_bench/backend/clients/chroma/chroma.py +6 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +31 -1
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +133 -45
- vectordb_bench/backend/clients/milvus/config.py +1 -0
- vectordb_bench/backend/clients/milvus/milvus.py +75 -23
- vectordb_bench/backend/clients/oceanbase/cli.py +100 -0
- vectordb_bench/backend/clients/oceanbase/config.py +125 -0
- vectordb_bench/backend/clients/oceanbase/oceanbase.py +215 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +39 -25
- vectordb_bench/backend/clients/qdrant_cloud/config.py +73 -3
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +100 -33
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/dataset.py +146 -27
- vectordb_bench/backend/filter.py +76 -0
- vectordb_bench/backend/runner/__init__.py +3 -3
- vectordb_bench/backend/runner/mp_runner.py +52 -39
- vectordb_bench/backend/runner/rate_runner.py +68 -52
- vectordb_bench/backend/runner/read_write_runner.py +125 -68
- vectordb_bench/backend/runner/serial_runner.py +56 -23
- vectordb_bench/backend/task_runner.py +59 -20
- vectordb_bench/cli/cli.py +59 -1
- vectordb_bench/cli/vectordbbench.py +3 -0
- vectordb_bench/frontend/components/check_results/data.py +16 -11
- vectordb_bench/frontend/components/check_results/filters.py +53 -25
- vectordb_bench/frontend/components/check_results/headerIcon.py +18 -13
- vectordb_bench/frontend/components/check_results/nav.py +20 -0
- vectordb_bench/frontend/components/custom/displayCustomCase.py +43 -8
- vectordb_bench/frontend/components/custom/displaypPrams.py +10 -5
- vectordb_bench/frontend/components/custom/getCustomConfig.py +10 -0
- vectordb_bench/frontend/components/label_filter/charts.py +60 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +48 -52
- vectordb_bench/frontend/components/run_test/dbSelector.py +9 -5
- vectordb_bench/frontend/components/run_test/inputWidget.py +48 -0
- vectordb_bench/frontend/components/run_test/submitTask.py +3 -1
- vectordb_bench/frontend/components/streaming/charts.py +253 -0
- vectordb_bench/frontend/components/streaming/data.py +62 -0
- vectordb_bench/frontend/components/tables/data.py +1 -1
- vectordb_bench/frontend/components/welcome/explainPrams.py +66 -0
- vectordb_bench/frontend/components/welcome/pagestyle.py +106 -0
- vectordb_bench/frontend/components/welcome/welcomePrams.py +147 -0
- vectordb_bench/frontend/config/dbCaseConfigs.py +309 -42
- vectordb_bench/frontend/config/styles.py +34 -4
- vectordb_bench/frontend/pages/concurrent.py +5 -1
- vectordb_bench/frontend/pages/custom.py +4 -0
- vectordb_bench/frontend/pages/label_filter.py +56 -0
- vectordb_bench/frontend/pages/quries_per_dollar.py +5 -1
- vectordb_bench/frontend/{vdb_benchmark.py → pages/results.py} +10 -4
- vectordb_bench/frontend/pages/run_test.py +3 -3
- vectordb_bench/frontend/pages/streaming.py +135 -0
- vectordb_bench/frontend/pages/tables.py +4 -0
- vectordb_bench/frontend/vdbbench.py +31 -0
- vectordb_bench/interface.py +8 -3
- vectordb_bench/metric.py +15 -1
- vectordb_bench/models.py +31 -11
- vectordb_bench/results/ElasticCloud/result_20250318_standard_elasticcloud.json +5890 -0
- vectordb_bench/results/Milvus/result_20250509_standard_milvus.json +6138 -0
- vectordb_bench/results/OpenSearch/result_20250224_standard_opensearch.json +7319 -0
- vectordb_bench/results/Pinecone/result_20250124_standard_pinecone.json +2365 -0
- vectordb_bench/results/QdrantCloud/result_20250602_standard_qdrantcloud.json +3556 -0
- vectordb_bench/results/ZillizCloud/result_20250613_standard_zillizcloud.json +6290 -0
- vectordb_bench/results/dbPrices.json +12 -4
- vectordb_bench/results/getLeaderboardDataV2.py +59 -0
- vectordb_bench/results/leaderboard_v2.json +2662 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/METADATA +93 -40
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/RECORD +77 -58
- vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -791
- vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -679
- vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -1352
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/WHEEL +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/top_level.txt +0 -0
`vectordb_bench/backend/dataset.py`

```diff
@@ -20,6 +20,7 @@ from vectordb_bench.base import BaseModel
 from . import utils
 from .clients import MetricType
 from .data_source import DatasetReader, DatasetSource
+from .filter import Filter, FilterOp, non_filter
 
 log = logging.getLogger(__name__)
 
@@ -39,6 +40,21 @@ class BaseDataset(BaseModel):
     with_gt: bool = False
     _size_label: dict[int, SizeLabel] = PrivateAttr()
     is_custom: bool = False
+    with_remote_resource: bool = True
+    # for label filter cases
+    with_scalar_labels: bool = False
+    # if True, scalar_labels will be retrieved from a separate parquet file;
+    # otherwise, they will be obtained from train.parquet.
+    scalar_labels_file_separated: bool = True
+    scalar_labels_file: str = "scalar_labels.parquet"
+    scalar_label_percentages: list[float] = []
+    train_id_field: str = "id"
+    train_vector_field: str = "emb"
+    test_file: str = "test.parquet"
+    test_id_field: str = "id"
+    test_vector_field: str = "emb"
+    gt_id_field: str = "id"
+    gt_neighbors_field: str = "neighbors_id"
 
     @validator("size")
     def verify_size(cls, v: int):
@@ -51,6 +67,10 @@ class BaseDataset(BaseModel):
     def label(self) -> str:
         return self._size_label.get(self.size).label
 
+    @property
+    def full_name(self) -> str:
+        return f"{self.name.capitalize()} ({self.label.capitalize()})"
+
     @property
     def dir_name(self) -> str:
         return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
@@ -59,11 +79,27 @@ class BaseDataset(BaseModel):
     def file_count(self) -> int:
         return self._size_label.get(self.size).file_count
 
+    @property
+    def train_files(self) -> list[str]:
+        return utils.compose_train_files(self.file_count, self.use_shuffled)
+
 
 class CustomDataset(BaseDataset):
     dir: str
     file_num: int
     is_custom: bool = True
+    with_remote_resource: bool = False
+    train_file: str = "train"
+    train_id_field: str = "id"
+    train_vector_field: str = "emb"
+    test_file: str = "test.parquet"
+    gt_file: str = "neighbors.parquet"
+    test_vector_field: str = "emb"
+    gt_neighbors_field: str = "neighbors_id"
+    with_scalar_labels: bool = True
+    scalar_labels_file_separated: bool = True
+    scalar_labels_file: str = "scalar_labels.parquet"
+    label_percentages: list[float] = []
 
     @validator("size")
     def verify_size(cls, v: int):
@@ -81,6 +117,17 @@ class CustomDataset(BaseDataset):
     def file_count(self) -> int:
         return self.file_num
 
+    @property
+    def train_files(self) -> list[str]:
+        train_file = self.train_file
+        prefix = f"{train_file}"
+        train_files = []
+        prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
+        for i in range(len(prefix_s)):
+            sub_file = f"{prefix_s[i]}.parquet"
+            train_files.append(sub_file)
+        return train_files
+
 
 class LAION(BaseDataset):
     name: str = "LAION"
```
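The new `CustomDataset.train_files` property accepts a comma-separated `train_file` value and expands each prefix into a parquet file name. A stand-alone sketch of that splitting logic (the helper name below is ours, not part of the package):

```python
# Mirrors CustomDataset.train_files: split on commas, drop blanks, append ".parquet".
def train_files_from_prefix(train_file: str) -> list[str]:
    prefixes = [item.strip() for item in train_file.split(",") if item.strip()]
    return [f"{prefix}.parquet" for prefix in prefixes]


print(train_files_from_prefix("train"))
# ['train.parquet']
print(train_files_from_prefix("train_part_0, train_part_1"))
# ['train_part_0.parquet', 'train_part_1.parquet']
```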
`vectordb_bench/backend/dataset.py` (continued)

```diff
@@ -109,12 +156,28 @@ class Cohere(BaseDataset):
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool =
+    with_gt: bool = True
     _size_label: dict = {
         100_000: SizeLabel(100_000, "SMALL", 1),
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
     }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+
+
+class Bioasq(BaseDataset):
+    name: str = "Bioasq"
+    dim: int = 1024
+    metric_type: MetricType = MetricType.COSINE
+    use_shuffled: bool = config.USE_SHUFFLED_DATA
+    with_gt: bool = True
+    _size_label: dict = {
+        1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
+        10_000_000: SizeLabel(10_000_000, "LARGE", 10),
+    }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
 
 
 class Glove(BaseDataset):
@@ -146,12 +209,14 @@ class OpenAI(BaseDataset):
     dim: int = 1536
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool =
+    with_gt: bool = True
     _size_label: dict = {
         50_000: SizeLabel(50_000, "SMALL", 1),
         500_000: SizeLabel(500_000, "MEDIUM", 1),
         5_000_000: SizeLabel(5_000_000, "LARGE", 10),
     }
+    with_scalar_labels: bool = True
+    scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
 
 
 class DatasetManager(BaseModel):
@@ -166,8 +231,9 @@ class DatasetManager(BaseModel):
     """
 
     data: BaseDataset
-    test_data:
-    gt_data:
+    test_data: list[list[float]] | None = None
+    gt_data: list[list[int]] | None = None
+    scalar_labels: pl.DataFrame | None = None
     train_files: list[str] = []
     reader: DatasetReader | None = None
 
@@ -176,6 +242,9 @@ class DatasetManager(BaseModel):
             return self.data.name == obj.data.name and self.data.label == obj.data.label
         return False
 
+    def __hash__(self) -> int:
+        return hash((self.data.name, self.data.label))
+
     def set_reader(self, reader: DatasetReader):
         self.reader = reader
 
@@ -191,7 +260,7 @@ class DatasetManager(BaseModel):
         return pathlib.Path(
             config.DATASET_LOCAL_DIR,
             self.data.name.lower(),
-            self.data.dir_name
+            self.data.dir_name,
         )
 
     def __iter__(self):
```
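With `__eq__` already keyed on `(name, label)` and the new `__hash__` matching it, `DatasetManager` instances can be deduplicated in sets or used as dict keys. A minimal illustration (the dataset choices are arbitrary):

```python
from vectordb_bench.backend.dataset import Dataset

a = Dataset.COHERE.manager(1_000_000)
b = Dataset.COHERE.manager(1_000_000)  # same name and label as `a`
c = Dataset.OPENAI.manager(500_000)

assert a == b and hash(a) == hash(b)
assert len({a, b, c}) == 2  # `a` and `b` collapse to one entry
```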
`vectordb_bench/backend/dataset.py` (continued)

```diff
@@ -201,58 +270,59 @@ class DatasetManager(BaseModel):
     def prepare(
         self,
         source: DatasetSource = DatasetSource.S3,
-        filters:
+        filters: Filter = non_filter,
     ) -> bool:
         """Download the dataset from DatasetSource
         url = f"{source}/{self.data.dir_name}"
 
         Args:
             source(DatasetSource): S3 or AliyunOSS, default as S3
-            filters(
+            filters(Filter): combined with dataset's with_gt to
                 compose the correct ground_truth file
 
         Returns:
             bool: whether the dataset is successfully prepared
 
         """
-
-
-        train_files = utils.compose_train_files(file_count, use_shuffled)
-        all_files = train_files
-
-        test_file = "test.parquet"
-        all_files.extend([test_file])
-        gt_file = None
+        self.train_files = self.data.train_files
+        gt_file, test_file = None, None
         if self.data.with_gt:
-            gt_file =
-            all_files.extend([gt_file])
+            gt_file, test_file = filters.groundtruth_file, self.data.test_file
 
-        if
+        if self.data.with_remote_resource:
+            download_files = [file for file in self.train_files]
+            download_files.extend([gt_file, test_file])
+            if self.data.with_scalar_labels and self.data.scalar_labels_file_separated:
+                download_files.append(self.data.scalar_labels_file)
             source.reader().read(
                 dataset=self.data.dir_name.lower(),
-                files=
+                files=download_files,
                 local_ds_root=self.data_dir,
             )
 
-
-
+        # read scalar_labels_file if separated
+        if (
+            filters.type == FilterOp.StrEqual
+            and self.data.with_scalar_labels
+            and self.data.scalar_labels_file_separated
+        ):
+            self.scalar_labels = self._read_file(self.data.scalar_labels_file)
 
-        if gt_file is not None:
-            self.
+        if gt_file is not None and test_file is not None:
+            self.test_data = self._read_file(test_file)[self.data.test_vector_field].to_list()
+            self.gt_data = self._read_file(gt_file)[self.data.gt_neighbors_field].to_list()
 
-        prefix = "shuffle_train" if use_shuffled else "train"
-        self.train_files = sorted([f.name for f in self.data_dir.glob(f"{prefix}*.parquet")])
         log.debug(f"{self.data.name}: available train files {self.train_files}")
 
         return True
 
-    def _read_file(self, file_name: str) ->
+    def _read_file(self, file_name: str) -> pl.DataFrame:
         """read one file from disk into memory"""
         log.info(f"Read the entire file into memory: {file_name}")
         p = pathlib.Path(self.data_dir, file_name)
         if not p.exists():
             log.warning(f"No such file: {p}")
-            return
+            return pl.DataFrame()
 
         return pl.read_parquet(p)
 
```
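Taken together, `prepare()` now derives the ground-truth file from the `Filter` argument instead of hard-coding `neighbors.parquet`, skips downloading when `with_remote_resource` is false, and reads `scalar_labels.parquet` only for label-filter runs. A hedged usage sketch (the source and label percentage are illustrative):

```python
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.backend.dataset import Dataset
from vectordb_bench.backend.filter import LabelFilter, non_filter

cohere_1m = Dataset.COHERE.manager(1_000_000)

# Non-filter run: downloads the train files, test.parquet and neighbors.parquet.
cohere_1m.prepare(source=DatasetSource.S3, filters=non_filter)

# Label-filter run: the ground truth becomes neighbors_labels_label_1p.parquet,
# and scalar_labels.parquet is loaded into `cohere_1m.scalar_labels`.
cohere_1m.prepare(source=DatasetSource.S3, filters=LabelFilter(label_percentage=0.01))
```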
`vectordb_bench/backend/dataset.py` (continued)

```diff
@@ -308,6 +378,7 @@ class Dataset(Enum):
     LAION = LAION
     GIST = GIST
     COHERE = Cohere
+    BIOASQ = Bioasq
     GLOVE = Glove
     SIFT = SIFT
     OPENAI = OpenAI
@@ -317,3 +388,51 @@ class Dataset(Enum):
 
     def manager(self, size: int) -> DatasetManager:
         return DatasetManager(data=self.get(size))
+
+
+class DatasetWithSizeType(Enum):
+    CohereSmall = "Small Cohere (768dim, 100K)"
+    CohereMedium = "Medium Cohere (768dim, 1M)"
+    CohereLarge = "Large Cohere (768dim, 10M)"
+    BioasqMedium = "Medium Bioasq (1024dim, 1M)"
+    BioasqLarge = "Large Bioasq (1024dim, 10M)"
+    OpenAISmall = "Small OpenAI (1536dim, 50K)"
+    OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
+    OpenAILarge = "Large OpenAI (1536dim, 5M)"
+
+    def get_manager(self) -> DatasetManager:
+        if self not in DatasetWithSizeMap:
+            msg = f"wrong ScalarDatasetWithSizeType: {self.name}"
+            raise ValueError(msg)
+        return DatasetWithSizeMap.get(self)
+
+    def get_load_timeout(self) -> float:
+        if "small" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_100K
+        if "medium" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_1M
+        if "large" in self.value.lower():
+            return config.LOAD_TIMEOUT_768D_10M
+        msg = f"No load_timeout for {self.value}"
+        raise KeyError(msg)
+
+    def get_optimize_timeout(self) -> float:
+        if "small" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_100K
+        if "medium" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_1M
+        if "large" in self.value.lower():
+            return config.OPTIMIZE_TIMEOUT_768D_10M
+        return config.OPTIMIZE_TIMEOUT_DEFAULT
+
+
+DatasetWithSizeMap = {
+    DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
+    DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
+    DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
+    DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
+    DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
+    DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
+    DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),
+    DatasetWithSizeType.OpenAILarge: Dataset.OPENAI.manager(5_000_000),
+}
```
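`DatasetWithSizeType` gives the CLI and frontend a single enum value that resolves to a `DatasetManager` plus size-appropriate timeouts. Resolution works like this (the config constants come from `vectordb_bench.config`):

```python
from vectordb_bench.backend.dataset import DatasetWithSizeType

choice = DatasetWithSizeType.CohereMedium

manager = choice.get_manager()               # Dataset.COHERE.manager(1_000_000)
load_timeout = choice.get_load_timeout()     # config.LOAD_TIMEOUT_768D_1M ("medium" match)
opt_timeout = choice.get_optimize_timeout()  # config.OPTIMIZE_TIMEOUT_768D_1M

print(manager.data.full_name)                # "Cohere (Medium)"
```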
`vectordb_bench/backend/filter.py` (new file)

```diff
@@ -0,0 +1,76 @@
+from enum import StrEnum
+
+from ..base import BaseModel
+
+
+class FilterOp(StrEnum):
+    NumGE = "NumGE"  # test ">="
+    StrEqual = "Label"  # test "=="
+    NonFilter = "NonFilter"
+
+
+class Filter(BaseModel):
+    type: FilterOp
+    filter_rate: float = 0.0
+
+    @property
+    def groundtruth_file(self) -> str:
+        raise NotImplementedError
+
+
+class NonFilter(Filter):
+    type: FilterOp = FilterOp.NonFilter
+    filter_rate: float = 0.0
+    gt_file_name: str = "neighbors.parquet"
+
+    @property
+    def groundtruth_file(self) -> str:
+        return self.gt_file_name
+
+
+non_filter = NonFilter()
+
+
+class IntFilter(Filter):
+    """
+    compatible with older int-filter cases
+    filter expr: int_field >= int_value (dataset_size * filter_rate)
+    """
+
+    type: FilterOp = FilterOp.NumGE
+    int_field: str = "id"
+    int_value: int
+
+    @property
+    def groundtruth_file(self) -> str:
+        if self.filter_rate == 0.01:
+            return "neighbors_head_1p.parquet"
+        if self.filter_rate == 0.99:
+            return "neighbors_tail_1p.parquet"
+        msg = f"Not Support Int Filter - {self.filter_rate}"
+        raise RuntimeError(msg)
+
+
+class LabelFilter(Filter):
+    """
+    filter expr: label_field == label_value, like `color == "red"`
+    """
+
+    type: FilterOp = FilterOp.StrEqual
+    label_field: str = "labels"
+    label_percentage: float
+
+    @property
+    def label_value(self) -> str:
+        p = self.label_percentage * 100
+        if p >= 1:
+            return f"label_{int(p)}p"  # such as 5p, 20p, 1p, ...
+        return f"label_{p:.1f}p"  # such as 0.1p, 0.5p, ...
+
+    def __init__(self, label_percentage: float, **kwargs):
+        filter_rate = 1.0 - label_percentage
+        super().__init__(filter_rate=filter_rate, label_percentage=label_percentage, **kwargs)
+
+    @property
+    def groundtruth_file(self) -> str:
+        return f"neighbors_{self.label_field}_{self.label_value}.parquet"
```
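The new filter module puts the legacy int-filter cases and the new label-filter cases behind one interface, with the ground-truth file name derived from the filter itself. A few illustrative values (the `int_value` is arbitrary):

```python
from vectordb_bench.backend.filter import IntFilter, LabelFilter, non_filter

# No filter: the plain ground-truth file.
assert non_filter.groundtruth_file == "neighbors.parquet"

# Legacy int filter: only filter_rate 0.01 and 0.99 have precomputed ground truth.
head = IntFilter(filter_rate=0.01, int_value=10_000)
assert head.groundtruth_file == "neighbors_head_1p.parquet"

# Label filter selecting a label that covers 0.5% of the rows.
lf = LabelFilter(label_percentage=0.005)
assert lf.label_value == "label_0.5p"
assert lf.groundtruth_file == "neighbors_labels_label_0.5p.parquet"
```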
`vectordb_bench/backend/runner/__init__.py`

```diff
@@ -1,10 +1,10 @@
-from .mp_runner import
-
-)
+from .mp_runner import MultiProcessingSearchRunner
+from .read_write_runner import ReadWriteRunner
 from .serial_runner import SerialInsertRunner, SerialSearchRunner
 
 __all__ = [
     "MultiProcessingSearchRunner",
+    "ReadWriteRunner",
     "SerialInsertRunner",
     "SerialSearchRunner",
 ]
```
`vectordb_bench/backend/runner/mp_runner.py`

```diff
@@ -9,6 +9,8 @@ from multiprocessing.queues import Queue
 
 import numpy as np
 
+from vectordb_bench.backend.filter import Filter, non_filter
+
 from ... import config
 from ...models import ConcurrencySlotTimeoutError
 from ..clients import api
@@ -31,7 +33,7 @@ class MultiProcessingSearchRunner:
         db: api.VectorDB,
         test_data: list[list[float]],
         k: int = config.K_DEFAULT,
-        filters:
+        filters: Filter = non_filter,
         concurrencies: Iterable[int] = config.NUM_CONCURRENCY,
         duration: int = config.CONCURRENCY_DURATION,
         concurrency_timeout: int = config.CONCURRENCY_TIMEOUT,
@@ -58,6 +60,7 @@ class MultiProcessingSearchRunner:
             cond.wait()
 
         with self.db.init():
+            self.db.prepare_filter(self.filters)
             num, idx = len(test_data), random.randint(0, len(test_data) - 1)
 
             start_time = time.perf_counter()
@@ -66,18 +69,12 @@ class MultiProcessingSearchRunner:
             while time.perf_counter() < start_time + self.duration:
                 s = time.perf_counter()
                 try:
-                    self.db.search_embedding(
-
-
-                        self.filters,
-                    )
+                    self.db.search_embedding(test_data[idx], self.k)
+                    count += 1
+                    latencies.append(time.perf_counter() - s)
                 except Exception as e:
                     log.warning(f"VectorDB search_embedding error: {e}")
-                    traceback.print_exc(chain=True)
-                    raise e from None
 
-                latencies.append(time.perf_counter() - s)
-                count += 1
                 # loop through the test data
                 idx = idx + 1 if idx < num - 1 else 0
 
```
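Filtering is now set up once per worker, immediately after `db.init()`, through `db.prepare_filter(self.filters)`, instead of passing the filter into every `search_embedding` call. A hedged sketch of the pattern a client might follow (an illustration only, not the actual `api.VectorDB` contract):

```python
from vectordb_bench.backend.filter import Filter, FilterOp


class ExampleClient:
    """Illustrative client: translate the Filter once, reuse it on every query."""

    def __init__(self) -> None:
        self.expr: str | None = None

    def prepare_filter(self, filters: Filter) -> None:
        # Build the search expression once per process instead of per request.
        if filters.type == FilterOp.NonFilter:
            self.expr = None
        elif filters.type == FilterOp.NumGE:
            self.expr = f"{filters.int_field} >= {filters.int_value}"
        elif filters.type == FilterOp.StrEqual:
            self.expr = f'{filters.label_field} == "{filters.label_value}"'

    def search_embedding(self, query: list[float], k: int) -> list[int]:
        # A real client would attach self.expr to its search request here.
        raise NotImplementedError
```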
`vectordb_bench/backend/runner/mp_runner.py` (continued)

```diff
@@ -181,10 +178,20 @@ class MultiProcessingSearchRunner:
     def stop(self) -> None:
         pass
 
-    def run_by_dur(self, duration: int) -> float:
+    def run_by_dur(self, duration: int) -> tuple[float, float]:
+        """
+        Returns:
+            float: largest qps
+            float: failed rate
+        """
         return self._run_by_dur(duration)
 
-    def _run_by_dur(self, duration: int) -> float:
+    def _run_by_dur(self, duration: int) -> tuple[float, float]:
+        """
+        Returns:
+            float: largest qps
+            float: failed rate
+        """
         max_qps = 0
         try:
             for conc in self.concurrencies:
@@ -208,12 +215,17 @@ class MultiProcessingSearchRunner:
                             log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
 
                         start = time.perf_counter()
-
+                        res = [r.result() for r in future_iter]
+                        all_success_count = sum([r[0] for r in res])
+                        all_failed_count = sum([r[1] for r in res])
+                        failed_rate = all_failed_count / (all_failed_count + all_success_count)
                         cost = time.perf_counter() - start
 
-                        qps = round(
-                        log.info(
-
+                        qps = round(all_success_count / cost, 4)
+                        log.info(
+                            f"End search in concurrency {conc}: dur={cost}s, failed_rate={failed_rate}, "
+                            f"all_success_count={all_success_count}, all_failed_count={all_failed_count}, qps={qps}",
+                        )
                         if qps > max_qps:
                             max_qps = qps
                             log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
@@ -230,52 +242,53 @@ class MultiProcessingSearchRunner:
         finally:
             self.stop()
 
-        return max_qps
+        return max_qps, failed_rate
 
-    def search_by_dur(
-
-
-
-
-    ) -> int:
+    def search_by_dur(self, dur: int, test_data: list[list[float]], q: mp.Queue, cond: mp.Condition) -> tuple[int, int]:
+        """
+        Returns:
+            int: successful requests count
+            int: failed requests count
+        """
         # sync all process
         q.put(1)
         with cond:
             cond.wait()
 
         with self.db.init():
+            self.db.prepare_filter(self.filters)
             num, idx = len(test_data), random.randint(0, len(test_data) - 1)
 
             start_time = time.perf_counter()
-
+            success_count = 0
+            failed_cnt = 0
             while time.perf_counter() < start_time + dur:
                 s = time.perf_counter()
                 try:
-                    self.db.search_embedding(
-
-                        self.k,
-                        self.filters,
-                    )
+                    self.db.search_embedding(test_data[idx], self.k)
+                    success_count += 1
                 except Exception as e:
-
-
-
+                    failed_cnt += 1
+                    # reduce log
+                    if failed_cnt <= 3:
+                        log.warning(f"VectorDB search_embedding error: {e}")
+                    else:
+                        log.debug(f"VectorDB search_embedding error: {e}")
 
-                count += 1
                 # loop through the test data
                 idx = idx + 1 if idx < num - 1 else 0
 
-                if
+                if success_count % 500 == 0:
                     log.debug(
-                        f"({mp.current_process().name:16}) search_count: {
-                        f"latest_latency={time.perf_counter()-s}"
+                        f"({mp.current_process().name:16}) search_count: {success_count}, "
+                        f"latest_latency={time.perf_counter()-s}",
                    )
 
            total_dur = round(time.perf_counter() - start_time, 4)
            log.debug(
                f"{mp.current_process().name:16} search {self.duration}s: "
-                f"actual_dur={total_dur}s, count={
+                f"actual_dur={total_dur}s, count={success_count}, failed_cnt={failed_cnt}, "
+                f"qps (successful) in this process: {round(success_count / total_dur, 4):3}",
            )
 
-            return
+            return success_count, failed_cnt
```
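Each worker now reports a `(success_count, failed_cnt)` pair, and `_run_by_dur` folds those into a qps figure over successful requests plus an overall failure rate. The aggregation arithmetic in isolation (the numbers are made up):

```python
# Per-worker results as returned by search_by_dur: (success_count, failed_cnt).
res = [(4800, 2), (4750, 0), (4900, 5)]
cost = 30.0  # seconds the concurrency round actually ran

all_success_count = sum(r[0] for r in res)  # 14450
all_failed_count = sum(r[1] for r in res)   # 7
failed_rate = all_failed_count / (all_failed_count + all_success_count)
qps = round(all_success_count / cost, 4)    # only successful requests count toward qps

print(qps, failed_rate)  # 481.6667 ~0.00048
```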