vectordb-bench 0.0.29__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. vectordb_bench/__init__.py +14 -27
  2. vectordb_bench/backend/assembler.py +19 -6
  3. vectordb_bench/backend/cases.py +186 -23
  4. vectordb_bench/backend/clients/__init__.py +32 -0
  5. vectordb_bench/backend/clients/api.py +22 -1
  6. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +249 -43
  7. vectordb_bench/backend/clients/aws_opensearch/cli.py +51 -21
  8. vectordb_bench/backend/clients/aws_opensearch/config.py +58 -16
  9. vectordb_bench/backend/clients/chroma/chroma.py +6 -2
  10. vectordb_bench/backend/clients/elastic_cloud/config.py +19 -1
  11. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +133 -45
  12. vectordb_bench/backend/clients/lancedb/cli.py +62 -8
  13. vectordb_bench/backend/clients/lancedb/config.py +14 -1
  14. vectordb_bench/backend/clients/lancedb/lancedb.py +21 -9
  15. vectordb_bench/backend/clients/memorydb/memorydb.py +2 -2
  16. vectordb_bench/backend/clients/milvus/cli.py +30 -9
  17. vectordb_bench/backend/clients/milvus/config.py +3 -0
  18. vectordb_bench/backend/clients/milvus/milvus.py +81 -23
  19. vectordb_bench/backend/clients/oceanbase/cli.py +100 -0
  20. vectordb_bench/backend/clients/oceanbase/config.py +125 -0
  21. vectordb_bench/backend/clients/oceanbase/oceanbase.py +215 -0
  22. vectordb_bench/backend/clients/pinecone/pinecone.py +39 -25
  23. vectordb_bench/backend/clients/qdrant_cloud/config.py +59 -3
  24. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +100 -33
  25. vectordb_bench/backend/clients/qdrant_local/cli.py +60 -0
  26. vectordb_bench/backend/clients/qdrant_local/config.py +47 -0
  27. vectordb_bench/backend/clients/qdrant_local/qdrant_local.py +232 -0
  28. vectordb_bench/backend/clients/weaviate_cloud/cli.py +29 -3
  29. vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -0
  30. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +5 -0
  31. vectordb_bench/backend/dataset.py +143 -27
  32. vectordb_bench/backend/filter.py +76 -0
  33. vectordb_bench/backend/runner/__init__.py +3 -3
  34. vectordb_bench/backend/runner/mp_runner.py +52 -39
  35. vectordb_bench/backend/runner/rate_runner.py +68 -52
  36. vectordb_bench/backend/runner/read_write_runner.py +125 -68
  37. vectordb_bench/backend/runner/serial_runner.py +56 -23
  38. vectordb_bench/backend/task_runner.py +48 -20
  39. vectordb_bench/cli/batch_cli.py +121 -0
  40. vectordb_bench/cli/cli.py +59 -1
  41. vectordb_bench/cli/vectordbbench.py +7 -0
  42. vectordb_bench/config-files/batch_sample_config.yml +17 -0
  43. vectordb_bench/frontend/components/check_results/data.py +16 -11
  44. vectordb_bench/frontend/components/check_results/filters.py +53 -25
  45. vectordb_bench/frontend/components/check_results/headerIcon.py +16 -13
  46. vectordb_bench/frontend/components/check_results/nav.py +20 -0
  47. vectordb_bench/frontend/components/custom/displayCustomCase.py +43 -8
  48. vectordb_bench/frontend/components/custom/displaypPrams.py +10 -5
  49. vectordb_bench/frontend/components/custom/getCustomConfig.py +10 -0
  50. vectordb_bench/frontend/components/label_filter/charts.py +60 -0
  51. vectordb_bench/frontend/components/run_test/caseSelector.py +48 -52
  52. vectordb_bench/frontend/components/run_test/dbSelector.py +9 -5
  53. vectordb_bench/frontend/components/run_test/inputWidget.py +48 -0
  54. vectordb_bench/frontend/components/run_test/submitTask.py +3 -1
  55. vectordb_bench/frontend/components/streaming/charts.py +253 -0
  56. vectordb_bench/frontend/components/streaming/data.py +62 -0
  57. vectordb_bench/frontend/components/tables/data.py +1 -1
  58. vectordb_bench/frontend/components/welcome/explainPrams.py +66 -0
  59. vectordb_bench/frontend/components/welcome/pagestyle.py +106 -0
  60. vectordb_bench/frontend/components/welcome/welcomePrams.py +147 -0
  61. vectordb_bench/frontend/config/dbCaseConfigs.py +420 -41
  62. vectordb_bench/frontend/config/styles.py +32 -2
  63. vectordb_bench/frontend/pages/concurrent.py +5 -1
  64. vectordb_bench/frontend/pages/custom.py +4 -0
  65. vectordb_bench/frontend/pages/label_filter.py +56 -0
  66. vectordb_bench/frontend/pages/quries_per_dollar.py +5 -1
  67. vectordb_bench/frontend/pages/results.py +60 -0
  68. vectordb_bench/frontend/pages/run_test.py +3 -3
  69. vectordb_bench/frontend/pages/streaming.py +135 -0
  70. vectordb_bench/frontend/pages/tables.py +4 -0
  71. vectordb_bench/frontend/vdb_benchmark.py +16 -41
  72. vectordb_bench/interface.py +6 -2
  73. vectordb_bench/metric.py +15 -1
  74. vectordb_bench/models.py +38 -11
  75. vectordb_bench/results/ElasticCloud/result_20250318_standard_elasticcloud.json +5890 -0
  76. vectordb_bench/results/Milvus/result_20250509_standard_milvus.json +6138 -0
  77. vectordb_bench/results/OpenSearch/result_20250224_standard_opensearch.json +7319 -0
  78. vectordb_bench/results/Pinecone/result_20250124_standard_pinecone.json +2365 -0
  79. vectordb_bench/results/QdrantCloud/result_20250602_standard_qdrantcloud.json +3556 -0
  80. vectordb_bench/results/ZillizCloud/result_20250613_standard_zillizcloud.json +6290 -0
  81. vectordb_bench/results/dbPrices.json +12 -4
  82. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/METADATA +131 -32
  83. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/RECORD +87 -65
  84. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/WHEEL +1 -1
  85. vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -791
  86. vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -679
  87. vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -1352
  88. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/entry_points.txt +0 -0
  89. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/licenses/LICENSE +0 -0
  90. {vectordb_bench-0.0.29.dist-info → vectordb_bench-1.0.0.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/dataset.py

@@ -20,6 +20,7 @@ from vectordb_bench.base import BaseModel
  from . import utils
  from .clients import MetricType
  from .data_source import DatasetReader, DatasetSource
+ from .filter import Filter, FilterOp, non_filter

  log = logging.getLogger(__name__)

@@ -39,6 +40,21 @@ class BaseDataset(BaseModel):
      with_gt: bool = False
      _size_label: dict[int, SizeLabel] = PrivateAttr()
      is_custom: bool = False
+     with_remote_resource: bool = True
+     # for label filter cases
+     with_scalar_labels: bool = False
+     # if True, scalar_labels will be retrieved from a separate parquet file;
+     # otherwise, they will be obtained from train.parquet.
+     scalar_labels_file_separated: bool = True
+     scalar_labels_file: str = "scalar_labels.parquet"
+     scalar_label_percentages: list[float] = []
+     train_id_field: str = "id"
+     train_vector_field: str = "emb"
+     test_file: str = "test.parquet"
+     test_id_field: str = "id"
+     test_vector_field: str = "emb"
+     gt_id_field: str = "id"
+     gt_neighbors_field: str = "neighbors_id"

      @validator("size")
      def verify_size(cls, v: int):
@@ -51,6 +67,10 @@ class BaseDataset(BaseModel):
      def label(self) -> str:
          return self._size_label.get(self.size).label

+     @property
+     def full_name(self) -> str:
+         return f"{self.name.capitalize()} ({self.label.capitalize()})"
+
      @property
      def dir_name(self) -> str:
          return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
@@ -59,11 +79,27 @@ class BaseDataset(BaseModel):
      def file_count(self) -> int:
          return self._size_label.get(self.size).file_count

+     @property
+     def train_files(self) -> list[str]:
+         return utils.compose_train_files(self.file_count, self.use_shuffled)
+

  class CustomDataset(BaseDataset):
      dir: str
      file_num: int
      is_custom: bool = True
+     with_remote_resource: bool = False
+     train_file: str = "train"
+     train_id_field: str = "id"
+     train_vector_field: str = "emb"
+     test_file: str = "test.parquet"
+     gt_file: str = "neighbors.parquet"
+     test_vector_field: str = "emb"
+     gt_neighbors_field: str = "neighbors_id"
+     with_scalar_labels: bool = True
+     scalar_labels_file_separated: bool = True
+     scalar_labels_file: str = "scalar_labels.parquet"
+     label_percentages: list[float] = []

      @validator("size")
      def verify_size(cls, v: int):
@@ -81,6 +117,17 @@ class CustomDataset(BaseDataset):
      def file_count(self) -> int:
          return self.file_num

+     @property
+     def train_files(self) -> list[str]:
+         train_file = self.train_file
+         prefix = f"{train_file}"
+         train_files = []
+         prefix_s = [item.strip() for item in prefix.split(",") if item.strip()]
+         for i in range(len(prefix_s)):
+             sub_file = f"{prefix_s[i]}.parquet"
+             train_files.append(sub_file)
+         return train_files
+

  class LAION(BaseDataset):
      name: str = "LAION"
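
The CustomDataset.train_files property added above splits a comma-separated train_file string into per-shard parquet names, so one custom dataset can span several training files. A standalone sketch of that parsing rule (re-implemented here for illustration, not imported from the package; the shard names are hypothetical):

    # Re-implementation of the parsing rule used by CustomDataset.train_files.
    def train_files_from(train_file: str) -> list[str]:
        prefixes = [item.strip() for item in train_file.split(",") if item.strip()]
        return [f"{p}.parquet" for p in prefixes]

    print(train_files_from("train"))               # ['train.parquet']
    print(train_files_from("shard_00, shard_01"))  # ['shard_00.parquet', 'shard_01.parquet']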
@@ -109,12 +156,28 @@ class Cohere(BaseDataset):
      dim: int = 768
      metric_type: MetricType = MetricType.COSINE
      use_shuffled: bool = config.USE_SHUFFLED_DATA
-     with_gt: bool = (True,)
+     with_gt: bool = True
      _size_label: dict = {
          100_000: SizeLabel(100_000, "SMALL", 1),
          1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
          10_000_000: SizeLabel(10_000_000, "LARGE", 10),
      }
+     with_scalar_labels: bool = True
+     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
+
+
+ class Bioasq(BaseDataset):
+     name: str = "Bioasq"
+     dim: int = 1024
+     metric_type: MetricType = MetricType.COSINE
+     use_shuffled: bool = config.USE_SHUFFLED_DATA
+     with_gt: bool = True
+     _size_label: dict = {
+         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
+         10_000_000: SizeLabel(10_000_000, "LARGE", 10),
+     }
+     with_scalar_labels: bool = True
+     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]


  class Glove(BaseDataset):
@@ -146,12 +209,14 @@ class OpenAI(BaseDataset):
      dim: int = 1536
      metric_type: MetricType = MetricType.COSINE
      use_shuffled: bool = config.USE_SHUFFLED_DATA
-     with_gt: bool = (True,)
+     with_gt: bool = True
      _size_label: dict = {
          50_000: SizeLabel(50_000, "SMALL", 1),
          500_000: SizeLabel(500_000, "MEDIUM", 1),
          5_000_000: SizeLabel(5_000_000, "LARGE", 10),
      }
+     with_scalar_labels: bool = True
+     scalar_label_percentages: list[float] = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]


  class DatasetManager(BaseModel):
@@ -166,8 +231,9 @@ class DatasetManager(BaseModel):
      """

      data: BaseDataset
-     test_data: pd.DataFrame | None = None
-     gt_data: pd.DataFrame | None = None
+     test_data: list[list[float]] | None = None
+     gt_data: list[list[int]] | None = None
+     scalar_labels: pl.DataFrame | None = None
      train_files: list[str] = []
      reader: DatasetReader | None = None

@@ -191,7 +257,7 @@ class DatasetManager(BaseModel):
          return pathlib.Path(
              config.DATASET_LOCAL_DIR,
              self.data.name.lower(),
-             self.data.dir_name.lower(),
+             self.data.dir_name,
          )

      def __iter__(self):
@@ -201,58 +267,59 @@ class DatasetManager(BaseModel):
      def prepare(
          self,
          source: DatasetSource = DatasetSource.S3,
-         filters: float | str | None = None,
+         filters: Filter = non_filter,
      ) -> bool:
          """Download the dataset from DatasetSource
          url = f"{source}/{self.data.dir_name}"

          Args:
              source(DatasetSource): S3 or AliyunOSS, default as S3
-             filters(Optional[int | float | str]): combined with dataset's with_gt to
+             filters(Filter): combined with dataset's with_gt to
                  compose the correct ground_truth file

          Returns:
              bool: whether the dataset is successfully prepared

          """
-         file_count, use_shuffled = self.data.file_count, self.data.use_shuffled
-
-         train_files = utils.compose_train_files(file_count, use_shuffled)
-         all_files = train_files
-
-         test_file = "test.parquet"
-         all_files.extend([test_file])
-         gt_file = None
+         self.train_files = self.data.train_files
+         gt_file, test_file = None, None
          if self.data.with_gt:
-             gt_file = utils.compose_gt_file(filters)
-             all_files.extend([gt_file])
+             gt_file, test_file = filters.groundtruth_file, self.data.test_file

-         if not self.data.is_custom:
+         if self.data.with_remote_resource:
+             download_files = [file for file in self.train_files]
+             download_files.extend([gt_file, test_file])
+             if self.data.with_scalar_labels and self.data.scalar_labels_file_separated:
+                 download_files.append(self.data.scalar_labels_file)
              source.reader().read(
                  dataset=self.data.dir_name.lower(),
-                 files=all_files,
+                 files=download_files,
                  local_ds_root=self.data_dir,
              )

-         if test_file is not None:
-             self.test_data = self._read_file(test_file)
+         # read scalar_labels_file if separated
+         if (
+             filters.type == FilterOp.StrEqual
+             and self.data.with_scalar_labels
+             and self.data.scalar_labels_file_separated
+         ):
+             self.scalar_labels = self._read_file(self.data.scalar_labels_file)

-         if gt_file is not None:
-             self.gt_data = self._read_file(gt_file)
+         if gt_file is not None and test_file is not None:
+             self.test_data = self._read_file(test_file)[self.data.test_vector_field].to_list()
+             self.gt_data = self._read_file(gt_file)[self.data.gt_neighbors_field].to_list()

-         prefix = "shuffle_train" if use_shuffled else "train"
-         self.train_files = sorted([f.name for f in self.data_dir.glob(f"{prefix}*.parquet")])
          log.debug(f"{self.data.name}: available train files {self.train_files}")

          return True

-     def _read_file(self, file_name: str) -> pd.DataFrame:
+     def _read_file(self, file_name: str) -> pl.DataFrame:
          """read one file from disk into memory"""
          log.info(f"Read the entire file into memory: {file_name}")
          p = pathlib.Path(self.data_dir, file_name)
          if not p.exists():
              log.warning(f"No such file: {p}")
-             return pd.DataFrame()
+             return pl.DataFrame()

          return pl.read_parquet(p)

@@ -308,6 +375,7 @@ class Dataset(Enum):
      LAION = LAION
      GIST = GIST
      COHERE = Cohere
+     BIOASQ = Bioasq
      GLOVE = Glove
      SIFT = SIFT
      OPENAI = OpenAI
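
With Bioasq registered in the Dataset enum, the reworked prepare() above is driven entirely by the dataset definition plus a Filter object, and test_data/gt_data are now plain Python lists instead of pandas DataFrames. A minimal usage sketch, assuming the module paths vectordb_bench.backend.dataset and vectordb_bench.backend.data_source and network access to the chosen source:

    from vectordb_bench.backend.data_source import DatasetSource
    from vectordb_bench.backend.dataset import Dataset
    from vectordb_bench.backend.filter import non_filter

    manager = Dataset.COHERE.manager(100_000)   # Cohere SMALL
    manager.prepare(source=DatasetSource.S3, filters=non_filter)

    query_vector = manager.test_data[0]   # list[float], read from test.parquet's "emb" field
    true_neighbors = manager.gt_data[0]   # list[int], read from the ground-truth "neighbors_id" field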
@@ -317,3 +385,51 @@ class Dataset(Enum):

      def manager(self, size: int) -> DatasetManager:
          return DatasetManager(data=self.get(size))
+
+
+ class DatasetWithSizeType(Enum):
+     CohereSmall = "Small Cohere (768dim, 100K)"
+     CohereMedium = "Medium Cohere (768dim, 1M)"
+     CohereLarge = "Large Cohere (768dim, 10M)"
+     BioasqMedium = "Medium Bioasq (1024dim, 1M)"
+     BioasqLarge = "Large Bioasq (1024dim, 10M)"
+     OpenAISmall = "Small OpenAI (1536dim, 50K)"
+     OpenAIMedium = "Medium OpenAI (1536dim, 500K)"
+     OpenAILarge = "Large OpenAI (1536dim, 5M)"
+
+     def get_manager(self) -> DatasetManager:
+         if self not in DatasetWithSizeMap:
+             msg = f"wrong ScalarDatasetWithSizeType: {self.name}"
+             raise ValueError(msg)
+         return DatasetWithSizeMap.get(self)
+
+     def get_load_timeout(self) -> float:
+         if "small" in self.value.lower():
+             return config.LOAD_TIMEOUT_768D_100K
+         if "medium" in self.value.lower():
+             return config.LOAD_TIMEOUT_768D_1M
+         if "large" in self.value.lower():
+             return config.LOAD_TIMEOUT_768D_10M
+         msg = f"No load_timeout for {self.value}"
+         raise KeyError(msg)
+
+     def get_optimize_timeout(self) -> float:
+         if "small" in self.value.lower():
+             return config.OPTIMIZE_TIMEOUT_768D_100K
+         if "medium" in self.value.lower():
+             return config.OPTIMIZE_TIMEOUT_768D_1M
+         if "large" in self.value.lower():
+             return config.OPTIMIZE_TIMEOUT_768D_10M
+         return config.OPTIMIZE_TIMEOUT_DEFAULT
+
+
+ DatasetWithSizeMap = {
+     DatasetWithSizeType.CohereSmall: Dataset.COHERE.manager(100_000),
+     DatasetWithSizeType.CohereMedium: Dataset.COHERE.manager(1_000_000),
+     DatasetWithSizeType.CohereLarge: Dataset.COHERE.manager(10_000_000),
+     DatasetWithSizeType.BioasqMedium: Dataset.BIOASQ.manager(1_000_000),
+     DatasetWithSizeType.BioasqLarge: Dataset.BIOASQ.manager(10_000_000),
+     DatasetWithSizeType.OpenAISmall: Dataset.OPENAI.manager(50_000),
+     DatasetWithSizeType.OpenAIMedium: Dataset.OPENAI.manager(500_000),
+     DatasetWithSizeType.OpenAILarge: Dataset.OPENAI.manager(5_000_000),
+ }
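
DatasetWithSizeType packages a dataset choice and its size into one enum value that resolves to a concrete DatasetManager and to load/optimize timeouts keyed off the "Small/Medium/Large" wording in its value. A brief sketch, again assuming the vectordb_bench.backend.dataset module path:

    from vectordb_bench.backend.dataset import DatasetWithSizeType

    choice = DatasetWithSizeType.CohereSmall
    manager = choice.get_manager()        # same object as Dataset.COHERE.manager(100_000)
    print(manager.data.full_name)         # "Cohere (Small)" via the new full_name property
    print(choice.get_load_timeout())      # config.LOAD_TIMEOUT_768D_100K ("small" in the value)
    print(choice.get_optimize_timeout())  # config.OPTIMIZE_TIMEOUT_768D_100K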
vectordb_bench/backend/filter.py (new file)

@@ -0,0 +1,76 @@
+ from enum import StrEnum
+
+ from ..base import BaseModel
+
+
+ class FilterOp(StrEnum):
+     NumGE = "NumGE"  # test ">="
+     StrEqual = "Label"  # test "=="
+     NonFilter = "NonFilter"
+
+
+ class Filter(BaseModel):
+     type: FilterOp
+     filter_rate: float = 0.0
+
+     @property
+     def groundtruth_file(self) -> str:
+         raise NotImplementedError
+
+
+ class NonFilter(Filter):
+     type: FilterOp = FilterOp.NonFilter
+     filter_rate: float = 0.0
+     gt_file_name: str = "neighbors.parquet"
+
+     @property
+     def groundtruth_file(self) -> str:
+         return self.gt_file_name
+
+
+ non_filter = NonFilter()
+
+
+ class IntFilter(Filter):
+     """
+     compatible with older int-filter cases
+     filter expr: int_field >= int_value (dataset_size * filter_rate)
+     """
+
+     type: FilterOp = FilterOp.NumGE
+     int_field: str = "id"
+     int_value: int
+
+     @property
+     def groundtruth_file(self) -> str:
+         if self.filter_rate == 0.01:
+             return "neighbors_head_1p.parquet"
+         if self.filter_rate == 0.99:
+             return "neighbors_tail_1p.parquet"
+         msg = f"Not Support Int Filter - {self.filter_rate}"
+         raise RuntimeError(msg)
+
+
+ class LabelFilter(Filter):
+     """
+     filter expr: label_field == label_value, like `color == "red"`
+     """
+
+     type: FilterOp = FilterOp.StrEqual
+     label_field: str = "labels"
+     label_percentage: float
+
+     @property
+     def label_value(self) -> str:
+         p = self.label_percentage * 100
+         if p >= 1:
+             return f"label_{int(p)}p"  # such as 5p, 20p, 1p, ...
+         return f"label_{p:.1f}p"  # such as 0.1p, 0.5p, ...
+
+     def __init__(self, label_percentage: float, **kwargs):
+         filter_rate = 1.0 - label_percentage
+         super().__init__(filter_rate=filter_rate, label_percentage=label_percentage, **kwargs)
+
+     @property
+     def groundtruth_file(self) -> str:
+         return f"neighbors_{self.label_field}_{self.label_value}.parquet"
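
The new Filter hierarchy replaces the old float | str filter values: each filter carries its FilterOp and knows which ground-truth parquet file pairs with it. A short sketch of how the file names resolve, using the same import path as mp_runner.py below; the int_value here is illustrative (per the docstring it should be dataset_size * filter_rate):

    from vectordb_bench.backend.filter import IntFilter, LabelFilter, non_filter

    print(non_filter.groundtruth_file)      # neighbors.parquet

    int_f = IntFilter(filter_rate=0.01, int_value=1_000)
    print(int_f.groundtruth_file)           # neighbors_head_1p.parquet

    label_f = LabelFilter(label_percentage=0.01)
    print(label_f.filter_rate)              # 0.99 (1.0 - label_percentage)
    print(label_f.label_value)              # label_1p
    print(label_f.groundtruth_file)         # neighbors_labels_label_1p.parquet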
vectordb_bench/backend/runner/__init__.py

@@ -1,10 +1,10 @@
- from .mp_runner import (
-     MultiProcessingSearchRunner,
- )
+ from .mp_runner import MultiProcessingSearchRunner
+ from .read_write_runner import ReadWriteRunner
  from .serial_runner import SerialInsertRunner, SerialSearchRunner

  __all__ = [
      "MultiProcessingSearchRunner",
+     "ReadWriteRunner",
      "SerialInsertRunner",
      "SerialSearchRunner",
  ]
vectordb_bench/backend/runner/mp_runner.py

@@ -9,6 +9,8 @@ from multiprocessing.queues import Queue

  import numpy as np

+ from vectordb_bench.backend.filter import Filter, non_filter
+
  from ... import config
  from ...models import ConcurrencySlotTimeoutError
  from ..clients import api
@@ -31,7 +33,7 @@ class MultiProcessingSearchRunner:
          db: api.VectorDB,
          test_data: list[list[float]],
          k: int = config.K_DEFAULT,
-         filters: dict | None = None,
+         filters: Filter = non_filter,
          concurrencies: Iterable[int] = config.NUM_CONCURRENCY,
          duration: int = config.CONCURRENCY_DURATION,
          concurrency_timeout: int = config.CONCURRENCY_TIMEOUT,
@@ -58,6 +60,7 @@ class MultiProcessingSearchRunner:
              cond.wait()

          with self.db.init():
+             self.db.prepare_filter(self.filters)
              num, idx = len(test_data), random.randint(0, len(test_data) - 1)

              start_time = time.perf_counter()
@@ -66,18 +69,12 @@ class MultiProcessingSearchRunner:
              while time.perf_counter() < start_time + self.duration:
                  s = time.perf_counter()
                  try:
-                     self.db.search_embedding(
-                         test_data[idx],
-                         self.k,
-                         self.filters,
-                     )
+                     self.db.search_embedding(test_data[idx], self.k)
+                     count += 1
+                     latencies.append(time.perf_counter() - s)
                  except Exception as e:
                      log.warning(f"VectorDB search_embedding error: {e}")
-                     traceback.print_exc(chain=True)
-                     raise e from None

-                 latencies.append(time.perf_counter() - s)
-                 count += 1

                  # loop through the test data
                  idx = idx + 1 if idx < num - 1 else 0
@@ -181,10 +178,20 @@ class MultiProcessingSearchRunner:
      def stop(self) -> None:
          pass

-     def run_by_dur(self, duration: int) -> float:
+     def run_by_dur(self, duration: int) -> tuple[float, float]:
+         """
+         Returns:
+             float: largest qps
+             float: failed rate
+         """
          return self._run_by_dur(duration)

-     def _run_by_dur(self, duration: int) -> float:
+     def _run_by_dur(self, duration: int) -> tuple[float, float]:
+         """
+         Returns:
+             float: largest qps
+             float: failed rate
+         """
          max_qps = 0
          try:
              for conc in self.concurrencies:
@@ -208,12 +215,17 @@ class MultiProcessingSearchRunner:
                      log.info(f"Syncing all process and start concurrency search, concurrency={conc}")

                      start = time.perf_counter()
-                     all_count = sum([r.result() for r in future_iter])
+                     res = [r.result() for r in future_iter]
+                     all_success_count = sum([r[0] for r in res])
+                     all_failed_count = sum([r[1] for r in res])
+                     failed_rate = all_failed_count / (all_failed_count + all_success_count)
                      cost = time.perf_counter() - start

-                     qps = round(all_count / cost, 4)
-                     log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
-
+                     qps = round(all_success_count / cost, 4)
+                     log.info(
+                         f"End search in concurrency {conc}: dur={cost}s, failed_rate={failed_rate}, "
+                         f"all_success_count={all_success_count}, all_failed_count={all_failed_count}, qps={qps}",
+                     )
                      if qps > max_qps:
                          max_qps = qps
                          log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
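
For clarity, the aggregation above now sums per-worker (success, failed) pairs and derives the failure rate before computing QPS from successful requests only. A worked example with illustrative numbers:

    # Two workers report (success, failed) counts; cost is the measured wall time.
    res = [(4_950, 50), (5_050, 50)]
    all_success_count = sum(r[0] for r in res)   # 10_000
    all_failed_count = sum(r[1] for r in res)    # 100
    failed_rate = all_failed_count / (all_failed_count + all_success_count)  # ~0.0099
    qps = round(all_success_count / 30.0, 4)     # with cost = 30s -> 333.3333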
@@ -230,52 +242,53 @@ class MultiProcessingSearchRunner:
          finally:
              self.stop()

-         return max_qps
+         return max_qps, failed_rate

-     def search_by_dur(
-         self,
-         dur: int,
-         test_data: list[list[float]],
-         q: mp.Queue,
-         cond: mp.Condition,
-     ) -> int:
+     def search_by_dur(self, dur: int, test_data: list[list[float]], q: mp.Queue, cond: mp.Condition) -> tuple[int, int]:
+         """
+         Returns:
+             int: successful requests count
+             int: failed requests count
+         """
          # sync all process
          q.put(1)
          with cond:
              cond.wait()

          with self.db.init():
+             self.db.prepare_filter(self.filters)
              num, idx = len(test_data), random.randint(0, len(test_data) - 1)

              start_time = time.perf_counter()
-             count = 0
+             success_count = 0
+             failed_cnt = 0
              while time.perf_counter() < start_time + dur:
                  s = time.perf_counter()
                  try:
-                     self.db.search_embedding(
-                         test_data[idx],
-                         self.k,
-                         self.filters,
-                     )
+                     self.db.search_embedding(test_data[idx], self.k)
+                     success_count += 1
                  except Exception as e:
-                     log.warning(f"VectorDB search_embedding error: {e}")
-                     traceback.print_exc(chain=True)
-                     raise e from None
+                     failed_cnt += 1
+                     # reduce log
+                     if failed_cnt <= 3:
+                         log.warning(f"VectorDB search_embedding error: {e}")
+                     else:
+                         log.debug(f"VectorDB search_embedding error: {e}")

-                 count += 1
                  # loop through the test data
                  idx = idx + 1 if idx < num - 1 else 0

-                 if count % 500 == 0:
+                 if success_count % 500 == 0:
                      log.debug(
-                         f"({mp.current_process().name:16}) search_count: {count}, "
-                         f"latest_latency={time.perf_counter()-s}"
+                         f"({mp.current_process().name:16}) search_count: {success_count}, "
+                         f"latest_latency={time.perf_counter()-s}",
                      )

          total_dur = round(time.perf_counter() - start_time, 4)
          log.debug(
              f"{mp.current_process().name:16} search {self.duration}s: "
-             f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}"
+             f"actual_dur={total_dur}s, count={success_count}, failed_cnt={failed_cnt}, "
+             f"qps (successful) in this process: {round(success_count / total_dur, 4):3}",
          )

-         return count
+         return success_count, failed_cnt
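
Net effect of the mp_runner.py changes: a failed request no longer aborts the worker process; each worker returns a (successful, failed) pair, and run_by_dur() reports (max_qps, failed_rate) instead of a bare float. A hedged sketch of the new contract, assuming the keyword parameters shown in the __init__ hunk above; db and queries stand in for a concrete api.VectorDB client and the prepared test vectors:

    from vectordb_bench.backend.filter import non_filter
    from vectordb_bench.backend.runner import MultiProcessingSearchRunner

    def run_concurrency_stage(db, queries):
        """db: an api.VectorDB client; queries: list[list[float]] test vectors."""
        runner = MultiProcessingSearchRunner(db=db, test_data=queries, filters=non_filter)
        max_qps, failed_rate = runner.run_by_dur(30)  # returned a single float before 1.0.0
        return max_qps, failed_rate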