vectordb-bench 0.0.19__py3-none-any.whl → 0.0.20__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
Files changed (105)
  1. vectordb_bench/__init__.py +49 -24
  2. vectordb_bench/__main__.py +4 -3
  3. vectordb_bench/backend/assembler.py +12 -13
  4. vectordb_bench/backend/cases.py +55 -45
  5. vectordb_bench/backend/clients/__init__.py +75 -14
  6. vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
  7. vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
  8. vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +111 -70
  9. vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
  10. vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
  11. vectordb_bench/backend/clients/alloydb/cli.py +51 -34
  12. vectordb_bench/backend/clients/alloydb/config.py +30 -30
  13. vectordb_bench/backend/clients/api.py +5 -9
  14. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
  15. vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
  16. vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
  17. vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
  18. vectordb_bench/backend/clients/chroma/chroma.py +38 -36
  19. vectordb_bench/backend/clients/chroma/config.py +4 -2
  20. vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
  21. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
  22. vectordb_bench/backend/clients/memorydb/cli.py +8 -8
  23. vectordb_bench/backend/clients/memorydb/config.py +2 -2
  24. vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
  25. vectordb_bench/backend/clients/milvus/cli.py +41 -83
  26. vectordb_bench/backend/clients/milvus/config.py +18 -8
  27. vectordb_bench/backend/clients/milvus/milvus.py +18 -19
  28. vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
  29. vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
  30. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
  31. vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
  32. vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
  33. vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
  34. vectordb_bench/backend/clients/pgvector/cli.py +40 -31
  35. vectordb_bench/backend/clients/pgvector/config.py +63 -73
  36. vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
  37. vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
  38. vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
  39. vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
  40. vectordb_bench/backend/clients/pinecone/config.py +1 -0
  41. vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
  42. vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
  43. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
  44. vectordb_bench/backend/clients/redis/cli.py +6 -12
  45. vectordb_bench/backend/clients/redis/config.py +7 -5
  46. vectordb_bench/backend/clients/redis/redis.py +94 -58
  47. vectordb_bench/backend/clients/test/cli.py +1 -2
  48. vectordb_bench/backend/clients/test/config.py +2 -2
  49. vectordb_bench/backend/clients/test/test.py +4 -5
  50. vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
  51. vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
  52. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
  53. vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
  54. vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
  55. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
  56. vectordb_bench/backend/data_source.py +30 -18
  57. vectordb_bench/backend/dataset.py +47 -27
  58. vectordb_bench/backend/result_collector.py +2 -3
  59. vectordb_bench/backend/runner/__init__.py +4 -6
  60. vectordb_bench/backend/runner/mp_runner.py +85 -34
  61. vectordb_bench/backend/runner/rate_runner.py +30 -19
  62. vectordb_bench/backend/runner/read_write_runner.py +51 -23
  63. vectordb_bench/backend/runner/serial_runner.py +91 -48
  64. vectordb_bench/backend/runner/util.py +4 -3
  65. vectordb_bench/backend/task_runner.py +92 -72
  66. vectordb_bench/backend/utils.py +17 -10
  67. vectordb_bench/base.py +0 -1
  68. vectordb_bench/cli/cli.py +65 -60
  69. vectordb_bench/cli/vectordbbench.py +6 -7
  70. vectordb_bench/frontend/components/check_results/charts.py +8 -19
  71. vectordb_bench/frontend/components/check_results/data.py +4 -16
  72. vectordb_bench/frontend/components/check_results/filters.py +8 -16
  73. vectordb_bench/frontend/components/check_results/nav.py +4 -4
  74. vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
  75. vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
  76. vectordb_bench/frontend/components/concurrent/charts.py +12 -12
  77. vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
  78. vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
  79. vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
  80. vectordb_bench/frontend/components/custom/initStyle.py +1 -1
  81. vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
  82. vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
  83. vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
  84. vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
  85. vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
  86. vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
  87. vectordb_bench/frontend/components/tables/data.py +3 -6
  88. vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
  89. vectordb_bench/frontend/pages/concurrent.py +3 -5
  90. vectordb_bench/frontend/pages/custom.py +30 -9
  91. vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
  92. vectordb_bench/frontend/pages/run_test.py +3 -7
  93. vectordb_bench/frontend/utils.py +1 -1
  94. vectordb_bench/frontend/vdb_benchmark.py +4 -6
  95. vectordb_bench/interface.py +56 -26
  96. vectordb_bench/log_util.py +59 -64
  97. vectordb_bench/metric.py +10 -11
  98. vectordb_bench/models.py +26 -43
  99. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +22 -15
  100. vectordb_bench-0.0.20.dist-info/RECORD +135 -0
  101. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
  102. vectordb_bench-0.0.19.dist-info/RECORD +0 -135
  103. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
  104. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
  105. {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/task_runner.py CHANGED
@@ -1,24 +1,20 @@
+ import concurrent
  import logging
- import psutil
  import traceback
- import concurrent
- import numpy as np
  from enum import Enum, auto
 
- from . import utils
- from .cases import Case, CaseLabel
- from ..base import BaseModel
- from ..models import TaskConfig, PerformanceTimeoutError, TaskStage
+ import numpy as np
+ import psutil
 
- from .clients import (
-     api,
-     MetricType
- )
- from ..metric import Metric
- from .runner import MultiProcessingSearchRunner
- from .runner import SerialSearchRunner, SerialInsertRunner
- from .data_source import DatasetSource
+ from vectordb_bench.base import BaseModel
+ from vectordb_bench.metric import Metric
+ from vectordb_bench.models import PerformanceTimeoutError, TaskConfig, TaskStage
 
+ from . import utils
+ from .cases import Case, CaseLabel
+ from .clients import MetricType, api
+ from .data_source import DatasetSource
+ from .runner import MultiProcessingSearchRunner, SerialInsertRunner, SerialSearchRunner
 
  log = logging.getLogger(__name__)
 
@@ -53,24 +49,39 @@ class CaseRunner(BaseModel):
      search_runner: MultiProcessingSearchRunner | None = None
      final_search_runner: MultiProcessingSearchRunner | None = None
 
-     def __eq__(self, obj):
+     def __eq__(self, obj: any):
          if isinstance(obj, CaseRunner):
-             return self.ca.label == CaseLabel.Performance and \
-                 self.config.db == obj.config.db and \
-                 self.config.db_case_config == obj.config.db_case_config and \
-                 self.ca.dataset == obj.ca.dataset
+             return (
+                 self.ca.label == CaseLabel.Performance
+                 and self.config.db == obj.config.db
+                 and self.config.db_case_config == obj.config.db_case_config
+                 and self.ca.dataset == obj.ca.dataset
+             )
          return False
 
      def display(self) -> dict:
-         c_dict = self.ca.dict(include={'label':True, 'filters': True,'dataset':{'data': {'name': True, 'size': True, 'dim': True, 'metric_type': True, 'label': True}} })
-         c_dict['db'] = self.config.db_name
+         c_dict = self.ca.dict(
+             include={
+                 "label": True,
+                 "filters": True,
+                 "dataset": {
+                     "data": {
+                         "name": True,
+                         "size": True,
+                         "dim": True,
+                         "metric_type": True,
+                         "label": True,
+                     },
+                 },
+             },
+         )
+         c_dict["db"] = self.config.db_name
          return c_dict
 
      @property
      def normalize(self) -> bool:
          assert self.db
-         return self.db.need_normalize_cosine() and \
-             self.ca.dataset.data.metric_type == MetricType.COSINE
+         return self.db.need_normalize_cosine() and self.ca.dataset.data.metric_type == MetricType.COSINE
 
      def init_db(self, drop_old: bool = True) -> None:
          db_cls = self.config.db.init_cls
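Note that the new `__eq__` signature annotates `obj` with the builtin `any` function rather than `typing.Any`. A minimal sketch of the conventional spelling of such an equality check (our reading of the intent, not code from this release):

    from typing import Any


    class Example:
        def __init__(self, key: str) -> None:
            self.key = key

        def __eq__(self, obj: Any) -> bool:
            # compare only the identifying field, as CaseRunner compares
            # db, db_case_config, and dataset
            return isinstance(obj, Example) and self.key == obj.key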
@@ -80,8 +91,7 @@ class CaseRunner(BaseModel):
              db_config=self.config.db_config.to_dict(),
              db_case_config=self.config.db_case_config,
              drop_old=drop_old,
-         ) # type:ignore
-
+         )
 
      def _pre_run(self, drop_old: bool = True):
          try:
@@ -89,12 +99,9 @@ class CaseRunner(BaseModel):
              self.ca.dataset.prepare(self.dataset_source, filters=self.ca.filter_rate)
          except ModuleNotFoundError as e:
              log.warning(
-                 f"pre run case error: please install client for db: {self.config.db}, error={e}"
+                 f"pre run case error: please install client for db: {self.config.db}, error={e}",
              )
              raise e from None
-         except Exception as e:
-             log.warning(f"pre run case error: {e}")
-             raise e from None
 
      def run(self, drop_old: bool = True) -> Metric:
          log.info("Starting run")
@@ -103,12 +110,11 @@ class CaseRunner(BaseModel):
 
          if self.ca.label == CaseLabel.Load:
              return self._run_capacity_case()
-         elif self.ca.label == CaseLabel.Performance:
+         if self.ca.label == CaseLabel.Performance:
              return self._run_perf_case(drop_old)
-         else:
-             msg = f"unknown case type: {self.ca.label}"
-             log.warning(msg)
-             raise ValueError(msg)
+         msg = f"unknown case type: {self.ca.label}"
+         log.warning(msg)
+         raise ValueError(msg)
 
      def _run_capacity_case(self) -> Metric:
          """run capacity cases
@@ -120,7 +126,10 @@ class CaseRunner(BaseModel):
          log.info("Start capacity case")
          try:
              runner = SerialInsertRunner(
-                 self.db, self.ca.dataset, self.normalize, self.ca.load_timeout
+                 self.db,
+                 self.ca.dataset,
+                 self.normalize,
+                 self.ca.load_timeout,
              )
              count = runner.run_endlessness()
          except Exception as e:
@@ -128,7 +137,7 @@ class CaseRunner(BaseModel):
              raise e from None
          else:
              log.info(
-                 f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}"
+                 f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}",
              )
              return Metric(max_load_count=count)
 
@@ -138,7 +147,7 @@ class CaseRunner(BaseModel):
          Returns:
              Metric: load_duration, recall, serial_latency_p99, and, qps
          """
-         '''
+         """
          if drop_old:
              _, load_dur = self._load_train_data()
              build_dur = self._optimize()
@@ -153,38 +162,40 @@ class CaseRunner(BaseModel):
 
          m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
          m.recall, m.serial_latency_p99 = self._serial_search()
-         '''
+         """
 
          log.info("Start performance case")
          try:
              m = Metric()
              if drop_old:
                  if TaskStage.LOAD in self.config.stages:
-                     # self._load_train_data()
                      _, load_dur = self._load_train_data()
                      build_dur = self._optimize()
                      m.load_duration = round(load_dur + build_dur, 4)
                      log.info(
                          f"Finish loading the entire dataset into VectorDB,"
                          f" insert_duration={load_dur}, optimize_duration={build_dur}"
-                         f" load_duration(insert + optimize) = {m.load_duration}"
+                         f" load_duration(insert + optimize) = {m.load_duration}",
                      )
                  else:
                      log.info("Data loading skipped")
-             if (
-                 TaskStage.SEARCH_SERIAL in self.config.stages
-                 or TaskStage.SEARCH_CONCURRENT in self.config.stages
-             ):
+             if TaskStage.SEARCH_SERIAL in self.config.stages or TaskStage.SEARCH_CONCURRENT in self.config.stages:
                  self._init_search_runner()
                  if TaskStage.SEARCH_CONCURRENT in self.config.stages:
                      search_results = self._conc_search()
-                     m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list, m.conc_latency_avg_list = search_results
+                     (
+                         m.qps,
+                         m.conc_num_list,
+                         m.conc_qps_list,
+                         m.conc_latency_p99_list,
+                         m.conc_latency_avg_list,
+                     ) = search_results
                  if TaskStage.SEARCH_SERIAL in self.config.stages:
                      search_results = self._serial_search()
-                     '''
+                     """
                      m.recall = search_results.recall
                      m.serial_latencies = search_results.serial_latencies
-                     '''
+                     """
                      m.recall, m.ndcg, m.serial_latency_p99 = search_results
 
          except Exception as e:
@@ -199,7 +210,12 @@ class CaseRunner(BaseModel):
      def _load_train_data(self):
          """Insert train data and get the insert_duration"""
          try:
-             runner = SerialInsertRunner(self.db, self.ca.dataset, self.normalize, self.ca.load_timeout)
+             runner = SerialInsertRunner(
+                 self.db,
+                 self.ca.dataset,
+                 self.normalize,
+                 self.ca.load_timeout,
+             )
              runner.run()
          except Exception as e:
              raise e from None
@@ -215,11 +231,12 @@ class CaseRunner(BaseModel):
          """
          try:
              results, _ = self.serial_search_runner.run()
-             return results
          except Exception as e:
-             log.warning(f"search error: {str(e)}, {e}")
+             log.warning(f"search error: {e!s}, {e}")
              self.stop()
-             raise e from None
+             raise e from e
+         else:
+             return results
 
      def _conc_search(self):
          """Performance concurrency tests, search the test data endlessness
@@ -231,7 +248,7 @@ class CaseRunner(BaseModel):
          try:
              return self.search_runner.run()
          except Exception as e:
-             log.warning(f"search error: {str(e)}, {e}")
+             log.warning(f"search error: {e!s}, {e}")
              raise e from None
          finally:
              self.stop()
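In `_serial_search` above, the `return` also moved out of the `try` body into an `else:` clause, which runs only when the `try` block raised nothing, keeping the happy path separate from error handling. A standalone sketch of the pattern (names are illustrative, not from this package):

    def fetch(runner) -> list:
        try:
            results, _ = runner.run()
        except Exception as e:
            print(f"search error: {e!s}")
            raise
        else:
            # reached only if the try block completed without raising
            return results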
@@ -250,7 +267,7 @@ class CaseRunner(BaseModel):
              log.warning(f"VectorDB optimize timeout in {self.ca.optimize_timeout}")
              for pid, _ in executor._processes.items():
                  psutil.Process(pid).kill()
-             raise PerformanceTimeoutError("Performance case optimize timeout") from e
+             raise PerformanceTimeoutError from e
          except Exception as e:
              log.warning(f"VectorDB optimize error: {e}")
              raise e from None
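For context, `_optimize` runs the optimize step inside a `concurrent.futures.ProcessPoolExecutor` and, on timeout, kills the pool's workers directly, since a running task cannot be cancelled. A self-contained sketch of that pattern (the task and the 5-second timeout are illustrative only):

    import concurrent.futures
    import time

    import psutil


    def long_optimize_step() -> None:
        time.sleep(60)  # stands in for a long-running index build


    if __name__ == "__main__":
        with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
            future = executor.submit(long_optimize_step)
            try:
                future.result(timeout=5)
            except concurrent.futures.TimeoutError:
                # running workers ignore Future.cancel(); kill the processes,
                # touching the executor's private _processes map as the diff does
                for pid in executor._processes:
                    psutil.Process(pid).kill()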
@@ -286,6 +303,16 @@ class CaseRunner(BaseModel):
          self.search_runner.stop()
 
 
+ DATA_FORMAT = " %-14s | %-12s %-20s %7s | %-10s"
+ TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
+     "DB",
+     "CaseType",
+     "Dataset",
+     "Filter",
+     "task_label",
+ )
+
+
  class TaskRunner(BaseModel):
      run_id: str
      task_label: str
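The printf-style row template becomes a module-level constant shared with `TaskRunner.display` below (note `TITLE_FORMAT` repeats the literal rather than reusing `DATA_FORMAT`). A quick illustration of the fixed-width rows it renders, with invented sample values:

    DATA_FORMAT = " %-14s | %-12s %-20s %7s | %-10s"

    print(DATA_FORMAT % ("DB", "CaseType", "Dataset", "Filter", "task_label"))
    print(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
    # each column is left-padded or right-padded to its fixed width
    print(DATA_FORMAT % ("Milvus", "Performance", "Cohere-MEDIUM-1M", "None", "demo"))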
@@ -304,18 +331,8 @@ class TaskRunner(BaseModel):
          return sum([1 for c in self.case_runners if c.status == status])
 
      def display(self) -> None:
-         DATA_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s")
-         TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
-             "DB", "CaseType", "Dataset", "Filter", "task_label")
-
          fmt = [TITLE_FORMAT]
-         fmt.append(DATA_FORMAT%(
-             "-"*11,
-             "-"*12,
-             "-"*20,
-             "-"*7,
-             "-"*7
-         ))
+         fmt.append(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
 
          for f in self.case_runners:
              if f.ca.filter_rate != 0.0:
@@ -326,13 +343,16 @@ class TaskRunner(BaseModel):
                  filters = "None"
 
              ds_str = f"{f.ca.dataset.data.name}-{f.ca.dataset.data.label}-{utils.numerize(f.ca.dataset.data.size)}"
-             fmt.append(DATA_FORMAT%(
-                 f.config.db_name,
-                 f.ca.label.name,
-                 ds_str,
-                 filters,
-                 self.task_label,
-             ))
+             fmt.append(
+                 DATA_FORMAT
+                 % (
+                     f.config.db_name,
+                     f.ca.label.name,
+                     ds_str,
+                     filters,
+                     self.task_label,
+                 ),
+             )
 
          tmp_logger = logging.getLogger("no_color")
          for f in fmt:
vectordb_bench/backend/utils.py CHANGED
@@ -2,7 +2,7 @@ import time
  from functools import wraps
 
 
- def numerize(n) -> str:
+ def numerize(n: int) -> str:
      """display positive number n for readability
 
      Examples:
@@ -16,32 +16,34 @@ def numerize(n) -> str:
          "K": 1e6,
          "M": 1e9,
          "B": 1e12,
-         "END": float('inf'),
+         "END": float("inf"),
      }
 
      display_n, sufix = n, ""
      for s, base in sufix2upbound.items():
          # number >= 1000B will alway have sufix 'B'
          if s == "END":
-             display_n = int(n/1e9)
+             display_n = int(n / 1e9)
              sufix = "B"
              break
 
          if n < base:
              sufix = "" if s == "EMPTY" else s
-             display_n = int(n/(base/1e3))
+             display_n = int(n / (base / 1e3))
              break
      return f"{display_n}{sufix}"
 
 
- def time_it(func):
-     """ returns result and elapsed time"""
+ def time_it(func: any):
+     """returns result and elapsed time"""
+
      @wraps(func)
      def inner(*args, **kwargs):
          pref = time.perf_counter()
          result = func(*args, **kwargs)
          delta = time.perf_counter() - pref
          return result, delta
+
      return inner
 
 
@@ -62,14 +64,19 @@ def compose_train_files(train_count: int, use_shuffled: bool) -> list[str]:
      return train_files
 
 
- def compose_gt_file(filters: int | float | str | None = None) -> str:
+ ONE_PERCENT = 0.01
+ NINETY_NINE_PERCENT = 0.99
+
+
+ def compose_gt_file(filters: float | str | None = None) -> str:
      if filters is None:
          return "neighbors.parquet"
 
-     if filters == 0.01:
+     if filters == ONE_PERCENT:
          return "neighbors_head_1p.parquet"
 
-     if filters == 0.99:
+     if filters == NINETY_NINE_PERCENT:
          return "neighbors_tail_1p.parquet"
 
-     raise ValueError(f"Filters not supported: {filters}")
+     msg = f"Filters not supported: {filters}"
+     raise ValueError(msg)
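The new constants make the mapping from filter rate to ground-truth file explicit; a sketch of the resulting behavior, using the same import path as above:

    from vectordb_bench.backend.utils import compose_gt_file

    assert compose_gt_file() == "neighbors.parquet"
    assert compose_gt_file(0.01) == "neighbors_head_1p.parquet"
    assert compose_gt_file(0.99) == "neighbors_tail_1p.parquet"
    # any other value raises ValueError("Filters not supported: ...")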
vectordb_bench/base.py CHANGED
@@ -3,4 +3,3 @@ from pydantic import BaseModel as PydanticBaseModel
 
  class BaseModel(PydanticBaseModel, arbitrary_types_allowed=True):
      pass
-