vectordb-bench 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. vectordb_bench/backend/assembler.py +2 -2
  2. vectordb_bench/backend/clients/__init__.py +28 -2
  3. vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +1 -7
  4. vectordb_bench/backend/clients/alloydb/alloydb.py +1 -4
  5. vectordb_bench/backend/clients/api.py +8 -15
  6. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +54 -8
  7. vectordb_bench/backend/clients/aws_opensearch/cli.py +85 -1
  8. vectordb_bench/backend/clients/aws_opensearch/config.py +10 -0
  9. vectordb_bench/backend/clients/chroma/chroma.py +1 -4
  10. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +1 -4
  11. vectordb_bench/backend/clients/memorydb/cli.py +2 -2
  12. vectordb_bench/backend/clients/memorydb/memorydb.py +2 -5
  13. vectordb_bench/backend/clients/milvus/milvus.py +1 -20
  14. vectordb_bench/backend/clients/mongodb/config.py +53 -0
  15. vectordb_bench/backend/clients/mongodb/mongodb.py +200 -0
  16. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +1 -4
  17. vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +3 -11
  18. vectordb_bench/backend/clients/pgvector/pgvector.py +2 -7
  19. vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +2 -7
  20. vectordb_bench/backend/clients/pinecone/pinecone.py +1 -4
  21. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +3 -6
  22. vectordb_bench/backend/clients/redis/redis.py +1 -4
  23. vectordb_bench/backend/clients/test/cli.py +1 -1
  24. vectordb_bench/backend/clients/test/test.py +1 -4
  25. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +1 -4
  26. vectordb_bench/backend/data_source.py +4 -12
  27. vectordb_bench/backend/runner/mp_runner.py +16 -34
  28. vectordb_bench/backend/runner/rate_runner.py +4 -4
  29. vectordb_bench/backend/runner/read_write_runner.py +11 -15
  30. vectordb_bench/backend/runner/serial_runner.py +20 -28
  31. vectordb_bench/backend/task_runner.py +6 -26
  32. vectordb_bench/frontend/components/custom/displaypPrams.py +12 -1
  33. vectordb_bench/frontend/components/run_test/submitTask.py +20 -3
  34. vectordb_bench/frontend/config/dbCaseConfigs.py +32 -0
  35. vectordb_bench/interface.py +10 -19
  36. vectordb_bench/log_util.py +15 -2
  37. vectordb_bench/models.py +4 -0
  38. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/METADATA +55 -2
  39. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/RECORD +43 -41
  40. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/LICENSE +0 -0
  41. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/WHEEL +0 -0
  42. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/entry_points.txt +0 -0
  43. {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/top_level.txt +0 -0
@@ -40,9 +40,7 @@ class SerialInsertRunner:
40
40
  def task(self) -> int:
41
41
  count = 0
42
42
  with self.db.init():
43
- log.info(
44
- f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}",
45
- )
43
+ log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
46
44
  start = time.perf_counter()
47
45
  for data_df in self.dataset:
48
46
  all_metadata = data_df["id"].tolist()
@@ -66,13 +64,11 @@ class SerialInsertRunner:
66
64
  assert insert_count == len(all_metadata)
67
65
  count += insert_count
68
66
  if count % 100_000 == 0:
69
- log.info(
70
- f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB",
71
- )
67
+ log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
72
68
 
73
69
  log.info(
74
- f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, ",
75
- f"dur={time.perf_counter()-start}",
70
+ f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, "
71
+ f"dur={time.perf_counter() - start}"
76
72
  )
77
73
  return count
78
74
 
@@ -83,8 +79,8 @@ class SerialInsertRunner:
83
79
 
84
80
  num_batches = math.ceil(len(all_embeddings) / NUM_PER_BATCH)
85
81
  log.info(
86
- f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} ",
87
- f"embeddings in batch {NUM_PER_BATCH}",
82
+ f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} "
83
+ f"embeddings in batch {NUM_PER_BATCH}"
88
84
  )
89
85
  count = 0
90
86
  for batch_id in range(num_batches):
@@ -94,8 +90,8 @@ class SerialInsertRunner:
94
90
  embeddings = all_embeddings[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]
95
91
 
96
92
  log.debug(
97
- f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], ",
98
- f"Start inserting {len(metadata)} embeddings",
93
+ f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
94
+ f"Start inserting {len(metadata)} embeddings"
99
95
  )
100
96
  while retry_count < LOAD_MAX_TRY_COUNT:
101
97
  insert_count, error = self.db.insert_embeddings(
@@ -113,15 +109,15 @@ class SerialInsertRunner:
113
109
  else:
114
110
  break
115
111
  log.debug(
116
- f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], ",
117
- f"Finish inserting {len(metadata)} embeddings",
112
+ f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
113
+ f"Finish inserting {len(metadata)} embeddings"
118
114
  )
119
115
 
120
116
  assert already_insert_count == len(metadata)
121
117
  count += already_insert_count
122
118
  log.info(
123
- f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in ",
124
- f"batch {NUM_PER_BATCH}",
119
+ f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in "
120
+ f"batch {NUM_PER_BATCH}"
125
121
  )
126
122
  return count
127
123
 
@@ -160,8 +156,6 @@ class SerialInsertRunner:
160
156
  start_time = time.perf_counter()
161
157
  max_load_count, times = 0, 0
162
158
  try:
163
- with self.db.init():
164
- self.db.ready_to_load()
165
159
  while time.perf_counter() - start_time < self.timeout:
166
160
  count = self.endless_insert_data(
167
161
  all_embeddings,
@@ -171,13 +165,13 @@ class SerialInsertRunner:
171
165
  max_load_count += count
172
166
  times += 1
173
167
  log.info(
174
- f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, ",
175
- f"{max_load_count}",
168
+ f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, "
169
+ f"{max_load_count}"
176
170
  )
177
171
  except Exception as e:
178
172
  log.info(
179
- f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, ",
180
- f"{max_load_count}, err={e}",
173
+ f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, "
174
+ f"{max_load_count}, err={e}"
181
175
  )
182
176
  traceback.print_exc()
183
177
  return max_load_count
@@ -209,9 +203,7 @@ class SerialSearchRunner:
209
203
  self.ground_truth = ground_truth
210
204
 
211
205
  def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]:
212
- log.info(
213
- f"{mp.current_process().name:14} start search the entire test_data to get recall and latency",
214
- )
206
+ log.info(f"{mp.current_process().name:14} start search the entire test_data to get recall and latency")
215
207
  with self.db.init():
216
208
  test_data, ground_truth = args
217
209
  ideal_dcg = get_ideal_dcg(self.k)
@@ -242,8 +234,8 @@ class SerialSearchRunner:
242
234
 
243
235
  if len(latencies) % 100 == 0:
244
236
  log.debug(
245
- f"({mp.current_process().name:14}) search_count={len(latencies):3}, ",
246
- f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}",
237
+ f"({mp.current_process().name:14}) search_count={len(latencies):3}, "
238
+ f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}"
247
239
  )
248
240
 
249
241
  avg_latency = round(np.mean(latencies), 4)
@@ -258,7 +250,7 @@ class SerialSearchRunner:
258
250
  f"avg_recall={avg_recall}, "
259
251
  f"avg_ndcg={avg_ndcg},"
260
252
  f"avg_latency={avg_latency}, "
261
- f"p99={p99}",
253
+ f"p99={p99}"
262
254
  )
263
255
  return (avg_recall, avg_ndcg, p99)
264
256
 
@@ -98,9 +98,7 @@ class CaseRunner(BaseModel):
98
98
  self.init_db(drop_old)
99
99
  self.ca.dataset.prepare(self.dataset_source, filters=self.ca.filter_rate)
100
100
  except ModuleNotFoundError as e:
101
- log.warning(
102
- f"pre run case error: please install client for db: {self.config.db}, error={e}",
103
- )
101
+ log.warning(f"pre run case error: please install client for db: {self.config.db}, error={e}")
104
102
  raise e from None
105
103
 
106
104
  def run(self, drop_old: bool = True) -> Metric:
@@ -136,9 +134,7 @@ class CaseRunner(BaseModel):
136
134
  log.warning(f"Failed to run capacity case, reason = {e}")
137
135
  raise e from None
138
136
  else:
139
- log.info(
140
- f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}",
141
- )
137
+ log.info(f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}")
142
138
  return Metric(max_load_count=count)
143
139
 
144
140
  def _run_perf_case(self, drop_old: bool = True) -> Metric:
@@ -147,22 +143,6 @@ class CaseRunner(BaseModel):
147
143
  Returns:
148
144
  Metric: load_duration, recall, serial_latency_p99, and, qps
149
145
  """
150
- """
151
- if drop_old:
152
- _, load_dur = self._load_train_data()
153
- build_dur = self._optimize()
154
- m.load_duration = round(load_dur+build_dur, 4)
155
- log.info(
156
- f"Finish loading the entire dataset into VectorDB,"
157
- f" insert_duration={load_dur}, optimize_duration={build_dur}"
158
- f" load_duration(insert + optimize) = {m.load_duration}"
159
- )
160
-
161
- self._init_search_runner()
162
-
163
- m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
164
- m.recall, m.serial_latency_p99 = self._serial_search()
165
- """
166
146
 
167
147
  log.info("Start performance case")
168
148
  try:
@@ -175,7 +155,7 @@ class CaseRunner(BaseModel):
175
155
  log.info(
176
156
  f"Finish loading the entire dataset into VectorDB,"
177
157
  f" insert_duration={load_dur}, optimize_duration={build_dur}"
178
- f" load_duration(insert + optimize) = {m.load_duration}",
158
+ f" load_duration(insert + optimize) = {m.load_duration}"
179
159
  )
180
160
  else:
181
161
  log.info("Data loading skipped")
@@ -254,13 +234,13 @@ class CaseRunner(BaseModel):
254
234
  self.stop()
255
235
 
256
236
  @utils.time_it
257
- def _task(self) -> None:
237
+ def _optimize_task(self) -> None:
258
238
  with self.db.init():
259
- self.db.optimize_with_size(data_size=self.ca.dataset.data.size)
239
+ self.db.optimize(data_size=self.ca.dataset.data.size)
260
240
 
261
241
  def _optimize(self) -> float:
262
242
  with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
263
- future = executor.submit(self._task)
243
+ future = executor.submit(self._optimize_task)
264
244
  try:
265
245
  return future.result(timeout=self.ca.optimize_timeout)[1]
266
246
  except TimeoutError as e:
@@ -3,7 +3,7 @@ def displayParams(st):
3
3
  """
4
4
  - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
5
5
  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
6
- - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
6
+ - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
7
7
  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
8
8
 
9
9
  - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
@@ -11,3 +11,14 @@ def displayParams(st):
11
11
  - `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
12
12
  """
13
13
  )
14
+ st.caption(
15
+ """We recommend limiting the number of test query vectors, like 1,000.""",
16
+ help="""
17
+ When conducting concurrent query tests, Vdbbench creates a large number of processes.
18
+ To minimize additional communication overhead during testing,
19
+ we prepare a complete set of test queries for each process, allowing them to run independently.\n
20
+ However, this means that as the number of concurrent processes increases,
21
+ the number of copied query vectors also increases significantly,
22
+ which can place substantial pressure on memory resources.
23
+ """,
24
+ )
@@ -1,6 +1,8 @@
1
1
  from datetime import datetime
2
+ from vectordb_bench import config
2
3
  from vectordb_bench.frontend.config import styles
3
4
  from vectordb_bench.interface import benchmark_runner
5
+ from vectordb_bench.models import TaskConfig
4
6
 
5
7
 
6
8
  def submitTask(st, tasks, isAllValid):
@@ -47,16 +49,31 @@ def advancedSettings(st):
47
49
  k = container[0].number_input("k", min_value=1, value=100, label_visibility="collapsed")
48
50
  container[1].caption("K value for number of nearest neighbors to search")
49
51
 
50
- return index_already_exists, use_aliyun, k
52
+ container = st.columns([1, 2])
53
+ defaultconcurrentInput = ",".join(map(str, config.NUM_CONCURRENCY))
54
+ concurrentInput = container[0].text_input(
55
+ "Concurrent Input", value=defaultconcurrentInput, label_visibility="collapsed"
56
+ )
57
+ container[1].caption("num of concurrencies for search tests to get max-qps")
58
+ return index_already_exists, use_aliyun, k, concurrentInput
51
59
 
52
60
 
53
- def controlPanel(st, tasks, taskLabel, isAllValid):
54
- index_already_exists, use_aliyun, k = advancedSettings(st)
61
+ def controlPanel(st, tasks: list[TaskConfig], taskLabel, isAllValid):
62
+ index_already_exists, use_aliyun, k, concurrentInput = advancedSettings(st)
55
63
 
56
64
  def runHandler():
57
65
  benchmark_runner.set_drop_old(not index_already_exists)
66
+
67
+ try:
68
+ concurrentInput_list = [int(item.strip()) for item in concurrentInput.split(",")]
69
+ except ValueError:
70
+ st.write("please input correct number")
71
+ return None
72
+
58
73
  for task in tasks:
59
74
  task.case_config.k = k
75
+ task.case_config.concurrency_search_config.num_concurrency = concurrentInput_list
76
+
60
77
  benchmark_runner.set_download_address(use_aliyun)
61
78
  benchmark_runner.run(tasks, taskLabel)
62
79
 
@@ -1041,6 +1041,26 @@ CaseConfigParamInput_NumCandidates_AliES = CaseConfigInput(
1041
1041
  )
1042
1042
 
1043
1043
 
1044
+ CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput(
1045
+ label=CaseConfigParamType.mongodb_quantization_type,
1046
+ inputType=InputType.Option,
1047
+ inputConfig={
1048
+ "options": ["none", "scalar", "binary"],
1049
+ },
1050
+ )
1051
+
1052
+
1053
+ CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput(
1054
+ label=CaseConfigParamType.mongodb_num_candidates_ratio,
1055
+ inputType=InputType.Number,
1056
+ inputConfig={
1057
+ "min": 10,
1058
+ "max": 20,
1059
+ "value": 10,
1060
+ },
1061
+ )
1062
+
1063
+
1044
1064
  MilvusLoadConfig = [
1045
1065
  CaseConfigParamInput_IndexType,
1046
1066
  CaseConfigParamInput_M,
@@ -1224,6 +1244,14 @@ AliyunElasticsearchPerformanceConfig = [
1224
1244
  CaseConfigParamInput_NumCandidates_AliES,
1225
1245
  ]
1226
1246
 
1247
+ MongoDBLoadingConfig = [
1248
+ CaseConfigParamInput_MongoDBQuantizationType,
1249
+ ]
1250
+ MongoDBPerformanceConfig = [
1251
+ CaseConfigParamInput_MongoDBQuantizationType,
1252
+ CaseConfigParamInput_MongoDBNumCandidatesRatio,
1253
+ ]
1254
+
1227
1255
  CASE_CONFIG_MAP = {
1228
1256
  DB.Milvus: {
1229
1257
  CaseLabel.Load: MilvusLoadConfig,
@@ -1272,4 +1300,8 @@ CASE_CONFIG_MAP = {
1272
1300
  CaseLabel.Load: AliyunOpensearchLoadingConfig,
1273
1301
  CaseLabel.Performance: AliyunOpenSearchPerformanceConfig,
1274
1302
  },
1303
+ DB.MongoDB: {
1304
+ CaseLabel.Load: MongoDBLoadingConfig,
1305
+ CaseLabel.Performance: MongoDBPerformanceConfig,
1306
+ },
1275
1307
  }
@@ -65,9 +65,7 @@ class BenchMarkRunner:
65
65
  log.warning("Empty tasks submitted")
66
66
  return False
67
67
 
68
- log.debug(
69
- f"tasks: {tasks}, task_label: {task_label}, dataset source: {self.dataset_source}",
70
- )
68
+ log.debug(f"tasks: {tasks}, task_label: {task_label}, dataset source: {self.dataset_source}")
71
69
 
72
70
  # Generate run_id
73
71
  run_id = uuid.uuid4().hex
@@ -169,14 +167,13 @@ class BenchMarkRunner:
169
167
  drop_old = TaskStage.DROP_OLD in runner.config.stages
170
168
  if (latest_runner and runner == latest_runner) or not self.drop_old:
171
169
  drop_old = False
170
+ num_cases = running_task.num_cases()
172
171
  try:
173
- log.info(
174
- f"[{idx+1}/{running_task.num_cases()}] start case: {runner.display()}, drop_old={drop_old}",
175
- )
172
+ log.info(f"[{idx+1}/{num_cases}] start case: {runner.display()}, drop_old={drop_old}")
176
173
  case_res.metrics = runner.run(drop_old)
177
174
  log.info(
178
- f"[{idx+1}/{running_task.num_cases()}] finish case: {runner.display()}, "
179
- f"result={case_res.metrics}, label={case_res.label}",
175
+ f"[{idx+1}/{num_cases}] finish case: {runner.display()}, "
176
+ f"result={case_res.metrics}, label={case_res.label}"
180
177
  )
181
178
 
182
179
  # cache the latest succeeded runner
@@ -189,16 +186,12 @@ class BenchMarkRunner:
189
186
  if not drop_old:
190
187
  case_res.metrics.load_duration = cached_load_duration if cached_load_duration else 0.0
191
188
  except (LoadTimeoutError, PerformanceTimeoutError) as e:
192
- log.warning(
193
- f"[{idx+1}/{running_task.num_cases()}] case {runner.display()} failed to run, reason={e}",
194
- )
189
+ log.warning(f"[{idx+1}/{num_cases}] case {runner.display()} failed to run, reason={e}")
195
190
  case_res.label = ResultLabel.OUTOFRANGE
196
191
  continue
197
192
 
198
193
  except Exception as e:
199
- log.warning(
200
- f"[{idx+1}/{running_task.num_cases()}] case {runner.display()} failed to run, reason={e}",
201
- )
194
+ log.warning(f"[{idx+1}/{num_cases}] case {runner.display()} failed to run, reason={e}")
202
195
  traceback.print_exc()
203
196
  case_res.label = ResultLabel.FAILED
204
197
  continue
@@ -217,9 +210,7 @@ class BenchMarkRunner:
217
210
 
218
211
  send_conn.send((SIGNAL.SUCCESS, None))
219
212
  send_conn.close()
220
- log.info(
221
- f"Success to finish task: label={running_task.task_label}, run_id={running_task.run_id}",
222
- )
213
+ log.info(f"Success to finish task: label={running_task.task_label}, run_id={running_task.run_id}")
223
214
 
224
215
  except Exception as e:
225
216
  err_msg = (
@@ -249,8 +240,8 @@ class BenchMarkRunner:
249
240
 
250
241
  def _run_async(self, conn: Connection) -> bool:
251
242
  log.info(
252
- f"task submitted: id={self.running_task.run_id}, {self.running_task.task_label}, ",
253
- f"case number: {len(self.running_task.case_runners)}",
243
+ f"task submitted: id={self.running_task.run_id}, {self.running_task.task_label}, "
244
+ f"case number: {len(self.running_task.case_runners)}"
254
245
  )
255
246
  global global_result_future
256
247
  executor = concurrent.futures.ProcessPoolExecutor(
@@ -1,8 +1,13 @@
1
1
  import logging
2
2
  from logging import config
3
+ from pathlib import Path
3
4
 
4
5
 
5
6
  def init(log_level: str):
7
+ # Create logs directory if it doesn't exist
8
+ log_dir = Path("logs")
9
+ log_dir.mkdir(exist_ok=True)
10
+
6
11
  log_config = {
7
12
  "version": 1,
8
13
  "disable_existing_loggers": False,
@@ -24,15 +29,23 @@ def init(log_level: str):
24
29
  "class": "logging.StreamHandler",
25
30
  "formatter": "default",
26
31
  },
32
+ "file": {
33
+ "class": "logging.handlers.RotatingFileHandler",
34
+ "formatter": "default",
35
+ "filename": "logs/vectordb_bench.log",
36
+ "maxBytes": 10485760, # 10MB
37
+ "backupCount": 5,
38
+ "encoding": "utf8",
39
+ },
27
40
  },
28
41
  "loggers": {
29
42
  "vectordb_bench": {
30
- "handlers": ["console"],
43
+ "handlers": ["console", "file"],
31
44
  "level": log_level,
32
45
  "propagate": False,
33
46
  },
34
47
  "no_color": {
35
- "handlers": ["no_color_console"],
48
+ "handlers": ["no_color_console", "file"],
36
49
  "level": log_level,
37
50
  "propagate": False,
38
51
  },
vectordb_bench/models.py CHANGED
@@ -88,6 +88,10 @@ class CaseConfigParamType(Enum):
88
88
  numSearchThreads = "num_search_threads"
89
89
  maxNumPrefetchDatasets = "max_num_prefetch_datasets"
90
90
 
91
+ # mongodb params
92
+ mongodb_quantization_type = "quantization"
93
+ mongodb_num_candidates_ratio = "num_candidates_ratio"
94
+
91
95
 
92
96
  class CustomizedCase(BaseModel):
93
97
  pass
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: vectordb-bench
3
- Version: 0.0.20
3
+ Version: 0.0.22
4
4
  Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
5
5
  Author-email: XuanYang-cn <xuan.yang@zilliz.com>
6
6
  Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -21,7 +21,7 @@ Requires-Dist: oss2
21
21
  Requires-Dist: psutil
22
22
  Requires-Dist: polars
23
23
  Requires-Dist: plotly
24
- Requires-Dist: environs
24
+ Requires-Dist: environs<14.1.0
25
25
  Requires-Dist: pydantic<v2
26
26
  Requires-Dist: scikit-learn
27
27
  Requires-Dist: pymilvus
@@ -73,6 +73,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
73
73
  Provides-Extra: aliyun-opensearch
74
74
  Requires-Dist: alibabacloud_ha3engine_vector; extra == "aliyun-opensearch"
75
75
  Requires-Dist: alibabacloud_searchengine20211025; extra == "aliyun-opensearch"
76
+ Provides-Extra: mongodb
77
+ Requires-Dist: pymongo; extra == "mongodb"
76
78
 
77
79
  # VectorDBBench: A Benchmark Tool for VectorDB
78
80
 
@@ -89,6 +91,8 @@ Closely mimicking real-world production environments, we've set up diverse testi
89
91
 
90
92
  Prepare to delve into the world of VectorDBBench, and let it guide you in uncovering your perfect vector database match.
91
93
 
94
+ VectorDBBench is sponsored by Zilliz, the leading open-source vectorDB company behind Milvus. Choose smarter with VectorDBBench — start your free test on [Zilliz Cloud](https://zilliz.com/) today!
95
+
92
96
  **Leaderboard:** https://zilliz.com/benchmark
93
97
  ## Quick Start
94
98
  ### Prerequirement
@@ -128,6 +132,7 @@ All the database client supported
128
132
  | chromadb | `pip install vectordb-bench[chromadb]` |
129
133
  | awsopensearch | `pip install vectordb-bench[opensearch]` |
130
134
  | aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
135
+ | mongodb | `pip install vectordb-bench[mongodb]` |
131
136
 
132
137
  ### Run
133
138
 
@@ -228,6 +233,47 @@ Options:
228
233
  with-gt]
229
234
  --help Show this message and exit.
230
235
  ```
236
+
237
+ ### Run awsopensearch from command line
238
+
239
+ ```shell
240
+ vectordbbench awsopensearch --db-label awsopensearch \
241
+ --m 16 --ef-construction 256 \
242
+ --host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \
243
+ --user vector --password '<password>' \
244
+ --case-type Performance1536D5M --num-insert-workers 10 \
245
+ --skip-load --num-concurrency 75
246
+ ```
247
+
248
+ To list the options for awsopensearch, execute `vectordbbench awsopensearch --help`
249
+
250
+ ```text
251
+ $ vectordbbench awsopensearch --help
252
+ Usage: vectordbbench awsopensearch [OPTIONS]
253
+
254
+ Options:
255
+ # Sharding and Replication
256
+ --number-of-shards INTEGER Number of primary shards for the index
257
+ --number-of-replicas INTEGER Number of replica copies for each primary
258
+ shard
259
+ # Indexing Performance
260
+ --index-thread-qty INTEGER Thread count for native engine indexing
261
+ --index-thread-qty-during-force-merge INTEGER
262
+ Thread count during force merge operations
263
+ --number-of-indexing-clients INTEGER
264
+ Number of concurrent indexing clients
265
+ # Index Management
266
+ --number-of-segments INTEGER Target number of segments after merging
267
+ --refresh-interval TEXT How often to make new data available for
268
+ search
269
+ --force-merge-enabled BOOLEAN Whether to perform force merge operation
270
+ --flush-threshold-size TEXT Size threshold for flushing the transaction
271
+ log
272
+ # Memory Management
273
+ --cb-threshold TEXT k-NN Memory circuit breaker threshold
274
+
275
+ --help Show this message and exit.
+ ```
276
+
231
277
  #### Using a configuration file.
232
278
 
233
279
  The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
@@ -394,6 +440,13 @@ We have strict requirements for the data set format, please follow them.
394
440
  - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
395
441
  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
396
442
  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
443
+ - We recommend limiting the number of test query vectors, like 1,000.
444
+ When conducting concurrent query tests, Vdbbench creates a large number of processes.
445
+ To minimize additional communication overhead during testing,
446
+ we prepare a complete set of test queries for each process, allowing them to run independently.
447
+ However, this means that as the number of concurrent processes increases,
448
+ the number of copied query vectors also increases significantly,
449
+ which can place substantial pressure on memory resources.
397
450
  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
398
451
 
399
452
  - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.