vectordb-bench 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/backend/assembler.py +2 -2
- vectordb_bench/backend/clients/__init__.py +28 -2
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +1 -7
- vectordb_bench/backend/clients/alloydb/alloydb.py +1 -4
- vectordb_bench/backend/clients/api.py +8 -15
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +54 -8
- vectordb_bench/backend/clients/aws_opensearch/cli.py +85 -1
- vectordb_bench/backend/clients/aws_opensearch/config.py +10 -0
- vectordb_bench/backend/clients/chroma/chroma.py +1 -4
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +1 -4
- vectordb_bench/backend/clients/memorydb/cli.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +2 -5
- vectordb_bench/backend/clients/milvus/milvus.py +1 -20
- vectordb_bench/backend/clients/mongodb/config.py +53 -0
- vectordb_bench/backend/clients/mongodb/mongodb.py +200 -0
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +1 -4
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +3 -11
- vectordb_bench/backend/clients/pgvector/pgvector.py +2 -7
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +2 -7
- vectordb_bench/backend/clients/pinecone/pinecone.py +1 -4
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +3 -6
- vectordb_bench/backend/clients/redis/redis.py +1 -4
- vectordb_bench/backend/clients/test/cli.py +1 -1
- vectordb_bench/backend/clients/test/test.py +1 -4
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +1 -4
- vectordb_bench/backend/data_source.py +4 -12
- vectordb_bench/backend/runner/mp_runner.py +16 -34
- vectordb_bench/backend/runner/rate_runner.py +4 -4
- vectordb_bench/backend/runner/read_write_runner.py +11 -15
- vectordb_bench/backend/runner/serial_runner.py +20 -28
- vectordb_bench/backend/task_runner.py +6 -26
- vectordb_bench/frontend/components/custom/displaypPrams.py +12 -1
- vectordb_bench/frontend/components/run_test/submitTask.py +20 -3
- vectordb_bench/frontend/config/dbCaseConfigs.py +32 -0
- vectordb_bench/interface.py +10 -19
- vectordb_bench/log_util.py +15 -2
- vectordb_bench/models.py +4 -0
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/METADATA +55 -2
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/RECORD +43 -41
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/WHEEL +0 -0
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/runner/serial_runner.py
CHANGED
@@ -40,9 +40,7 @@ class SerialInsertRunner:
     def task(self) -> int:
         count = 0
         with self.db.init():
-            log.info(
-                f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}",
-            )
+            log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
             start = time.perf_counter()
             for data_df in self.dataset:
                 all_metadata = data_df["id"].tolist()
@@ -66,13 +64,11 @@ class SerialInsertRunner:
                 assert insert_count == len(all_metadata)
                 count += insert_count
                 if count % 100_000 == 0:
-                    log.info(
-                        f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB",
-                    )
+                    log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
 
             log.info(
-                f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, "
-                f"dur={time.perf_counter()-start}"
+                f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, "
+                f"dur={time.perf_counter() - start}"
             )
             return count
 
@@ -83,8 +79,8 @@ class SerialInsertRunner:
 
         num_batches = math.ceil(len(all_embeddings) / NUM_PER_BATCH)
         log.info(
-            f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} "
-            f"embeddings in batch {NUM_PER_BATCH}"
+            f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} "
+            f"embeddings in batch {NUM_PER_BATCH}"
         )
         count = 0
         for batch_id in range(num_batches):
@@ -94,8 +90,8 @@ class SerialInsertRunner:
             embeddings = all_embeddings[batch_id * NUM_PER_BATCH : (batch_id + 1) * NUM_PER_BATCH]
 
             log.debug(
-                f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
-                f"Start inserting {len(metadata)} embeddings"
+                f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                f"Start inserting {len(metadata)} embeddings"
             )
             while retry_count < LOAD_MAX_TRY_COUNT:
                 insert_count, error = self.db.insert_embeddings(
@@ -113,15 +109,15 @@ class SerialInsertRunner:
                 else:
                     break
             log.debug(
-                f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
-                f"Finish inserting {len(metadata)} embeddings"
+                f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_batches}], "
+                f"Finish inserting {len(metadata)} embeddings"
             )
 
             assert already_insert_count == len(metadata)
             count += already_insert_count
         log.info(
-            f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in "
-            f"batch {NUM_PER_BATCH}"
+            f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in "
+            f"batch {NUM_PER_BATCH}"
         )
         return count
 
@@ -160,8 +156,6 @@ class SerialInsertRunner:
         start_time = time.perf_counter()
         max_load_count, times = 0, 0
         try:
-            with self.db.init():
-                self.db.ready_to_load()
             while time.perf_counter() - start_time < self.timeout:
                 count = self.endless_insert_data(
                     all_embeddings,
@@ -171,13 +165,13 @@ class SerialInsertRunner:
                 max_load_count += count
                 times += 1
                 log.info(
-                    f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, "
-                    f"{max_load_count}"
+                    f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, "
+                    f"{max_load_count}"
                 )
         except Exception as e:
             log.info(
-                f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, "
-                f"{max_load_count}, err={e}"
+                f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, "
+                f"{max_load_count}, err={e}"
             )
             traceback.print_exc()
             return max_load_count
@@ -209,9 +203,7 @@ class SerialSearchRunner:
         self.ground_truth = ground_truth
 
     def search(self, args: tuple[list, pd.DataFrame]) -> tuple[float, float, float]:
-        log.info(
-            f"{mp.current_process().name:14} start search the entire test_data to get recall and latency",
-        )
+        log.info(f"{mp.current_process().name:14} start search the entire test_data to get recall and latency")
         with self.db.init():
             test_data, ground_truth = args
             ideal_dcg = get_ideal_dcg(self.k)
@@ -242,8 +234,8 @@ class SerialSearchRunner:
 
                 if len(latencies) % 100 == 0:
                     log.debug(
-                        f"({mp.current_process().name:14}) search_count={len(latencies):3}, "
-                        f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}"
+                        f"({mp.current_process().name:14}) search_count={len(latencies):3}, "
+                        f"latest_latency={latencies[-1]}, latest recall={recalls[-1]}"
                     )
 
         avg_latency = round(np.mean(latencies), 4)
@@ -258,7 +250,7 @@ class SerialSearchRunner:
             f"avg_recall={avg_recall}, "
             f"avg_ndcg={avg_ndcg},"
            f"avg_latency={avg_latency}, "
-            f"p99={p99}"
+            f"p99={p99}"
         )
         return (avg_recall, avg_ndcg, p99)
 
vectordb_bench/backend/task_runner.py
CHANGED
@@ -98,9 +98,7 @@ class CaseRunner(BaseModel):
             self.init_db(drop_old)
             self.ca.dataset.prepare(self.dataset_source, filters=self.ca.filter_rate)
         except ModuleNotFoundError as e:
-            log.warning(
-                f"pre run case error: please install client for db: {self.config.db}, error={e}",
-            )
+            log.warning(f"pre run case error: please install client for db: {self.config.db}, error={e}")
             raise e from None
 
     def run(self, drop_old: bool = True) -> Metric:
@@ -136,9 +134,7 @@ class CaseRunner(BaseModel):
             log.warning(f"Failed to run capacity case, reason = {e}")
             raise e from None
         else:
-            log.info(
-                f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}",
-            )
+            log.info(f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}")
             return Metric(max_load_count=count)
 
     def _run_perf_case(self, drop_old: bool = True) -> Metric:
@@ -147,22 +143,6 @@ class CaseRunner(BaseModel):
         Returns:
             Metric: load_duration, recall, serial_latency_p99, and, qps
         """
-        """
-        if drop_old:
-            _, load_dur = self._load_train_data()
-            build_dur = self._optimize()
-            m.load_duration = round(load_dur+build_dur, 4)
-            log.info(
-                f"Finish loading the entire dataset into VectorDB,"
-                f" insert_duration={load_dur}, optimize_duration={build_dur}"
-                f" load_duration(insert + optimize) = {m.load_duration}"
-            )
-
-            self._init_search_runner()
-
-            m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
-            m.recall, m.serial_latency_p99 = self._serial_search()
-        """
 
         log.info("Start performance case")
         try:
@@ -175,7 +155,7 @@ class CaseRunner(BaseModel):
                 log.info(
                     f"Finish loading the entire dataset into VectorDB,"
                     f" insert_duration={load_dur}, optimize_duration={build_dur}"
-                    f" load_duration(insert + optimize) = {m.load_duration}"
+                    f" load_duration(insert + optimize) = {m.load_duration}"
                 )
             else:
                 log.info("Data loading skipped")
@@ -254,13 +234,13 @@ class CaseRunner(BaseModel):
         self.stop()
 
     @utils.time_it
-    def
+    def _optimize_task(self) -> None:
         with self.db.init():
-            self.db.
+            self.db.optimize(data_size=self.ca.dataset.data.size)
 
     def _optimize(self) -> float:
         with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
-            future = executor.submit(self.
+            future = executor.submit(self._optimize_task)
             try:
                 return future.result(timeout=self.ca.optimize_timeout)[1]
             except TimeoutError as e:
vectordb_bench/frontend/components/custom/displaypPrams.py
CHANGED
@@ -3,7 +3,7 @@ def displayParams(st):
         """
 - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
   - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
-  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
   - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
 
 - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
@@ -11,3 +11,14 @@ def displayParams(st):
 - `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
     """
     )
+    st.caption(
+        """We recommend limiting the number of test query vectors, like 1,000.""",
+        help="""
+        When conducting concurrent query tests, Vdbbench creates a large number of processes.
+        To minimize additional communication overhead during testing,
+        we prepare a complete set of test queries for each process, allowing them to run independently.\n
+        However, this means that as the number of concurrent processes increases,
+        the number of copied query vectors also increases significantly,
+        which can place substantial pressure on memory resources.
+        """,
+    )
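The dataset layout described in this help text can be produced with pandas plus a parquet engine such as pyarrow. The following is an illustrative sketch with toy sizes and random vectors, not code from the package; the file and column names follow the docstring above, and the `neighbors_id` values here are random placeholders (real ground truth must contain the actual nearest-neighbor ids):

```python
import numpy as np
import pandas as pd  # to_parquet needs a parquet engine, e.g. pyarrow

dim, n_train, n_test, k = 768, 10_000, 1_000, 100
rng = np.random.default_rng(0)

# train.parquet: incrementing int `id`, `emb` as an array of float32 per row
train = pd.DataFrame({
    "id": np.arange(n_train),
    "emb": [rng.random(dim, dtype=np.float32) for _ in range(n_train)],
})
train.to_parquet("train.parquet", index=False)

# test.parquet: same schema; keep it small, as the caption above recommends (~1,000 queries)
test = pd.DataFrame({
    "id": np.arange(n_test),
    "emb": [rng.random(dim, dtype=np.float32) for _ in range(n_test)],
})
test.to_parquet("test.parquet", index=False)

# neighbors.parquet: `id` of each query and `neighbors_id`, the ids of its true top-k neighbors
neighbors = pd.DataFrame({
    "id": test["id"],
    "neighbors_id": [rng.choice(n_train, size=k, replace=False) for _ in range(n_test)],
})
neighbors.to_parquet("neighbors.parquet", index=False)
```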
vectordb_bench/frontend/components/run_test/submitTask.py
CHANGED
@@ -1,6 +1,8 @@
 from datetime import datetime
+from vectordb_bench import config
 from vectordb_bench.frontend.config import styles
 from vectordb_bench.interface import benchmark_runner
+from vectordb_bench.models import TaskConfig
 
 
 def submitTask(st, tasks, isAllValid):
@@ -47,16 +49,31 @@ def advancedSettings(st):
     k = container[0].number_input("k", min_value=1, value=100, label_visibility="collapsed")
     container[1].caption("K value for number of nearest neighbors to search")
 
-
+    container = st.columns([1, 2])
+    defaultconcurrentInput = ",".join(map(str, config.NUM_CONCURRENCY))
+    concurrentInput = container[0].text_input(
+        "Concurrent Input", value=defaultconcurrentInput, label_visibility="collapsed"
+    )
+    container[1].caption("num of concurrencies for search tests to get max-qps")
+    return index_already_exists, use_aliyun, k, concurrentInput
 
 
-def controlPanel(st, tasks, taskLabel, isAllValid):
-    index_already_exists, use_aliyun, k = advancedSettings(st)
+def controlPanel(st, tasks: list[TaskConfig], taskLabel, isAllValid):
+    index_already_exists, use_aliyun, k, concurrentInput = advancedSettings(st)
 
     def runHandler():
         benchmark_runner.set_drop_old(not index_already_exists)
+
+        try:
+            concurrentInput_list = [int(item.strip()) for item in concurrentInput.split(",")]
+        except ValueError:
+            st.write("please input correct number")
+            return None
+
         for task in tasks:
             task.case_config.k = k
+            task.case_config.concurrency_search_config.num_concurrency = concurrentInput_list
+
         benchmark_runner.set_download_address(use_aliyun)
         benchmark_runner.run(tasks, taskLabel)
 
vectordb_bench/frontend/config/dbCaseConfigs.py
CHANGED
@@ -1041,6 +1041,26 @@ CaseConfigParamInput_NumCandidates_AliES = CaseConfigInput(
 )
 
 
+CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput(
+    label=CaseConfigParamType.mongodb_quantization_type,
+    inputType=InputType.Option,
+    inputConfig={
+        "options": ["none", "scalar", "binary"],
+    },
+)
+
+
+CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput(
+    label=CaseConfigParamType.mongodb_num_candidates_ratio,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 10,
+        "max": 20,
+        "value": 10,
+    },
+)
+
+
 MilvusLoadConfig = [
     CaseConfigParamInput_IndexType,
     CaseConfigParamInput_M,
@@ -1224,6 +1244,14 @@ AliyunElasticsearchPerformanceConfig = [
     CaseConfigParamInput_NumCandidates_AliES,
 ]
 
+MongoDBLoadingConfig = [
+    CaseConfigParamInput_MongoDBQuantizationType,
+]
+MongoDBPerformanceConfig = [
+    CaseConfigParamInput_MongoDBQuantizationType,
+    CaseConfigParamInput_MongoDBNumCandidatesRatio,
+]
+
 CASE_CONFIG_MAP = {
     DB.Milvus: {
         CaseLabel.Load: MilvusLoadConfig,
@@ -1272,4 +1300,8 @@ CASE_CONFIG_MAP = {
         CaseLabel.Load: AliyunOpensearchLoadingConfig,
         CaseLabel.Performance: AliyunOpenSearchPerformanceConfig,
     },
+    DB.MongoDB: {
+        CaseLabel.Load: MongoDBLoadingConfig,
+        CaseLabel.Performance: MongoDBPerformanceConfig,
+    },
 }
vectordb_bench/interface.py
CHANGED
@@ -65,9 +65,7 @@ class BenchMarkRunner:
             log.warning("Empty tasks submitted")
             return False
 
-        log.debug(
-            f"tasks: {tasks}, task_label: {task_label}, dataset source: {self.dataset_source}",
-        )
+        log.debug(f"tasks: {tasks}, task_label: {task_label}, dataset source: {self.dataset_source}")
 
         # Generate run_id
         run_id = uuid.uuid4().hex
@@ -169,14 +167,13 @@ class BenchMarkRunner:
             drop_old = TaskStage.DROP_OLD in runner.config.stages
             if (latest_runner and runner == latest_runner) or not self.drop_old:
                 drop_old = False
+            num_cases = running_task.num_cases()
             try:
-                log.info(
-                    f"[{idx+1}/{running_task.num_cases()}] start case: {runner.display()}, drop_old={drop_old}",
-                )
+                log.info(f"[{idx+1}/{num_cases}] start case: {runner.display()}, drop_old={drop_old}")
                 case_res.metrics = runner.run(drop_old)
                 log.info(
-                    f"[{idx+1}/{
-                    f"result={case_res.metrics}, label={case_res.label}"
+                    f"[{idx+1}/{num_cases}] finish case: {runner.display()}, "
+                    f"result={case_res.metrics}, label={case_res.label}"
                 )
 
                 # cache the latest succeeded runner
@@ -189,16 +186,12 @@ class BenchMarkRunner:
                 if not drop_old:
                     case_res.metrics.load_duration = cached_load_duration if cached_load_duration else 0.0
             except (LoadTimeoutError, PerformanceTimeoutError) as e:
-                log.warning(
-                    f"[{idx+1}/{running_task.num_cases()}] case {runner.display()} failed to run, reason={e}",
-                )
+                log.warning(f"[{idx+1}/{num_cases}] case {runner.display()} failed to run, reason={e}")
                 case_res.label = ResultLabel.OUTOFRANGE
                 continue
 
             except Exception as e:
-                log.warning(
-                    f"[{idx+1}/{running_task.num_cases()}] case {runner.display()} failed to run, reason={e}",
-                )
+                log.warning(f"[{idx+1}/{num_cases}] case {runner.display()} failed to run, reason={e}")
                 traceback.print_exc()
                 case_res.label = ResultLabel.FAILED
                 continue
@@ -217,9 +210,7 @@ class BenchMarkRunner:
 
             send_conn.send((SIGNAL.SUCCESS, None))
             send_conn.close()
-            log.info(
-                f"Success to finish task: label={running_task.task_label}, run_id={running_task.run_id}",
-            )
+            log.info(f"Success to finish task: label={running_task.task_label}, run_id={running_task.run_id}")
 
         except Exception as e:
             err_msg = (
@@ -249,8 +240,8 @@ class BenchMarkRunner:
 
     def _run_async(self, conn: Connection) -> bool:
         log.info(
-            f"task submitted: id={self.running_task.run_id}, {self.running_task.task_label}, "
-            f"case number: {len(self.running_task.case_runners)}"
+            f"task submitted: id={self.running_task.run_id}, {self.running_task.task_label}, "
+            f"case number: {len(self.running_task.case_runners)}"
         )
         global global_result_future
         executor = concurrent.futures.ProcessPoolExecutor(
vectordb_bench/log_util.py
CHANGED
@@ -1,8 +1,13 @@
 import logging
 from logging import config
+from pathlib import Path
 
 
 def init(log_level: str):
+    # Create logs directory if it doesn't exist
+    log_dir = Path("logs")
+    log_dir.mkdir(exist_ok=True)
+
     log_config = {
         "version": 1,
         "disable_existing_loggers": False,
@@ -24,15 +29,23 @@ def init(log_level: str):
                 "class": "logging.StreamHandler",
                 "formatter": "default",
             },
+            "file": {
+                "class": "logging.handlers.RotatingFileHandler",
+                "formatter": "default",
+                "filename": "logs/vectordb_bench.log",
+                "maxBytes": 10485760,  # 10MB
+                "backupCount": 5,
+                "encoding": "utf8",
+            },
         },
         "loggers": {
             "vectordb_bench": {
-                "handlers": ["console"],
+                "handlers": ["console", "file"],
                 "level": log_level,
                 "propagate": False,
             },
             "no_color": {
-                "handlers": ["no_color_console"],
+                "handlers": ["no_color_console", "file"],
                 "level": log_level,
                 "propagate": False,
             },
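The log_util.py change routes both loggers to a size-rotated log file alongside the existing console handlers. For anyone who wants the same behaviour outside the benchmark, here is a minimal standalone sketch of that dictConfig pattern; the logger name, format string, and paths are illustrative rather than the package's own:

```python
import logging
import logging.config
from pathlib import Path

Path("logs").mkdir(exist_ok=True)  # RotatingFileHandler does not create the directory itself

logging.config.dictConfig({
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {"default": {"format": "%(asctime)s | %(levelname)s | %(message)s"}},
    "handlers": {
        "console": {"class": "logging.StreamHandler", "formatter": "default"},
        "file": {
            "class": "logging.handlers.RotatingFileHandler",
            "formatter": "default",
            "filename": "logs/example.log",
            "maxBytes": 10 * 1024 * 1024,  # rotate at ~10MB
            "backupCount": 5,              # keep example.log.1 ... example.log.5
            "encoding": "utf8",
        },
    },
    "loggers": {"example": {"handlers": ["console", "file"], "level": "INFO", "propagate": False}},
})

logging.getLogger("example").info("goes to the console and to logs/example.log")
```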
vectordb_bench/models.py
CHANGED
@@ -88,6 +88,10 @@ class CaseConfigParamType(Enum):
     numSearchThreads = "num_search_threads"
     maxNumPrefetchDatasets = "max_num_prefetch_datasets"
 
+    # mongodb params
+    mongodb_quantization_type = "quantization"
+    mongodb_num_candidates_ratio = "num_candidates_ratio"
+
 
 class CustomizedCase(BaseModel):
     pass
{vectordb_bench-0.0.20.dist-info → vectordb_bench-0.0.22.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: vectordb-bench
-Version: 0.0.20
+Version: 0.0.22
 Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
 Author-email: XuanYang-cn <xuan.yang@zilliz.com>
 Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -21,7 +21,7 @@ Requires-Dist: oss2
 Requires-Dist: psutil
 Requires-Dist: polars
 Requires-Dist: plotly
-Requires-Dist: environs
+Requires-Dist: environs<14.1.0
 Requires-Dist: pydantic<v2
 Requires-Dist: scikit-learn
 Requires-Dist: pymilvus
@@ -73,6 +73,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
 Provides-Extra: aliyun-opensearch
 Requires-Dist: alibabacloud_ha3engine_vector; extra == "aliyun-opensearch"
 Requires-Dist: alibabacloud_searchengine20211025; extra == "aliyun-opensearch"
+Provides-Extra: mongodb
+Requires-Dist: pymongo; extra == "mongodb"
 
 # VectorDBBench: A Benchmark Tool for VectorDB
 
@@ -89,6 +91,8 @@ Closely mimicking real-world production environments, we've set up diverse testi
 
 Prepare to delve into the world of VectorDBBench, and let it guide you in uncovering your perfect vector database match.
 
+VectorDBBench is sponsered by Zilliz,the leading opensource vectorDB company behind Milvus. Choose smarter with VectorDBBench- start your free test on [zilliz cloud](https://zilliz.com/) today!
+
 **Leaderboard:** https://zilliz.com/benchmark
 ## Quick Start
 ### Prerequirement
@@ -128,6 +132,7 @@ All the database client supported
 | chromadb | `pip install vectordb-bench[chromadb]` |
 | awsopensearch | `pip install vectordb-bench[opensearch]` |
 | aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
+| mongodb | `pip install vectordb-bench[mongodb]` |
 
 ### Run
 
@@ -228,6 +233,47 @@ Options:
               with-gt]
   --help      Show this message and exit.
 ```
+
+### Run awsopensearch from command line
+
+```shell
+vectordbbench awsopensearch --db-label awsopensearch \
+ --m 16 --ef-construction 256 \
+ --host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \
+ --user vector --password '<password>' \
+ --case-type Performance1536D5M --num-insert-workers 10 \
+ --skip-load --num-concurrency 75
+```
+
+To list the options for awsopensearch, execute `vectordbbench awsopensearch --help`
+
+```text
+$ vectordbbench awsopensearch --help
+Usage: vectordbbench awsopensearch [OPTIONS]
+
+Options:
+  # Sharding and Replication
+  --number-of-shards INTEGER      Number of primary shards for the index
+  --number-of-replicas INTEGER    Number of replica copies for each primary
+                                  shard
+  # Indexing Performance
+  --index-thread-qty INTEGER      Thread count for native engine indexing
+  --index-thread-qty-during-force-merge INTEGER
+                                  Thread count during force merge operations
+  --number-of-indexing-clients INTEGER
+                                  Number of concurrent indexing clients
+  # Index Management
+  --number-of-segments INTEGER    Target number of segments after merging
+  --refresh-interval TEXT         How often to make new data available for
+                                  search
+  --force-merge-enabled BOOLEAN   Whether to perform force merge operation
+  --flush-threshold-size TEXT     Size threshold for flushing the transaction
+                                  log
+  # Memory Management
+  --cb-threshold TEXT             k-NN Memory circuit breaker threshold
+
+  --help                          Show this message and exit.```
+
 #### Using a configuration file.
 
 The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
@@ -394,6 +440,13 @@ We have strict requirements for the data set format, please follow them.
 - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
   - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
   - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+    - We recommend limiting the number of test query vectors, like 1,000.
+      When conducting concurrent query tests, Vdbbench creates a large number of processes.
+      To minimize additional communication overhead during testing,
+      we prepare a complete set of test queries for each process, allowing them to run independently.
+      However, this means that as the number of concurrent processes increases,
+      the number of copied query vectors also increases significantly,
+      which can place substantial pressure on memory resources.
   - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
 
 - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
|