vectordb-bench 0.0.19__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +55 -45
- vectordb_bench/backend/clients/__init__.py +75 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +111 -70
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
- vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
- vectordb_bench/backend/clients/alloydb/cli.py +51 -34
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +5 -9
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +38 -36
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
- vectordb_bench/backend/clients/milvus/cli.py +41 -83
- vectordb_bench/backend/clients/milvus/config.py +18 -8
- vectordb_bench/backend/clients/milvus/milvus.py +18 -19
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +94 -58
- vectordb_bench/backend/clients/test/cli.py +1 -2
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +4 -5
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +30 -18
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +85 -34
- vectordb_bench/backend/runner/rate_runner.py +30 -19
- vectordb_bench/backend/runner/read_write_runner.py +51 -23
- vectordb_bench/backend/runner/serial_runner.py +91 -48
- vectordb_bench/backend/runner/util.py +4 -3
- vectordb_bench/backend/task_runner.py +92 -72
- vectordb_bench/backend/utils.py +17 -10
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +56 -26
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +22 -15
- vectordb_bench-0.0.20.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.19.dist-info/RECORD +0 -135
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
@@ -1,24 +1,20 @@
|
|
1
|
+
import concurrent
|
1
2
|
import logging
|
2
|
-
import psutil
|
3
3
|
import traceback
|
4
|
-
import concurrent
|
5
|
-
import numpy as np
|
6
4
|
from enum import Enum, auto
|
7
5
|
|
8
|
-
|
9
|
-
|
10
|
-
from ..base import BaseModel
|
11
|
-
from ..models import TaskConfig, PerformanceTimeoutError, TaskStage
|
6
|
+
import numpy as np
|
7
|
+
import psutil
|
12
8
|
|
13
|
-
from .
|
14
|
-
|
15
|
-
|
16
|
-
)
|
17
|
-
from ..metric import Metric
|
18
|
-
from .runner import MultiProcessingSearchRunner
|
19
|
-
from .runner import SerialSearchRunner, SerialInsertRunner
|
20
|
-
from .data_source import DatasetSource
|
9
|
+
from vectordb_bench.base import BaseModel
|
10
|
+
from vectordb_bench.metric import Metric
|
11
|
+
from vectordb_bench.models import PerformanceTimeoutError, TaskConfig, TaskStage
|
21
12
|
|
13
|
+
from . import utils
|
14
|
+
from .cases import Case, CaseLabel
|
15
|
+
from .clients import MetricType, api
|
16
|
+
from .data_source import DatasetSource
|
17
|
+
from .runner import MultiProcessingSearchRunner, SerialInsertRunner, SerialSearchRunner
|
22
18
|
|
23
19
|
log = logging.getLogger(__name__)
|
24
20
|
|
@@ -53,24 +49,39 @@ class CaseRunner(BaseModel):
|
|
53
49
|
search_runner: MultiProcessingSearchRunner | None = None
|
54
50
|
final_search_runner: MultiProcessingSearchRunner | None = None
|
55
51
|
|
56
|
-
def __eq__(self, obj):
|
52
|
+
def __eq__(self, obj: any):
|
57
53
|
if isinstance(obj, CaseRunner):
|
58
|
-
return
|
59
|
-
self.
|
60
|
-
self.config.
|
61
|
-
self.
|
54
|
+
return (
|
55
|
+
self.ca.label == CaseLabel.Performance
|
56
|
+
and self.config.db == obj.config.db
|
57
|
+
and self.config.db_case_config == obj.config.db_case_config
|
58
|
+
and self.ca.dataset == obj.ca.dataset
|
59
|
+
)
|
62
60
|
return False
|
63
61
|
|
64
62
|
def display(self) -> dict:
|
65
|
-
c_dict = self.ca.dict(
|
66
|
-
|
63
|
+
c_dict = self.ca.dict(
|
64
|
+
include={
|
65
|
+
"label": True,
|
66
|
+
"filters": True,
|
67
|
+
"dataset": {
|
68
|
+
"data": {
|
69
|
+
"name": True,
|
70
|
+
"size": True,
|
71
|
+
"dim": True,
|
72
|
+
"metric_type": True,
|
73
|
+
"label": True,
|
74
|
+
},
|
75
|
+
},
|
76
|
+
},
|
77
|
+
)
|
78
|
+
c_dict["db"] = self.config.db_name
|
67
79
|
return c_dict
|
68
80
|
|
69
81
|
@property
|
70
82
|
def normalize(self) -> bool:
|
71
83
|
assert self.db
|
72
|
-
return self.db.need_normalize_cosine() and
|
73
|
-
self.ca.dataset.data.metric_type == MetricType.COSINE
|
84
|
+
return self.db.need_normalize_cosine() and self.ca.dataset.data.metric_type == MetricType.COSINE
|
74
85
|
|
75
86
|
def init_db(self, drop_old: bool = True) -> None:
|
76
87
|
db_cls = self.config.db.init_cls
|
@@ -80,8 +91,7 @@ class CaseRunner(BaseModel):
|
|
80
91
|
db_config=self.config.db_config.to_dict(),
|
81
92
|
db_case_config=self.config.db_case_config,
|
82
93
|
drop_old=drop_old,
|
83
|
-
)
|
84
|
-
|
94
|
+
)
|
85
95
|
|
86
96
|
def _pre_run(self, drop_old: bool = True):
|
87
97
|
try:
|
@@ -89,12 +99,9 @@ class CaseRunner(BaseModel):
|
|
89
99
|
self.ca.dataset.prepare(self.dataset_source, filters=self.ca.filter_rate)
|
90
100
|
except ModuleNotFoundError as e:
|
91
101
|
log.warning(
|
92
|
-
f"pre run case error: please install client for db: {self.config.db}, error={e}"
|
102
|
+
f"pre run case error: please install client for db: {self.config.db}, error={e}",
|
93
103
|
)
|
94
104
|
raise e from None
|
95
|
-
except Exception as e:
|
96
|
-
log.warning(f"pre run case error: {e}")
|
97
|
-
raise e from None
|
98
105
|
|
99
106
|
def run(self, drop_old: bool = True) -> Metric:
|
100
107
|
log.info("Starting run")
|
@@ -103,12 +110,11 @@ class CaseRunner(BaseModel):
|
|
103
110
|
|
104
111
|
if self.ca.label == CaseLabel.Load:
|
105
112
|
return self._run_capacity_case()
|
106
|
-
|
113
|
+
if self.ca.label == CaseLabel.Performance:
|
107
114
|
return self._run_perf_case(drop_old)
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
raise ValueError(msg)
|
115
|
+
msg = f"unknown case type: {self.ca.label}"
|
116
|
+
log.warning(msg)
|
117
|
+
raise ValueError(msg)
|
112
118
|
|
113
119
|
def _run_capacity_case(self) -> Metric:
|
114
120
|
"""run capacity cases
|
@@ -120,7 +126,10 @@ class CaseRunner(BaseModel):
|
|
120
126
|
log.info("Start capacity case")
|
121
127
|
try:
|
122
128
|
runner = SerialInsertRunner(
|
123
|
-
self.db,
|
129
|
+
self.db,
|
130
|
+
self.ca.dataset,
|
131
|
+
self.normalize,
|
132
|
+
self.ca.load_timeout,
|
124
133
|
)
|
125
134
|
count = runner.run_endlessness()
|
126
135
|
except Exception as e:
|
@@ -128,7 +137,7 @@ class CaseRunner(BaseModel):
|
|
128
137
|
raise e from None
|
129
138
|
else:
|
130
139
|
log.info(
|
131
|
-
f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}"
|
140
|
+
f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}",
|
132
141
|
)
|
133
142
|
return Metric(max_load_count=count)
|
134
143
|
|
@@ -138,7 +147,7 @@ class CaseRunner(BaseModel):
|
|
138
147
|
Returns:
|
139
148
|
Metric: load_duration, recall, serial_latency_p99, and, qps
|
140
149
|
"""
|
141
|
-
|
150
|
+
"""
|
142
151
|
if drop_old:
|
143
152
|
_, load_dur = self._load_train_data()
|
144
153
|
build_dur = self._optimize()
|
@@ -153,38 +162,40 @@ class CaseRunner(BaseModel):
|
|
153
162
|
|
154
163
|
m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
|
155
164
|
m.recall, m.serial_latency_p99 = self._serial_search()
|
156
|
-
|
165
|
+
"""
|
157
166
|
|
158
167
|
log.info("Start performance case")
|
159
168
|
try:
|
160
169
|
m = Metric()
|
161
170
|
if drop_old:
|
162
171
|
if TaskStage.LOAD in self.config.stages:
|
163
|
-
# self._load_train_data()
|
164
172
|
_, load_dur = self._load_train_data()
|
165
173
|
build_dur = self._optimize()
|
166
174
|
m.load_duration = round(load_dur + build_dur, 4)
|
167
175
|
log.info(
|
168
176
|
f"Finish loading the entire dataset into VectorDB,"
|
169
177
|
f" insert_duration={load_dur}, optimize_duration={build_dur}"
|
170
|
-
f" load_duration(insert + optimize) = {m.load_duration}"
|
178
|
+
f" load_duration(insert + optimize) = {m.load_duration}",
|
171
179
|
)
|
172
180
|
else:
|
173
181
|
log.info("Data loading skipped")
|
174
|
-
if
|
175
|
-
TaskStage.SEARCH_SERIAL in self.config.stages
|
176
|
-
or TaskStage.SEARCH_CONCURRENT in self.config.stages
|
177
|
-
):
|
182
|
+
if TaskStage.SEARCH_SERIAL in self.config.stages or TaskStage.SEARCH_CONCURRENT in self.config.stages:
|
178
183
|
self._init_search_runner()
|
179
184
|
if TaskStage.SEARCH_CONCURRENT in self.config.stages:
|
180
185
|
search_results = self._conc_search()
|
181
|
-
|
186
|
+
(
|
187
|
+
m.qps,
|
188
|
+
m.conc_num_list,
|
189
|
+
m.conc_qps_list,
|
190
|
+
m.conc_latency_p99_list,
|
191
|
+
m.conc_latency_avg_list,
|
192
|
+
) = search_results
|
182
193
|
if TaskStage.SEARCH_SERIAL in self.config.stages:
|
183
194
|
search_results = self._serial_search()
|
184
|
-
|
195
|
+
"""
|
185
196
|
m.recall = search_results.recall
|
186
197
|
m.serial_latencies = search_results.serial_latencies
|
187
|
-
|
198
|
+
"""
|
188
199
|
m.recall, m.ndcg, m.serial_latency_p99 = search_results
|
189
200
|
|
190
201
|
except Exception as e:
|
@@ -199,7 +210,12 @@ class CaseRunner(BaseModel):
|
|
199
210
|
def _load_train_data(self):
|
200
211
|
"""Insert train data and get the insert_duration"""
|
201
212
|
try:
|
202
|
-
runner = SerialInsertRunner(
|
213
|
+
runner = SerialInsertRunner(
|
214
|
+
self.db,
|
215
|
+
self.ca.dataset,
|
216
|
+
self.normalize,
|
217
|
+
self.ca.load_timeout,
|
218
|
+
)
|
203
219
|
runner.run()
|
204
220
|
except Exception as e:
|
205
221
|
raise e from None
|
@@ -215,11 +231,12 @@ class CaseRunner(BaseModel):
|
|
215
231
|
"""
|
216
232
|
try:
|
217
233
|
results, _ = self.serial_search_runner.run()
|
218
|
-
return results
|
219
234
|
except Exception as e:
|
220
|
-
log.warning(f"search error: {
|
235
|
+
log.warning(f"search error: {e!s}, {e}")
|
221
236
|
self.stop()
|
222
|
-
raise e from
|
237
|
+
raise e from e
|
238
|
+
else:
|
239
|
+
return results
|
223
240
|
|
224
241
|
def _conc_search(self):
|
225
242
|
"""Performance concurrency tests, search the test data endlessness
|
@@ -231,7 +248,7 @@ class CaseRunner(BaseModel):
|
|
231
248
|
try:
|
232
249
|
return self.search_runner.run()
|
233
250
|
except Exception as e:
|
234
|
-
log.warning(f"search error: {
|
251
|
+
log.warning(f"search error: {e!s}, {e}")
|
235
252
|
raise e from None
|
236
253
|
finally:
|
237
254
|
self.stop()
|
@@ -250,7 +267,7 @@ class CaseRunner(BaseModel):
|
|
250
267
|
log.warning(f"VectorDB optimize timeout in {self.ca.optimize_timeout}")
|
251
268
|
for pid, _ in executor._processes.items():
|
252
269
|
psutil.Process(pid).kill()
|
253
|
-
raise PerformanceTimeoutError
|
270
|
+
raise PerformanceTimeoutError from e
|
254
271
|
except Exception as e:
|
255
272
|
log.warning(f"VectorDB optimize error: {e}")
|
256
273
|
raise e from None
|
@@ -286,6 +303,16 @@ class CaseRunner(BaseModel):
|
|
286
303
|
self.search_runner.stop()
|
287
304
|
|
288
305
|
|
306
|
+
DATA_FORMAT = " %-14s | %-12s %-20s %7s | %-10s"
|
307
|
+
TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
|
308
|
+
"DB",
|
309
|
+
"CaseType",
|
310
|
+
"Dataset",
|
311
|
+
"Filter",
|
312
|
+
"task_label",
|
313
|
+
)
|
314
|
+
|
315
|
+
|
289
316
|
class TaskRunner(BaseModel):
|
290
317
|
run_id: str
|
291
318
|
task_label: str
|
@@ -304,18 +331,8 @@ class TaskRunner(BaseModel):
|
|
304
331
|
return sum([1 for c in self.case_runners if c.status == status])
|
305
332
|
|
306
333
|
def display(self) -> None:
|
307
|
-
DATA_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s")
|
308
|
-
TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
|
309
|
-
"DB", "CaseType", "Dataset", "Filter", "task_label")
|
310
|
-
|
311
334
|
fmt = [TITLE_FORMAT]
|
312
|
-
fmt.append(DATA_FORMAT%(
|
313
|
-
"-"*11,
|
314
|
-
"-"*12,
|
315
|
-
"-"*20,
|
316
|
-
"-"*7,
|
317
|
-
"-"*7
|
318
|
-
))
|
335
|
+
fmt.append(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
|
319
336
|
|
320
337
|
for f in self.case_runners:
|
321
338
|
if f.ca.filter_rate != 0.0:
|
@@ -326,13 +343,16 @@ class TaskRunner(BaseModel):
|
|
326
343
|
filters = "None"
|
327
344
|
|
328
345
|
ds_str = f"{f.ca.dataset.data.name}-{f.ca.dataset.data.label}-{utils.numerize(f.ca.dataset.data.size)}"
|
329
|
-
fmt.append(
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
346
|
+
fmt.append(
|
347
|
+
DATA_FORMAT
|
348
|
+
% (
|
349
|
+
f.config.db_name,
|
350
|
+
f.ca.label.name,
|
351
|
+
ds_str,
|
352
|
+
filters,
|
353
|
+
self.task_label,
|
354
|
+
),
|
355
|
+
)
|
336
356
|
|
337
357
|
tmp_logger = logging.getLogger("no_color")
|
338
358
|
for f in fmt:
|
vectordb_bench/backend/utils.py
CHANGED
@@ -2,7 +2,7 @@ import time
|
|
2
2
|
from functools import wraps
|
3
3
|
|
4
4
|
|
5
|
-
def numerize(n) -> str:
|
5
|
+
def numerize(n: int) -> str:
|
6
6
|
"""display positive number n for readability
|
7
7
|
|
8
8
|
Examples:
|
@@ -16,32 +16,34 @@ def numerize(n) -> str:
|
|
16
16
|
"K": 1e6,
|
17
17
|
"M": 1e9,
|
18
18
|
"B": 1e12,
|
19
|
-
"END": float(
|
19
|
+
"END": float("inf"),
|
20
20
|
}
|
21
21
|
|
22
22
|
display_n, sufix = n, ""
|
23
23
|
for s, base in sufix2upbound.items():
|
24
24
|
# number >= 1000B will alway have sufix 'B'
|
25
25
|
if s == "END":
|
26
|
-
display_n = int(n/1e9)
|
26
|
+
display_n = int(n / 1e9)
|
27
27
|
sufix = "B"
|
28
28
|
break
|
29
29
|
|
30
30
|
if n < base:
|
31
31
|
sufix = "" if s == "EMPTY" else s
|
32
|
-
display_n = int(n/(base/1e3))
|
32
|
+
display_n = int(n / (base / 1e3))
|
33
33
|
break
|
34
34
|
return f"{display_n}{sufix}"
|
35
35
|
|
36
36
|
|
37
|
-
def time_it(func):
|
38
|
-
"""
|
37
|
+
def time_it(func: any):
|
38
|
+
"""returns result and elapsed time"""
|
39
|
+
|
39
40
|
@wraps(func)
|
40
41
|
def inner(*args, **kwargs):
|
41
42
|
pref = time.perf_counter()
|
42
43
|
result = func(*args, **kwargs)
|
43
44
|
delta = time.perf_counter() - pref
|
44
45
|
return result, delta
|
46
|
+
|
45
47
|
return inner
|
46
48
|
|
47
49
|
|
@@ -62,14 +64,19 @@ def compose_train_files(train_count: int, use_shuffled: bool) -> list[str]:
|
|
62
64
|
return train_files
|
63
65
|
|
64
66
|
|
65
|
-
|
67
|
+
ONE_PERCENT = 0.01
|
68
|
+
NINETY_NINE_PERCENT = 0.99
|
69
|
+
|
70
|
+
|
71
|
+
def compose_gt_file(filters: float | str | None = None) -> str:
|
66
72
|
if filters is None:
|
67
73
|
return "neighbors.parquet"
|
68
74
|
|
69
|
-
if filters ==
|
75
|
+
if filters == ONE_PERCENT:
|
70
76
|
return "neighbors_head_1p.parquet"
|
71
77
|
|
72
|
-
if filters ==
|
78
|
+
if filters == NINETY_NINE_PERCENT:
|
73
79
|
return "neighbors_tail_1p.parquet"
|
74
80
|
|
75
|
-
|
81
|
+
msg = f"Filters not supported: {filters}"
|
82
|
+
raise ValueError(msg)
|
vectordb_bench/base.py
CHANGED