vectordb-bench 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +56 -46
- vectordb_bench/backend/clients/__init__.py +101 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +26 -0
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +18 -0
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +345 -0
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +47 -0
- vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
- vectordb_bench/backend/clients/alloydb/cli.py +52 -35
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +8 -9
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +38 -36
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
- vectordb_bench/backend/clients/milvus/cli.py +62 -80
- vectordb_bench/backend/clients/milvus/config.py +31 -7
- vectordb_bench/backend/clients/milvus/milvus.py +23 -26
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +94 -58
- vectordb_bench/backend/clients/test/cli.py +1 -2
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +4 -5
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +30 -18
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +85 -34
- vectordb_bench/backend/runner/rate_runner.py +51 -23
- vectordb_bench/backend/runner/read_write_runner.py +140 -46
- vectordb_bench/backend/runner/serial_runner.py +99 -50
- vectordb_bench/backend/runner/util.py +4 -19
- vectordb_bench/backend/task_runner.py +95 -74
- vectordb_bench/backend/utils.py +17 -9
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +108 -83
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +56 -26
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +34 -42
- vectordb_bench-0.0.20.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.18.dist-info/RECORD +0 -131
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/task_runner.py
CHANGED

```diff
@@ -1,24 +1,20 @@
+import concurrent
 import logging
-import psutil
 import traceback
-import concurrent
-import numpy as np
 from enum import Enum, auto

-from . import utils
-from .cases import Case, CaseLabel
-from ..base import BaseModel
-from ..models import TaskConfig, PerformanceTimeoutError, TaskStage
+import numpy as np
+import psutil

-from .clients import (
-    api,
-    MetricType,
-)
-from ..metric import Metric
-from .runner import MultiProcessingSearchRunner
-from .runner import SerialSearchRunner, SerialInsertRunner
-from .data_source import DatasetSource
+from vectordb_bench.base import BaseModel
+from vectordb_bench.metric import Metric
+from vectordb_bench.models import PerformanceTimeoutError, TaskConfig, TaskStage

+from . import utils
+from .cases import Case, CaseLabel
+from .clients import MetricType, api
+from .data_source import DatasetSource
+from .runner import MultiProcessingSearchRunner, SerialInsertRunner, SerialSearchRunner

 log = logging.getLogger(__name__)

```
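The import reshuffle above is the signature of the release-wide ruff/isort pass: imports are sorted into stdlib / third-party / project groups, parent-package imports become absolute (`vectordb_bench.base` instead of `..base`), and the multi-line `from .clients import (...)` collapses onto one line. A minimal sketch of the resulting layout (module names taken from the diff; nothing else assumed):

```python
# Post-0.0.20 import layout: stdlib, then third-party, then absolute project
# imports, then relative sibling imports, each group separated by a blank line
# so isort/ruff leave the block untouched.
import logging                              # stdlib

import numpy as np                          # third-party
import psutil

from vectordb_bench.base import BaseModel   # absolute: lives outside backend/

from .cases import Case, CaseLabel          # relative: sibling of task_runner
```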
```diff
@@ -53,24 +49,39 @@ class CaseRunner(BaseModel):
     search_runner: MultiProcessingSearchRunner | None = None
     final_search_runner: MultiProcessingSearchRunner | None = None

-    def __eq__(self, obj):
+    def __eq__(self, obj: any):
         if isinstance(obj, CaseRunner):
-            return self.ca.label == CaseLabel.Performance and \
-                self.config.db == obj.config.db and \
-                self.config.db_case_config == obj.config.db_case_config and \
-                self.ca.dataset == obj.ca.dataset
+            return (
+                self.ca.label == CaseLabel.Performance
+                and self.config.db == obj.config.db
+                and self.config.db_case_config == obj.config.db_case_config
+                and self.ca.dataset == obj.ca.dataset
+            )
         return False

     def display(self) -> dict:
-        c_dict = self.ca.dict(include={'label': True, 'filters': True, 'dataset': {'data': {'name': True, 'size': True, 'dim': True, 'metric_type': True, 'label': True}}})
-        c_dict['db'] = self.config.db_name
+        c_dict = self.ca.dict(
+            include={
+                "label": True,
+                "filters": True,
+                "dataset": {
+                    "data": {
+                        "name": True,
+                        "size": True,
+                        "dim": True,
+                        "metric_type": True,
+                        "label": True,
+                    },
+                },
+            },
+        )
+        c_dict["db"] = self.config.db_name
         return c_dict

     @property
     def normalize(self) -> bool:
         assert self.db
-        return self.db.need_normalize_cosine() and \
-            self.ca.dataset.data.metric_type == MetricType.COSINE
+        return self.db.need_normalize_cosine() and self.ca.dataset.data.metric_type == MetricType.COSINE

     def init_db(self, drop_old: bool = True) -> None:
         db_cls = self.config.db.init_cls
```
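The reflowed `display()` call is pydantic's nested `include` filter: a dict whose keys are field names and whose values are either `True` (keep the field) or another dict (recurse into a sub-model). A runnable sketch of the same mechanism on toy models (the classes below are illustrative, not vectordb_bench types; `.dict()` is the pydantic v1 spelling, which the package uses — v2 renames it `model_dump()`):

```python
from pydantic import BaseModel

class Data(BaseModel):
    name: str
    size: int
    dim: int

class Dataset(BaseModel):
    data: Data
    path: str = "/tmp/ds"

class Case(BaseModel):
    label: str
    dataset: Dataset

case = Case(label="Performance", dataset=Dataset(data=Data(name="Cohere", size=1_000_000, dim=768)))
# Nested include keeps only the listed fields, exactly as CaseRunner.display() does:
print(case.dict(include={"label": True, "dataset": {"data": {"name": True, "size": True}}}))
# {'label': 'Performance', 'dataset': {'data': {'name': 'Cohere', 'size': 1000000}}}
```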
```diff
@@ -80,8 +91,7 @@ class CaseRunner(BaseModel):
             db_config=self.config.db_config.to_dict(),
             db_case_config=self.config.db_case_config,
             drop_old=drop_old,
-        )
-
+        )

     def _pre_run(self, drop_old: bool = True):
         try:
@@ -89,12 +99,9 @@ class CaseRunner(BaseModel):
             self.ca.dataset.prepare(self.dataset_source, filters=self.ca.filter_rate)
         except ModuleNotFoundError as e:
             log.warning(
-                f"pre run case error: please install client for db: {self.config.db}, error={e}"
+                f"pre run case error: please install client for db: {self.config.db}, error={e}",
             )
             raise e from None
-        except Exception as e:
-            log.warning(f"pre run case error: {e}")
-            raise e from None

     def run(self, drop_old: bool = True) -> Metric:
         log.info("Starting run")
@@ -103,12 +110,11 @@ class CaseRunner(BaseModel):

         if self.ca.label == CaseLabel.Load:
             return self._run_capacity_case()
-        elif self.ca.label == CaseLabel.Performance:
+        if self.ca.label == CaseLabel.Performance:
             return self._run_perf_case(drop_old)
-        else:
-            msg = f"unknown case type: {self.ca.label}"
-            log.warning(msg)
-            raise ValueError(msg)
+        msg = f"unknown case type: {self.ca.label}"
+        log.warning(msg)
+        raise ValueError(msg)

     def _run_capacity_case(self) -> Metric:
         """run capacity cases
```
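Two lint-driven idioms surface in `run()` and recur throughout this release: the `elif/else` ladder is flattened into early returns, and exception messages are bound to a variable before raising (ruff's flake8-errmsg rules EM101/EM102 flag string literals and f-strings inside `raise`). The pattern in isolation:

```python
# Binding the message first keeps long literals out of the raise statement and
# out of the duplicated traceback line that EM101/EM102 warn about.
def run(label: str) -> str:
    if label == "Load":
        return "capacity"
    if label == "Performance":
        return "perf"
    msg = f"unknown case type: {label}"
    raise ValueError(msg)

print(run("Load"))  # capacity
```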
```diff
@@ -120,7 +126,10 @@ class CaseRunner(BaseModel):
         log.info("Start capacity case")
         try:
             runner = SerialInsertRunner(
-                self.db, self.ca.dataset, self.normalize, self.ca.load_timeout
+                self.db,
+                self.ca.dataset,
+                self.normalize,
+                self.ca.load_timeout,
             )
             count = runner.run_endlessness()
         except Exception as e:
@@ -128,7 +137,7 @@ class CaseRunner(BaseModel):
             raise e from None
         else:
             log.info(
-                f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}"
+                f"Capacity case loading dataset reaches VectorDB's limit: max capacity = {count}",
             )
             return Metric(max_load_count=count)

@@ -138,7 +147,7 @@ class CaseRunner(BaseModel):
         Returns:
             Metric: load_duration, recall, serial_latency_p99, and, qps
         """
-        '''
+        """
         if drop_old:
             _, load_dur = self._load_train_data()
             build_dur = self._optimize()
@@ -153,38 +162,40 @@ class CaseRunner(BaseModel):

         m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
         m.recall, m.serial_latency_p99 = self._serial_search()
-        '''
+        """

         log.info("Start performance case")
         try:
             m = Metric()
             if drop_old:
                 if TaskStage.LOAD in self.config.stages:
-                    # self._load_train_data()
                     _, load_dur = self._load_train_data()
                     build_dur = self._optimize()
                     m.load_duration = round(load_dur + build_dur, 4)
                     log.info(
                         f"Finish loading the entire dataset into VectorDB,"
                         f" insert_duration={load_dur}, optimize_duration={build_dur}"
-                        f" load_duration(insert + optimize) = {m.load_duration}"
+                        f" load_duration(insert + optimize) = {m.load_duration}",
                     )
                 else:
                     log.info("Data loading skipped")
-            if (
-                TaskStage.SEARCH_SERIAL in self.config.stages
-                or TaskStage.SEARCH_CONCURRENT in self.config.stages
-            ):
+            if TaskStage.SEARCH_SERIAL in self.config.stages or TaskStage.SEARCH_CONCURRENT in self.config.stages:
                 self._init_search_runner()
                 if TaskStage.SEARCH_CONCURRENT in self.config.stages:
                     search_results = self._conc_search()
-                    m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = search_results
+                    (
+                        m.qps,
+                        m.conc_num_list,
+                        m.conc_qps_list,
+                        m.conc_latency_p99_list,
+                        m.conc_latency_avg_list,
+                    ) = search_results
                 if TaskStage.SEARCH_SERIAL in self.config.stages:
                     search_results = self._serial_search()
-                    '''
+                    """
                     m.recall = search_results.recall
                     m.serial_latencies = search_results.serial_latencies
-                    '''
+                    """
                     m.recall, m.ndcg, m.serial_latency_p99 = search_results

         except Exception as e:
```
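Note the extra target in the concurrent-search unpacking: the 0.0.20 runner also reports average concurrent latency (`m.conc_latency_avg_list`, matching the `mp_runner.py` changes in the file list). Wrapping the assignment targets in parentheses is what lets a five-way unpack span multiple lines. A standalone sketch with made-up numbers:

```python
class Metric:  # stand-in for vectordb_bench.metric.Metric
    pass

m = Metric()
# (qps, concurrency levels, per-level qps, per-level p99 latency, per-level avg latency)
search_results = (1200.5, [1, 5, 10], [300.0, 900.0, 1200.5], [0.012, 0.018, 0.025], [0.006, 0.009, 0.013])
(
    m.qps,
    m.conc_num_list,
    m.conc_qps_list,
    m.conc_latency_p99_list,
    m.conc_latency_avg_list,
) = search_results
print(m.conc_latency_avg_list)  # [0.006, 0.009, 0.013]
```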
```diff
@@ -199,14 +210,19 @@ class CaseRunner(BaseModel):
     def _load_train_data(self):
         """Insert train data and get the insert_duration"""
         try:
-            runner = SerialInsertRunner(self.db, self.ca.dataset, self.normalize, self.ca.load_timeout)
+            runner = SerialInsertRunner(
+                self.db,
+                self.ca.dataset,
+                self.normalize,
+                self.ca.load_timeout,
+            )
             runner.run()
         except Exception as e:
             raise e from None
         finally:
             runner = None

-    def _serial_search(self) -> tuple[float, float]:
+    def _serial_search(self) -> tuple[float, float, float]:
         """Performance serial tests, search the entire test data once,
         calculate the recall, serial_latency_p99

@@ -214,11 +230,13 @@ class CaseRunner(BaseModel):
             tuple[float, float]: recall, serial_latency_p99
         """
         try:
-            return self.serial_search_runner.run()
+            results, _ = self.serial_search_runner.run()
         except Exception as e:
-            log.warning(f"search error: {str(e)}, {e}")
+            log.warning(f"search error: {e!s}, {e}")
             self.stop()
-            raise e from None
+            raise e from e
+        else:
+            return results

     def _conc_search(self):
         """Performance concurrency tests, search the test data endlessness
```
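`_serial_search` now returns from an `else` branch, which only executes when the `try` body completed without raising, and re-raises with `from e` so the original traceback is kept as `__cause__` (elsewhere this file uses `from None` to suppress the chain). The shape in miniature:

```python
def guarded_search() -> list[float]:
    try:
        results = [0.98, 0.95]  # stand-in for serial_search_runner.run()
    except Exception as e:
        raise RuntimeError("search error") from e  # original kept as __cause__
    else:
        return results  # runs only if the try body raised nothing

print(guarded_search())  # [0.98, 0.95]
```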
```diff
@@ -230,7 +248,7 @@ class CaseRunner(BaseModel):
         try:
             return self.search_runner.run()
         except Exception as e:
-            log.warning(f"search error: {str(e)}, {e}")
+            log.warning(f"search error: {e!s}, {e}")
             raise e from None
         finally:
             self.stop()
@@ -238,7 +256,7 @@ class CaseRunner(BaseModel):
     @utils.time_it
     def _task(self) -> None:
         with self.db.init():
-            self.db.optimize()
+            self.db.optimize_with_size(data_size=self.ca.dataset.data.size)

     def _optimize(self) -> float:
         with concurrent.futures.ProcessPoolExecutor(max_workers=1) as executor:
```
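`_task` tracks an API change in the client layer (see the `clients/api.py` and per-client entries in the file list): plain `optimize()` becomes `optimize_with_size(data_size=...)` so a client can scale its post-insert index building to the dataset. Only the method name and the `data_size` keyword appear in this diff; the client below is a hypothetical illustration, not the package's implementation:

```python
# Hypothetical client: uses the size hint to bound how long it waits for index
# building. The real contract lives in vectordb_bench.backend.clients.api.
class ToyClient:
    def optimize_with_size(self, data_size: int) -> None:
        wait_s = min(3600, data_size // 10_000)
        print(f"optimize: waiting up to {wait_s}s for {data_size} vectors")

ToyClient().optimize_with_size(data_size=1_000_000)  # waits up to 100s
```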
```diff
@@ -249,7 +267,7 @@ class CaseRunner(BaseModel):
                 log.warning(f"VectorDB optimize timeout in {self.ca.optimize_timeout}")
                 for pid, _ in executor._processes.items():
                     psutil.Process(pid).kill()
-                raise PerformanceTimeoutError
+                raise PerformanceTimeoutError from e
             except Exception as e:
                 log.warning(f"VectorDB optimize error: {e}")
                 raise e from None
@@ -285,6 +303,16 @@ class CaseRunner(BaseModel):
         self.search_runner.stop()


+DATA_FORMAT = " %-14s | %-12s %-20s %7s | %-10s"
+TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
+    "DB",
+    "CaseType",
+    "Dataset",
+    "Filter",
+    "task_label",
+)
+
+
 class TaskRunner(BaseModel):
     run_id: str
     task_label: str
```
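`DATA_FORMAT` and `TITLE_FORMAT` move from locals inside `display()` to module level, so they are built once and shared. They are plain `%`-format strings producing a fixed-width table; running them directly (row values invented for the demo):

```python
DATA_FORMAT = " %-14s | %-12s %-20s %7s | %-10s"
TITLE_FORMAT = DATA_FORMAT % ("DB", "CaseType", "Dataset", "Filter", "task_label")

# Prints the aligned three-line header/divider/row that TaskRunner.display() logs.
print(TITLE_FORMAT)
print(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))
print(DATA_FORMAT % ("Milvus", "Performance", "Cohere-MEDIUM-1M", "None", "demo"))
```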
```diff
@@ -303,18 +331,8 @@ class TaskRunner(BaseModel):
         return sum([1 for c in self.case_runners if c.status == status])

     def display(self) -> None:
-        DATA_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s")
-        TITLE_FORMAT = (" %-14s | %-12s %-20s %7s | %-10s") % (
-            "DB", "CaseType", "Dataset", "Filter", "task_label")
-
         fmt = [TITLE_FORMAT]
-        fmt.append(DATA_FORMAT%(
-            "-"*11,
-            "-"*12,
-            "-"*20,
-            "-"*7,
-            "-"*7
-        ))
+        fmt.append(DATA_FORMAT % ("-" * 11, "-" * 12, "-" * 20, "-" * 7, "-" * 7))

         for f in self.case_runners:
             if f.ca.filter_rate != 0.0:
@@ -325,13 +343,16 @@ class TaskRunner(BaseModel):
                 filters = "None"

             ds_str = f"{f.ca.dataset.data.name}-{f.ca.dataset.data.label}-{utils.numerize(f.ca.dataset.data.size)}"
-            fmt.append(DATA_FORMAT%(
-                f.config.db_name,
-                f.ca.label.name,
-                ds_str,
-                filters,
-                self.task_label,
-            ))
+            fmt.append(
+                DATA_FORMAT
+                % (
+                    f.config.db_name,
+                    f.ca.label.name,
+                    ds_str,
+                    filters,
+                    self.task_label,
+                ),
+            )

         tmp_logger = logging.getLogger("no_color")
         for f in fmt:
```
vectordb_bench/backend/utils.py
CHANGED

```diff
@@ -2,7 +2,7 @@ import time
 from functools import wraps


-def numerize(n) -> str:
+def numerize(n: int) -> str:
     """display positive number n for readability

     Examples:
@@ -16,31 +16,34 @@ def numerize(n) -> str:
         "K": 1e6,
         "M": 1e9,
         "B": 1e12,
-        "END": float('inf'),
+        "END": float("inf"),
     }

     display_n, sufix = n, ""
     for s, base in sufix2upbound.items():
         # number >= 1000B will alway have sufix 'B'
         if s == "END":
-            display_n = int(n/1e9)
+            display_n = int(n / 1e9)
             sufix = "B"
             break

         if n < base:
             sufix = "" if s == "EMPTY" else s
-            display_n = int(n/(base/1e3))
+            display_n = int(n / (base / 1e3))
             break
     return f"{display_n}{sufix}"


-def time_it(func):
+def time_it(func: any):
+    """returns result and elapsed time"""
+
     @wraps(func)
     def inner(*args, **kwargs):
         pref = time.perf_counter()
         result = func(*args, **kwargs)
         delta = time.perf_counter() - pref
         return result, delta
+
     return inner


```
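Usage sketch for the two helpers above. The expected strings follow from the visible bucket arithmetic (`int(n / (base / 1e3))`), assuming the unchanged start of `sufix2upbound` caps the suffix-less bucket at 1e3, consistent with the 1e6/1e9/1e12 progression shown; `time_it` wraps any callable so every call returns `(result, elapsed_seconds)`:

```python
from vectordb_bench.backend.utils import numerize, time_it

print(numerize(1_234))      # "1K"  -> K bucket: int(1234 / 1e3)
print(numerize(8_000_000))  # "8M"  -> M bucket: int(8e6 / 1e6)

@time_it
def insert_batch() -> int:
    return 100

result, seconds = insert_batch()  # decorated call yields (result, elapsed)
print(result, seconds >= 0)       # 100 True
```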
```diff
@@ -61,14 +64,19 @@ def compose_train_files(train_count: int, use_shuffled: bool) -> list[str]:
     return train_files


-def compose_gt_file(filters: int | float | str | None = None) -> str:
+ONE_PERCENT = 0.01
+NINETY_NINE_PERCENT = 0.99
+
+
+def compose_gt_file(filters: float | str | None = None) -> str:
     if filters is None:
         return "neighbors.parquet"

-    if filters == 0.01:
+    if filters == ONE_PERCENT:
         return "neighbors_head_1p.parquet"

-    if filters == 0.99:
+    if filters == NINETY_NINE_PERCENT:
         return "neighbors_tail_1p.parquet"

-    raise ValueError(f"Filters not supported: {filters}")
+    msg = f"Filters not supported: {filters}"
+    raise ValueError(msg)
```
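The new constants make the filter-rate → ground-truth-file mapping explicit, and the unsupported case raises through a pre-bound message. For reference:

```python
from vectordb_bench.backend.utils import compose_gt_file

print(compose_gt_file())      # neighbors.parquet
print(compose_gt_file(0.01))  # neighbors_head_1p.parquet
print(compose_gt_file(0.99))  # neighbors_tail_1p.parquet
# compose_gt_file(0.5) raises ValueError: Filters not supported: 0.5
```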
vectordb_bench/base.py
CHANGED