vectordb-bench 0.0.30__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -27
- vectordb_bench/__main__.py +1 -1
- vectordb_bench/backend/assembler.py +19 -6
- vectordb_bench/backend/cases.py +186 -23
- vectordb_bench/backend/clients/__init__.py +16 -0
- vectordb_bench/backend/clients/api.py +22 -1
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +82 -41
- vectordb_bench/backend/clients/aws_opensearch/config.py +37 -4
- vectordb_bench/backend/clients/chroma/chroma.py +6 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +31 -1
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +133 -45
- vectordb_bench/backend/clients/milvus/config.py +1 -0
- vectordb_bench/backend/clients/milvus/milvus.py +75 -23
- vectordb_bench/backend/clients/oceanbase/cli.py +100 -0
- vectordb_bench/backend/clients/oceanbase/config.py +125 -0
- vectordb_bench/backend/clients/oceanbase/oceanbase.py +215 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +39 -25
- vectordb_bench/backend/clients/qdrant_cloud/config.py +73 -3
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +100 -33
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/dataset.py +146 -27
- vectordb_bench/backend/filter.py +76 -0
- vectordb_bench/backend/runner/__init__.py +3 -3
- vectordb_bench/backend/runner/mp_runner.py +52 -39
- vectordb_bench/backend/runner/rate_runner.py +68 -52
- vectordb_bench/backend/runner/read_write_runner.py +125 -68
- vectordb_bench/backend/runner/serial_runner.py +56 -23
- vectordb_bench/backend/task_runner.py +59 -20
- vectordb_bench/cli/cli.py +59 -1
- vectordb_bench/cli/vectordbbench.py +3 -0
- vectordb_bench/frontend/components/check_results/data.py +16 -11
- vectordb_bench/frontend/components/check_results/filters.py +53 -25
- vectordb_bench/frontend/components/check_results/headerIcon.py +18 -13
- vectordb_bench/frontend/components/check_results/nav.py +20 -0
- vectordb_bench/frontend/components/custom/displayCustomCase.py +43 -8
- vectordb_bench/frontend/components/custom/displaypPrams.py +10 -5
- vectordb_bench/frontend/components/custom/getCustomConfig.py +10 -0
- vectordb_bench/frontend/components/label_filter/charts.py +60 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +48 -52
- vectordb_bench/frontend/components/run_test/dbSelector.py +9 -5
- vectordb_bench/frontend/components/run_test/inputWidget.py +48 -0
- vectordb_bench/frontend/components/run_test/submitTask.py +3 -1
- vectordb_bench/frontend/components/streaming/charts.py +253 -0
- vectordb_bench/frontend/components/streaming/data.py +62 -0
- vectordb_bench/frontend/components/tables/data.py +1 -1
- vectordb_bench/frontend/components/welcome/explainPrams.py +66 -0
- vectordb_bench/frontend/components/welcome/pagestyle.py +106 -0
- vectordb_bench/frontend/components/welcome/welcomePrams.py +147 -0
- vectordb_bench/frontend/config/dbCaseConfigs.py +309 -42
- vectordb_bench/frontend/config/styles.py +34 -4
- vectordb_bench/frontend/pages/concurrent.py +5 -1
- vectordb_bench/frontend/pages/custom.py +4 -0
- vectordb_bench/frontend/pages/label_filter.py +56 -0
- vectordb_bench/frontend/pages/quries_per_dollar.py +5 -1
- vectordb_bench/frontend/{vdb_benchmark.py → pages/results.py} +10 -4
- vectordb_bench/frontend/pages/run_test.py +3 -3
- vectordb_bench/frontend/pages/streaming.py +135 -0
- vectordb_bench/frontend/pages/tables.py +4 -0
- vectordb_bench/frontend/vdbbench.py +31 -0
- vectordb_bench/interface.py +8 -3
- vectordb_bench/metric.py +15 -1
- vectordb_bench/models.py +31 -11
- vectordb_bench/results/ElasticCloud/result_20250318_standard_elasticcloud.json +5890 -0
- vectordb_bench/results/Milvus/result_20250509_standard_milvus.json +6138 -0
- vectordb_bench/results/OpenSearch/result_20250224_standard_opensearch.json +7319 -0
- vectordb_bench/results/Pinecone/result_20250124_standard_pinecone.json +2365 -0
- vectordb_bench/results/QdrantCloud/result_20250602_standard_qdrantcloud.json +3556 -0
- vectordb_bench/results/ZillizCloud/result_20250613_standard_zillizcloud.json +6290 -0
- vectordb_bench/results/dbPrices.json +12 -4
- vectordb_bench/results/getLeaderboardDataV2.py +59 -0
- vectordb_bench/results/leaderboard_v2.json +2662 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/METADATA +93 -40
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/RECORD +77 -58
- vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -791
- vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -679
- vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -1352
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/WHEEL +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {vectordb_bench-0.0.30.dist-info → vectordb_bench-1.0.1.dist-info}/top_level.txt +0 -0
vectordb_bench/__init__.py
CHANGED
@@ -18,37 +18,16 @@ class config:
|
|
18
18
|
DEFAULT_DATASET_URL = env.str("DEFAULT_DATASET_URL", AWS_S3_URL)
|
19
19
|
DATASET_LOCAL_DIR = env.path("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset")
|
20
20
|
NUM_PER_BATCH = env.int("NUM_PER_BATCH", 100)
|
21
|
+
TIME_PER_BATCH = 1 # 1s. for streaming insertion.
|
22
|
+
MAX_INSERT_RETRY = 5
|
23
|
+
MAX_SEARCH_RETRY = 5
|
24
|
+
|
25
|
+
LOAD_MAX_TRY_COUNT = 10
|
21
26
|
|
22
27
|
DROP_OLD = env.bool("DROP_OLD", True)
|
23
28
|
USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
|
24
29
|
|
25
|
-
NUM_CONCURRENCY = env.list(
|
26
|
-
"NUM_CONCURRENCY",
|
27
|
-
[
|
28
|
-
1,
|
29
|
-
5,
|
30
|
-
10,
|
31
|
-
15,
|
32
|
-
20,
|
33
|
-
25,
|
34
|
-
30,
|
35
|
-
35,
|
36
|
-
40,
|
37
|
-
45,
|
38
|
-
50,
|
39
|
-
55,
|
40
|
-
60,
|
41
|
-
65,
|
42
|
-
70,
|
43
|
-
75,
|
44
|
-
80,
|
45
|
-
85,
|
46
|
-
90,
|
47
|
-
95,
|
48
|
-
100,
|
49
|
-
],
|
50
|
-
subcast=int,
|
51
|
-
)
|
30
|
+
NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 20, 30, 40, 60, 80], subcast=int)
|
52
31
|
|
53
32
|
CONCURRENCY_DURATION = 30
|
54
33
|
|
@@ -68,6 +47,7 @@ class config:
|
|
68
47
|
|
69
48
|
CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
|
70
49
|
LOAD_TIMEOUT_DEFAULT = 24 * 3600 # 24h
|
50
|
+
LOAD_TIMEOUT_768D_100K = 24 * 3600 # 24h
|
71
51
|
LOAD_TIMEOUT_768D_1M = 24 * 3600 # 24h
|
72
52
|
LOAD_TIMEOUT_768D_10M = 240 * 3600 # 10d
|
73
53
|
LOAD_TIMEOUT_768D_100M = 2400 * 3600 # 100d
|
@@ -75,7 +55,11 @@ class config:
|
|
75
55
|
LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
|
76
56
|
LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d
|
77
57
|
|
58
|
+
LOAD_TIMEOUT_1024D_1M = 24 * 3600 # 24h
|
59
|
+
LOAD_TIMEOUT_1024D_10M = 240 * 3600 # 10d
|
60
|
+
|
78
61
|
OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
|
62
|
+
OPTIMIZE_TIMEOUT_768D_100K = 24 * 3600 # 24h
|
79
63
|
OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
|
80
64
|
OPTIMIZE_TIMEOUT_768D_10M = 240 * 3600 # 10d
|
81
65
|
OPTIMIZE_TIMEOUT_768D_100M = 2400 * 3600 # 100d
|
@@ -83,6 +67,9 @@ class config:
|
|
83
67
|
OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
|
84
68
|
OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d
|
85
69
|
|
70
|
+
OPTIMIZE_TIMEOUT_1024D_1M = 24 * 3600 # 24h
|
71
|
+
OPTIMIZE_TIMEOUT_1024D_10M = 240 * 3600 # 10d
|
72
|
+
|
86
73
|
def display(self) -> str:
|
87
74
|
return [
|
88
75
|
i
|
vectordb_bench/__main__.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
from vectordb_bench.backend.clients import EmptyDBCaseConfig
|
3
|
+
from vectordb_bench.backend.clients import DB, EmptyDBCaseConfig
|
4
4
|
from vectordb_bench.backend.data_source import DatasetSource
|
5
|
+
from vectordb_bench.backend.filter import FilterOp
|
5
6
|
from vectordb_bench.models import TaskConfig
|
6
7
|
|
7
8
|
from .cases import CaseLabel
|
@@ -10,6 +11,13 @@ from .task_runner import CaseRunner, RunningStatus, TaskRunner
|
|
10
11
|
log = logging.getLogger(__name__)
|
11
12
|
|
12
13
|
|
14
|
+
class FilterNotSupportedError(ValueError):
|
15
|
+
"""Raised when a filter type is not supported by a vector database."""
|
16
|
+
|
17
|
+
def __init__(self, db_name: str, filter_type: FilterOp):
|
18
|
+
super().__init__(f"{filter_type} Filter test is not supported by {db_name}.")
|
19
|
+
|
20
|
+
|
13
21
|
class Assembler:
|
14
22
|
@classmethod
|
15
23
|
def assemble(cls, run_id: str, task: TaskConfig, source: DatasetSource) -> CaseRunner:
|
@@ -39,25 +47,30 @@ class Assembler:
|
|
39
47
|
runners = [cls.assemble(run_id, task, source) for task in tasks]
|
40
48
|
load_runners = [r for r in runners if r.ca.label == CaseLabel.Load]
|
41
49
|
perf_runners = [r for r in runners if r.ca.label == CaseLabel.Performance]
|
50
|
+
streaming_runners = [r for r in runners if r.ca.label == CaseLabel.Streaming]
|
42
51
|
|
43
52
|
# group by db
|
44
|
-
db2runner = {}
|
53
|
+
db2runner: dict[DB, list[CaseRunner]] = {}
|
45
54
|
for r in perf_runners:
|
46
55
|
db = r.config.db
|
47
56
|
if db not in db2runner:
|
48
57
|
db2runner[db] = []
|
49
58
|
db2runner[db].append(r)
|
50
59
|
|
51
|
-
# check
|
52
|
-
for
|
53
|
-
|
60
|
+
# check
|
61
|
+
for db, runners in db2runner.items():
|
62
|
+
db_instance = db.init_cls
|
63
|
+
for runner in runners:
|
64
|
+
if not db_instance.filter_supported(runner.ca.filters):
|
65
|
+
raise FilterNotSupportedError(db.value, runner.ca.filters.type)
|
54
66
|
|
55
67
|
# sort by dataset size
|
56
68
|
for _, runner in db2runner.items():
|
57
|
-
runner.sort(key=lambda x: x.ca.dataset.data.size)
|
69
|
+
runner.sort(key=lambda x: (x.ca.dataset.data.size, 0 if x.ca.filters.type == FilterOp.StrEqual else 1))
|
58
70
|
|
59
71
|
all_runners = []
|
60
72
|
all_runners.extend(load_runners)
|
73
|
+
all_runners.extend(streaming_runners)
|
61
74
|
for v in db2runner.values():
|
62
75
|
all_runners.extend(v)
|
63
76
|
|
vectordb_bench/backend/cases.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
+
import json
|
1
2
|
import logging
|
2
3
|
from enum import Enum, auto
|
3
4
|
|
4
5
|
from vectordb_bench import config
|
5
6
|
from vectordb_bench.backend.clients.api import MetricType
|
7
|
+
from vectordb_bench.backend.filter import Filter, FilterOp, IntFilter, LabelFilter, NonFilter, non_filter
|
6
8
|
from vectordb_bench.base import BaseModel
|
7
|
-
from vectordb_bench.frontend.components.custom.getCustomConfig import
|
8
|
-
CustomDatasetConfig,
|
9
|
-
)
|
9
|
+
from vectordb_bench.frontend.components.custom.getCustomConfig import CustomDatasetConfig
|
10
10
|
|
11
|
-
from .dataset import CustomDataset, Dataset, DatasetManager
|
11
|
+
from .dataset import CustomDataset, Dataset, DatasetManager, DatasetWithSizeType
|
12
12
|
|
13
13
|
log = logging.getLogger(__name__)
|
14
14
|
|
@@ -42,11 +42,18 @@ class CaseType(Enum):
|
|
42
42
|
Performance1536D500K99P = 14
|
43
43
|
Performance1536D5M99P = 15
|
44
44
|
|
45
|
+
Performance1024D1M = 17
|
46
|
+
Performance1024D10M = 20
|
47
|
+
|
45
48
|
Performance1536D50K = 50
|
46
49
|
|
47
50
|
Custom = 100
|
48
51
|
PerformanceCustomDataset = 101
|
49
52
|
|
53
|
+
StreamingPerformanceCase = 200
|
54
|
+
|
55
|
+
LabelFilterPerformanceCase = 300
|
56
|
+
|
50
57
|
def case_cls(self, custom_configs: dict | None = None) -> type["Case"]:
|
51
58
|
if custom_configs is None:
|
52
59
|
return type2case.get(self)()
|
@@ -68,6 +75,7 @@ class CaseType(Enum):
|
|
68
75
|
class CaseLabel(Enum):
|
69
76
|
Load = auto()
|
70
77
|
Performance = auto()
|
78
|
+
Streaming = auto()
|
71
79
|
|
72
80
|
|
73
81
|
class Case(BaseModel):
|
@@ -87,31 +95,37 @@ class Case(BaseModel):
|
|
87
95
|
description: str
|
88
96
|
dataset: DatasetManager
|
89
97
|
|
90
|
-
load_timeout: float | int
|
98
|
+
load_timeout: float | int | None = None
|
91
99
|
optimize_timeout: float | int | None = None
|
92
100
|
|
93
101
|
filter_rate: float | None = None
|
94
102
|
|
95
103
|
@property
|
96
|
-
def filters(self) ->
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
104
|
+
def filters(self) -> Filter:
|
105
|
+
return non_filter
|
106
|
+
|
107
|
+
@property
|
108
|
+
def with_scalar_labels(self) -> bool:
|
109
|
+
return self.filters.type == FilterOp.StrEqual
|
110
|
+
|
111
|
+
def check_scalar_labels(self) -> None:
|
112
|
+
if self.with_scalar_labels and not self.dataset.data.with_scalar_labels:
|
113
|
+
msg = f"Case init failed: no scalar_labels data in current dataset ({self.dataset.data.full_name})"
|
114
|
+
raise ValueError(msg)
|
103
115
|
|
104
|
-
|
116
|
+
def __init__(self, **kwargs):
|
117
|
+
super().__init__(**kwargs)
|
118
|
+
self.check_scalar_labels()
|
105
119
|
|
106
120
|
|
107
|
-
class CapacityCase(Case
|
121
|
+
class CapacityCase(Case):
|
108
122
|
label: CaseLabel = CaseLabel.Load
|
109
123
|
filter_rate: float | None = None
|
110
124
|
load_timeout: float | int = config.CAPACITY_TIMEOUT_IN_SECONDS
|
111
125
|
optimize_timeout: float | int | None = None
|
112
126
|
|
113
127
|
|
114
|
-
class PerformanceCase(Case
|
128
|
+
class PerformanceCase(Case):
|
115
129
|
label: CaseLabel = CaseLabel.Performance
|
116
130
|
filter_rate: float | None = None
|
117
131
|
load_timeout: float | int = config.LOAD_TIMEOUT_DEFAULT
|
@@ -147,6 +161,14 @@ class Performance768D10M(PerformanceCase):
|
|
147
161
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M
|
148
162
|
|
149
163
|
|
164
|
+
class IntFilterPerformanceCase(PerformanceCase):
|
165
|
+
@property
|
166
|
+
def filters(self) -> Filter:
|
167
|
+
int_field = self.dataset.data.train_id_field
|
168
|
+
int_value = int(self.dataset.data.size * self.filter_rate)
|
169
|
+
return IntFilter(filter_rate=self.filter_rate, int_field=int_field, int_value=int_value)
|
170
|
+
|
171
|
+
|
150
172
|
class Performance768D1M(PerformanceCase):
|
151
173
|
case_id: CaseType = CaseType.Performance768D1M
|
152
174
|
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
|
@@ -158,7 +180,7 @@ class Performance768D1M(PerformanceCase):
|
|
158
180
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M
|
159
181
|
|
160
182
|
|
161
|
-
class Performance768D10M1P(
|
183
|
+
class Performance768D10M1P(IntFilterPerformanceCase):
|
162
184
|
case_id: CaseType = CaseType.Performance768D10M1P
|
163
185
|
filter_rate: float | int | None = 0.01
|
164
186
|
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
|
@@ -170,7 +192,7 @@ class Performance768D10M1P(PerformanceCase):
|
|
170
192
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M
|
171
193
|
|
172
194
|
|
173
|
-
class Performance768D1M1P(
|
195
|
+
class Performance768D1M1P(IntFilterPerformanceCase):
|
174
196
|
case_id: CaseType = CaseType.Performance768D1M1P
|
175
197
|
filter_rate: float | int | None = 0.01
|
176
198
|
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
|
@@ -182,7 +204,7 @@ class Performance768D1M1P(PerformanceCase):
|
|
182
204
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_1M
|
183
205
|
|
184
206
|
|
185
|
-
class Performance768D10M99P(
|
207
|
+
class Performance768D10M99P(IntFilterPerformanceCase):
|
186
208
|
case_id: CaseType = CaseType.Performance768D10M99P
|
187
209
|
filter_rate: float | int | None = 0.99
|
188
210
|
dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
|
@@ -194,7 +216,7 @@ class Performance768D10M99P(PerformanceCase):
|
|
194
216
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_768D_10M
|
195
217
|
|
196
218
|
|
197
|
-
class Performance768D1M99P(
|
219
|
+
class Performance768D1M99P(IntFilterPerformanceCase):
|
198
220
|
case_id: CaseType = CaseType.Performance768D1M99P
|
199
221
|
filter_rate: float | int | None = 0.99
|
200
222
|
dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
|
@@ -242,7 +264,7 @@ class Performance1536D5M(PerformanceCase):
|
|
242
264
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M
|
243
265
|
|
244
266
|
|
245
|
-
class Performance1536D500K1P(
|
267
|
+
class Performance1536D500K1P(IntFilterPerformanceCase):
|
246
268
|
case_id: CaseType = CaseType.Performance1536D500K1P
|
247
269
|
filter_rate: float | int | None = 0.01
|
248
270
|
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
|
@@ -254,7 +276,7 @@ class Performance1536D500K1P(PerformanceCase):
|
|
254
276
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K
|
255
277
|
|
256
278
|
|
257
|
-
class Performance1536D5M1P(
|
279
|
+
class Performance1536D5M1P(IntFilterPerformanceCase):
|
258
280
|
case_id: CaseType = CaseType.Performance1536D5M1P
|
259
281
|
filter_rate: float | int | None = 0.01
|
260
282
|
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
|
@@ -266,7 +288,7 @@ class Performance1536D5M1P(PerformanceCase):
|
|
266
288
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M
|
267
289
|
|
268
290
|
|
269
|
-
class Performance1536D500K99P(
|
291
|
+
class Performance1536D500K99P(IntFilterPerformanceCase):
|
270
292
|
case_id: CaseType = CaseType.Performance1536D500K99P
|
271
293
|
filter_rate: float | int | None = 0.99
|
272
294
|
dataset: DatasetManager = Dataset.OPENAI.manager(500_000)
|
@@ -278,7 +300,7 @@ class Performance1536D500K99P(PerformanceCase):
|
|
278
300
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_500K
|
279
301
|
|
280
302
|
|
281
|
-
class Performance1536D5M99P(
|
303
|
+
class Performance1536D5M99P(IntFilterPerformanceCase):
|
282
304
|
case_id: CaseType = CaseType.Performance1536D5M99P
|
283
305
|
filter_rate: float | int | None = 0.99
|
284
306
|
dataset: DatasetManager = Dataset.OPENAI.manager(5_000_000)
|
@@ -290,6 +312,30 @@ class Performance1536D5M99P(PerformanceCase):
|
|
290
312
|
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1536D_5M
|
291
313
|
|
292
314
|
|
315
|
+
class Performance1024D1M(PerformanceCase):
|
316
|
+
case_id: CaseType = CaseType.Performance1024D1M
|
317
|
+
filter_rate: float | int | None = None
|
318
|
+
dataset: DatasetManager = Dataset.BIOASQ.manager(1_000_000)
|
319
|
+
name: str = "Search Performance Test (1M Dataset, 1024 Dim)"
|
320
|
+
description: str = """This case tests the search performance of a vector database with a medium 1M dataset
|
321
|
+
(<b>Bioasq 1M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
|
322
|
+
recall, and maximum QPS."""
|
323
|
+
load_timeout: float | int = config.LOAD_TIMEOUT_1024D_1M
|
324
|
+
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_1M
|
325
|
+
|
326
|
+
|
327
|
+
class Performance1024D10M(PerformanceCase):
|
328
|
+
case_id: CaseType = CaseType.Performance1024D10M
|
329
|
+
filter_rate: float | int | None = None
|
330
|
+
dataset: DatasetManager = Dataset.BIOASQ.manager(10_000_000)
|
331
|
+
name: str = "Search Performance Test (10M Dataset, 1024 Dim)"
|
332
|
+
description: str = """This case tests the search performance of a vector database with a large 10M dataset
|
333
|
+
(<b>Bioasq 10M vectors</b>, 1024 dimensions), at varying parallel levels. Results will show index building time,
|
334
|
+
recall, and maximum QPS."""
|
335
|
+
load_timeout: float | int = config.LOAD_TIMEOUT_1024D_10M
|
336
|
+
optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1024D_10M
|
337
|
+
|
338
|
+
|
293
339
|
class Performance1536D50K(PerformanceCase):
|
294
340
|
case_id: CaseType = CaseType.Performance1536D50K
|
295
341
|
filter_rate: float | int | None = None
|
@@ -318,7 +364,10 @@ class PerformanceCustomDataset(PerformanceCase):
|
|
318
364
|
case_id: CaseType = CaseType.PerformanceCustomDataset
|
319
365
|
name: str = "Performance With Custom Dataset"
|
320
366
|
description: str = ""
|
367
|
+
gt_file: str
|
321
368
|
dataset: DatasetManager
|
369
|
+
label_percentage: float | None = None
|
370
|
+
use_filter: bool
|
322
371
|
|
323
372
|
def __init__(
|
324
373
|
self,
|
@@ -327,6 +376,8 @@ class PerformanceCustomDataset(PerformanceCase):
|
|
327
376
|
load_timeout: float,
|
328
377
|
optimize_timeout: float,
|
329
378
|
dataset_config: dict,
|
379
|
+
label_percentage: float | None = None,
|
380
|
+
use_filter: bool = False,
|
330
381
|
**kwargs,
|
331
382
|
):
|
332
383
|
dataset_config = CustomDatasetConfig(**dataset_config)
|
@@ -339,16 +390,124 @@ class PerformanceCustomDataset(PerformanceCase):
|
|
339
390
|
with_gt=dataset_config.with_gt,
|
340
391
|
dir=dataset_config.dir,
|
341
392
|
file_num=dataset_config.file_count,
|
393
|
+
train_file=dataset_config.train_name,
|
394
|
+
test_file=f"{dataset_config.test_name}.parquet",
|
395
|
+
train_id_field=dataset_config.train_id_name,
|
396
|
+
train_vector_field=dataset_config.train_col_name,
|
397
|
+
test_vector_field=dataset_config.test_col_name,
|
398
|
+
gt_neighbors_field=dataset_config.gt_col_name,
|
399
|
+
scalar_labels_file=f"{dataset_config.scalar_labels_name}.parquet",
|
342
400
|
)
|
343
401
|
super().__init__(
|
344
402
|
name=name,
|
345
403
|
description=description,
|
346
404
|
load_timeout=load_timeout,
|
347
405
|
optimize_timeout=optimize_timeout,
|
406
|
+
gt_file=f"{dataset_config.gt_name}.parquet",
|
348
407
|
dataset=DatasetManager(data=dataset),
|
408
|
+
use_filter=use_filter,
|
409
|
+
label_percentage=label_percentage,
|
410
|
+
)
|
411
|
+
|
412
|
+
@property
|
413
|
+
def filters(self) -> Filter:
|
414
|
+
if self.use_filter is True:
|
415
|
+
return LabelFilter(label_percentage=self.label_percentage)
|
416
|
+
return NonFilter(gt_file_name=self.gt_file)
|
417
|
+
|
418
|
+
|
419
|
+
class StreamingPerformanceCase(Case):
|
420
|
+
case_id: CaseType = CaseType.StreamingPerformanceCase
|
421
|
+
label: CaseLabel = CaseLabel.Streaming
|
422
|
+
dataset_with_size_type: DatasetWithSizeType
|
423
|
+
insert_rate: int
|
424
|
+
search_stages: list[float]
|
425
|
+
concurrencies: list[int]
|
426
|
+
optimize_after_write: bool = True
|
427
|
+
read_dur_after_write: int = 30
|
428
|
+
|
429
|
+
def __init__(
|
430
|
+
self,
|
431
|
+
dataset_with_size_type: DatasetWithSizeType | str = DatasetWithSizeType.CohereSmall.value,
|
432
|
+
insert_rate: int = 500,
|
433
|
+
search_stages: list[float] | str = (0.5, 0.8),
|
434
|
+
concurrencies: list[int] | str = (5, 10),
|
435
|
+
**kwargs,
|
436
|
+
):
|
437
|
+
num_per_batch = config.NUM_PER_BATCH
|
438
|
+
if insert_rate % config.NUM_PER_BATCH != 0:
|
439
|
+
_insert_rate = max(
|
440
|
+
num_per_batch,
|
441
|
+
insert_rate // num_per_batch * num_per_batch,
|
442
|
+
)
|
443
|
+
log.warning(
|
444
|
+
f"[streaming_case init] insert_rate(={insert_rate}) should be "
|
445
|
+
f"divisible by NUM_PER_BATCH={num_per_batch}), reset to {_insert_rate}",
|
446
|
+
)
|
447
|
+
insert_rate = _insert_rate
|
448
|
+
if not isinstance(dataset_with_size_type, DatasetWithSizeType):
|
449
|
+
dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)
|
450
|
+
dataset = dataset_with_size_type.get_manager()
|
451
|
+
name = f"Streaming-Perf - {dataset_with_size_type.value}, {insert_rate} rows/s"
|
452
|
+
description = (
|
453
|
+
"This case tests the search performance of vector database while maintaining "
|
454
|
+
f"a fixed insertion speed. (dataset: {dataset_with_size_type.value})"
|
455
|
+
)
|
456
|
+
|
457
|
+
if isinstance(search_stages, str):
|
458
|
+
search_stages = json.loads(search_stages)
|
459
|
+
if isinstance(concurrencies, str):
|
460
|
+
concurrencies = json.loads(concurrencies)
|
461
|
+
|
462
|
+
super().__init__(
|
463
|
+
name=name,
|
464
|
+
description=description,
|
465
|
+
dataset=dataset,
|
466
|
+
dataset_with_size_type=dataset_with_size_type,
|
467
|
+
insert_rate=insert_rate,
|
468
|
+
search_stages=search_stages,
|
469
|
+
concurrencies=concurrencies,
|
470
|
+
**kwargs,
|
349
471
|
)
|
350
472
|
|
351
473
|
|
474
|
+
class LabelFilterPerformanceCase(PerformanceCase):
|
475
|
+
case_id: CaseType = CaseType.LabelFilterPerformanceCase
|
476
|
+
dataset_with_size_type: DatasetWithSizeType
|
477
|
+
label_percentage: float
|
478
|
+
|
479
|
+
def __init__(
|
480
|
+
self,
|
481
|
+
dataset_with_size_type: DatasetWithSizeType | str,
|
482
|
+
label_percentage: float,
|
483
|
+
**kwargs,
|
484
|
+
):
|
485
|
+
if not isinstance(dataset_with_size_type, DatasetWithSizeType):
|
486
|
+
dataset_with_size_type = DatasetWithSizeType(dataset_with_size_type)
|
487
|
+
name = f"Label-Filter-{label_percentage*100:.1f}% - {dataset_with_size_type.value}"
|
488
|
+
description = f"Label-Filter-{label_percentage*100:.1f}% Performance Test ({dataset_with_size_type.value})"
|
489
|
+
dataset = dataset_with_size_type.get_manager()
|
490
|
+
load_timeout = dataset_with_size_type.get_load_timeout()
|
491
|
+
optimize_timeout = dataset_with_size_type.get_optimize_timeout()
|
492
|
+
filters = LabelFilter(label_percentage=label_percentage)
|
493
|
+
filter_rate = filters.filter_rate
|
494
|
+
super().__init__(
|
495
|
+
name=name,
|
496
|
+
description=description,
|
497
|
+
dataset=dataset,
|
498
|
+
load_timeout=load_timeout,
|
499
|
+
optimize_timeout=optimize_timeout,
|
500
|
+
filter_rate=filter_rate,
|
501
|
+
dataset_with_size_type=dataset_with_size_type,
|
502
|
+
label_percentage=label_percentage,
|
503
|
+
**kwargs,
|
504
|
+
)
|
505
|
+
|
506
|
+
@property
|
507
|
+
def filters(self) -> Filter:
|
508
|
+
return LabelFilter(label_percentage=self.label_percentage)
|
509
|
+
|
510
|
+
|
352
511
|
type2case = {
|
353
512
|
CaseType.CapacityDim960: CapacityDim960,
|
354
513
|
CaseType.CapacityDim128: CapacityDim128,
|
@@ -365,6 +524,10 @@ type2case = {
|
|
365
524
|
CaseType.Performance1536D5M1P: Performance1536D5M1P,
|
366
525
|
CaseType.Performance1536D500K99P: Performance1536D500K99P,
|
367
526
|
CaseType.Performance1536D5M99P: Performance1536D5M99P,
|
527
|
+
CaseType.Performance1024D1M: Performance1024D1M,
|
528
|
+
CaseType.Performance1024D10M: Performance1024D10M,
|
368
529
|
CaseType.Performance1536D50K: Performance1536D50K,
|
369
530
|
CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
|
531
|
+
CaseType.StreamingPerformanceCase: StreamingPerformanceCase,
|
532
|
+
CaseType.LabelFilterPerformanceCase: LabelFilterPerformanceCase,
|
370
533
|
}
|
@@ -47,6 +47,7 @@ class DB(Enum):
|
|
47
47
|
Clickhouse = "Clickhouse"
|
48
48
|
Vespa = "Vespa"
|
49
49
|
LanceDB = "LanceDB"
|
50
|
+
OceanBase = "OceanBase"
|
50
51
|
|
51
52
|
@property
|
52
53
|
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
|
@@ -151,6 +152,11 @@ class DB(Enum):
|
|
151
152
|
|
152
153
|
return MongoDB
|
153
154
|
|
155
|
+
if self == DB.OceanBase:
|
156
|
+
from .oceanbase.oceanbase import OceanBase
|
157
|
+
|
158
|
+
return OceanBase
|
159
|
+
|
154
160
|
if self == DB.MariaDB:
|
155
161
|
from .mariadb.mariadb import MariaDB
|
156
162
|
|
@@ -282,6 +288,11 @@ class DB(Enum):
|
|
282
288
|
|
283
289
|
return MongoDBConfig
|
284
290
|
|
291
|
+
if self == DB.OceanBase:
|
292
|
+
from .oceanbase.config import OceanBaseConfig
|
293
|
+
|
294
|
+
return OceanBaseConfig
|
295
|
+
|
285
296
|
if self == DB.MariaDB:
|
286
297
|
from .mariadb.config import MariaDBConfig
|
287
298
|
|
@@ -394,6 +405,11 @@ class DB(Enum):
|
|
394
405
|
|
395
406
|
return MongoDBIndexConfig
|
396
407
|
|
408
|
+
if self == DB.OceanBase:
|
409
|
+
from .oceanbase.config import _oceanbase_case_config
|
410
|
+
|
411
|
+
return _oceanbase_case_config.get(index_type)
|
412
|
+
|
397
413
|
if self == DB.MariaDB:
|
398
414
|
from .mariadb.config import _mariadb_case_config
|
399
415
|
|
@@ -4,6 +4,8 @@ from enum import Enum
|
|
4
4
|
|
5
5
|
from pydantic import BaseModel, SecretStr, validator
|
6
6
|
|
7
|
+
from vectordb_bench.backend.filter import Filter, FilterOp
|
8
|
+
|
7
9
|
|
8
10
|
class MetricType(str, Enum):
|
9
11
|
L2 = "L2"
|
@@ -17,6 +19,7 @@ class MetricType(str, Enum):
|
|
17
19
|
class IndexType(str, Enum):
|
18
20
|
HNSW = "HNSW"
|
19
21
|
HNSW_SQ = "HNSW_SQ"
|
22
|
+
HNSW_BQ = "HNSW_BQ"
|
20
23
|
HNSW_PQ = "HNSW_PQ"
|
21
24
|
HNSW_PRQ = "HNSW_PRQ"
|
22
25
|
DISKANN = "DISKANN"
|
@@ -28,6 +31,9 @@ class IndexType(str, Enum):
|
|
28
31
|
Flat = "FLAT"
|
29
32
|
AUTOINDEX = "AUTOINDEX"
|
30
33
|
ES_HNSW = "hnsw"
|
34
|
+
ES_HNSW_INT8 = "int8_hnsw"
|
35
|
+
ES_HNSW_INT4 = "int4_hnsw"
|
36
|
+
ES_HNSW_BBQ = "bbq_hnsw"
|
31
37
|
ES_IVFFlat = "ivfflat"
|
32
38
|
GPU_IVF_FLAT = "GPU_IVF_FLAT"
|
33
39
|
GPU_BRUTE_FORCE = "GPU_BRUTE_FORCE"
|
@@ -125,6 +131,21 @@ class VectorDB(ABC):
|
|
125
131
|
>>> milvus.search_embedding()
|
126
132
|
"""
|
127
133
|
|
134
|
+
"The filtering types supported by the VectorDB Client, default only non-filter"
|
135
|
+
supported_filter_types: list[FilterOp] = [FilterOp.NonFilter]
|
136
|
+
|
137
|
+
@classmethod
|
138
|
+
def filter_supported(cls, filters: Filter) -> bool:
|
139
|
+
"""Ensure that the filters are supported before testing filtering cases."""
|
140
|
+
return filters.type in cls.supported_filter_types
|
141
|
+
|
142
|
+
def prepare_filter(self, filters: Filter):
|
143
|
+
"""The vector database is allowed to pre-prepare different filter conditions
|
144
|
+
to reduce redundancy during the testing process.
|
145
|
+
|
146
|
+
(All search tests in a case use consistent filtering conditions.)"""
|
147
|
+
return
|
148
|
+
|
128
149
|
@abstractmethod
|
129
150
|
def __init__(
|
130
151
|
self,
|
@@ -175,6 +196,7 @@ class VectorDB(ABC):
|
|
175
196
|
self,
|
176
197
|
embeddings: list[list[float]],
|
177
198
|
metadata: list[int],
|
199
|
+
labels_data: list[str] | None = None,
|
178
200
|
**kwargs,
|
179
201
|
) -> tuple[int, Exception]:
|
180
202
|
"""Insert the embeddings to the vector database. The default number of embeddings for
|
@@ -195,7 +217,6 @@ class VectorDB(ABC):
|
|
195
217
|
self,
|
196
218
|
query: list[float],
|
197
219
|
k: int = 100,
|
198
|
-
filters: dict | None = None,
|
199
220
|
) -> list[int]:
|
200
221
|
"""Get k most similar embeddings to query vector.
|
201
222
|
|