vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -3
- vectordb_bench/backend/assembler.py +2 -2
- vectordb_bench/backend/cases.py +146 -57
- vectordb_bench/backend/clients/__init__.py +6 -1
- vectordb_bench/backend/clients/api.py +23 -11
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +11 -9
- vectordb_bench/backend/clients/milvus/config.py +2 -3
- vectordb_bench/backend/clients/milvus/milvus.py +32 -19
- vectordb_bench/backend/clients/pgvector/config.py +49 -0
- vectordb_bench/backend/clients/pgvector/pgvector.py +171 -0
- vectordb_bench/backend/clients/pinecone/config.py +3 -3
- vectordb_bench/backend/clients/pinecone/pinecone.py +19 -13
- vectordb_bench/backend/clients/qdrant_cloud/config.py +23 -6
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +12 -13
- vectordb_bench/backend/clients/weaviate_cloud/config.py +3 -3
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +9 -8
- vectordb_bench/backend/clients/zilliz_cloud/config.py +5 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +3 -1
- vectordb_bench/backend/dataset.py +100 -162
- vectordb_bench/backend/result_collector.py +2 -2
- vectordb_bench/backend/runner/mp_runner.py +29 -13
- vectordb_bench/backend/runner/serial_runner.py +98 -36
- vectordb_bench/backend/task_runner.py +43 -48
- vectordb_bench/frontend/components/check_results/charts.py +10 -21
- vectordb_bench/frontend/components/check_results/data.py +31 -15
- vectordb_bench/frontend/components/check_results/expanderStyle.py +37 -0
- vectordb_bench/frontend/components/check_results/filters.py +61 -33
- vectordb_bench/frontend/components/check_results/footer.py +8 -0
- vectordb_bench/frontend/components/check_results/headerIcon.py +8 -4
- vectordb_bench/frontend/components/check_results/nav.py +7 -6
- vectordb_bench/frontend/components/check_results/priceTable.py +3 -2
- vectordb_bench/frontend/components/check_results/stPageConfig.py +18 -0
- vectordb_bench/frontend/components/get_results/saveAsImage.py +50 -0
- vectordb_bench/frontend/components/run_test/autoRefresh.py +1 -1
- vectordb_bench/frontend/components/run_test/caseSelector.py +19 -16
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +20 -7
- vectordb_bench/frontend/components/run_test/dbSelector.py +5 -5
- vectordb_bench/frontend/components/run_test/hideSidebar.py +4 -6
- vectordb_bench/frontend/components/run_test/submitTask.py +16 -10
- vectordb_bench/frontend/const/dbCaseConfigs.py +291 -0
- vectordb_bench/frontend/const/dbPrices.py +6 -0
- vectordb_bench/frontend/const/styles.py +58 -0
- vectordb_bench/frontend/pages/{qps_with_price.py → quries_per_dollar.py} +24 -17
- vectordb_bench/frontend/pages/run_test.py +17 -11
- vectordb_bench/frontend/vdb_benchmark.py +19 -12
- vectordb_bench/metric.py +19 -10
- vectordb_bench/models.py +14 -40
- vectordb_bench/results/dbPrices.json +32 -0
- vectordb_bench/results/getLeaderboardData.py +52 -0
- vectordb_bench/results/leaderboard.json +1 -0
- vectordb_bench/results/{result_20230609_standard.json → result_20230705_standard.json} +1910 -897
- {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/METADATA +107 -27
- vectordb_bench-0.0.3.dist-info/RECORD +67 -0
- vectordb_bench/frontend/const.py +0 -391
- vectordb_bench-0.0.1.dist-info/RECORD +0 -56
- {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/WHEEL +0 -0
- {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,20 @@
|
|
1
1
|
"""
|
2
2
|
Usage:
|
3
|
-
>>> from xxx import
|
4
|
-
>>>
|
5
|
-
>>> gist_s.dict()
|
6
|
-
dataset: {'data': {'name': 'GIST', 'dim': 128, 'metric_type': 'L2', 'label': 'SMALL', 'size': 50000000}, 'data_dir': 'xxx'}
|
3
|
+
>>> from xxx.dataset import Dataset
|
4
|
+
>>> Dataset.Cohere.get(100_000)
|
7
5
|
"""
|
8
6
|
|
9
7
|
import os
|
10
8
|
import logging
|
11
9
|
import pathlib
|
12
|
-
import math
|
13
10
|
from hashlib import md5
|
14
|
-
from enum import Enum
|
15
|
-
from typing import Any
|
16
|
-
|
11
|
+
from enum import Enum
|
17
12
|
import s3fs
|
18
13
|
import pandas as pd
|
19
14
|
from tqdm import tqdm
|
20
|
-
from pydantic
|
15
|
+
from pydantic import validator, PrivateAttr
|
16
|
+
import polars as pl
|
17
|
+
from pyarrow.parquet import ParquetFile
|
21
18
|
|
22
19
|
from ..base import BaseModel
|
23
20
|
from .. import config
|
@@ -26,118 +23,83 @@ from . import utils
|
|
26
23
|
|
27
24
|
log = logging.getLogger(__name__)
|
28
25
|
|
29
|
-
|
30
|
-
class
|
31
|
-
name: str
|
32
|
-
|
33
|
-
|
34
|
-
|
26
|
+
|
27
|
+
class BaseDataset(BaseModel):
|
28
|
+
name: str
|
29
|
+
size: int
|
30
|
+
dim: int
|
31
|
+
metric_type: MetricType
|
32
|
+
use_shuffled: bool
|
33
|
+
_size_label: dict = PrivateAttr()
|
34
|
+
|
35
|
+
@validator("size")
|
36
|
+
def verify_size(cls, v):
|
37
|
+
if v not in cls._size_label:
|
38
|
+
raise ValueError(f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}")
|
39
|
+
return v
|
40
|
+
|
41
|
+
@property
|
42
|
+
def label(self) -> str:
|
43
|
+
return self._size_label.get(self.size)
|
35
44
|
|
36
45
|
@property
|
37
46
|
def dir_name(self) -> str:
|
38
47
|
return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
|
39
48
|
|
40
|
-
|
41
|
-
class
|
49
|
+
|
50
|
+
class LAION(BaseDataset):
|
51
|
+
name: str = "LAION"
|
52
|
+
dim: int = 768
|
53
|
+
metric_type: MetricType = MetricType.L2
|
54
|
+
use_shuffled: bool = False
|
55
|
+
_size_label: dict = {100_000_000: "LARGE"}
|
56
|
+
|
57
|
+
|
58
|
+
class GIST(BaseDataset):
|
42
59
|
name: str = "GIST"
|
43
60
|
dim: int = 960
|
44
61
|
metric_type: MetricType = MetricType.L2
|
45
62
|
use_shuffled: bool = False
|
63
|
+
_size_label: dict = {
|
64
|
+
100_000: "SMALL",
|
65
|
+
1_000_000: "MEDIUM",
|
66
|
+
}
|
46
67
|
|
47
|
-
@property
|
48
|
-
def dir_name(self) -> str:
|
49
|
-
return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
|
50
68
|
|
51
|
-
|
52
|
-
class Cohere:
|
69
|
+
class Cohere(BaseDataset):
|
53
70
|
name: str = "Cohere"
|
54
71
|
dim: int = 768
|
55
72
|
metric_type: MetricType = MetricType.COSINE
|
56
73
|
use_shuffled: bool = config.USE_SHUFFLED_DATA
|
74
|
+
_size_label: dict = {
|
75
|
+
100_000: "SMALL",
|
76
|
+
1_000_000: "MEDIUM",
|
77
|
+
10_000_000: "LARGE",
|
78
|
+
}
|
57
79
|
|
58
|
-
@property
|
59
|
-
def dir_name(self) -> str:
|
60
|
-
return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
|
61
80
|
|
62
|
-
|
63
|
-
class Glove:
|
81
|
+
class Glove(BaseDataset):
|
64
82
|
name: str = "Glove"
|
65
83
|
dim: int = 200
|
66
84
|
metric_type: MetricType = MetricType.COSINE
|
67
85
|
use_shuffled: bool = False
|
86
|
+
_size_label: dict = {1_000_000: "MEDIUM"}
|
68
87
|
|
69
|
-
@property
|
70
|
-
def dir_name(self) -> str:
|
71
|
-
return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
|
72
88
|
|
73
|
-
|
74
|
-
class SIFT:
|
89
|
+
class SIFT(BaseDataset):
|
75
90
|
name: str = "SIFT"
|
76
91
|
dim: int = 128
|
77
|
-
metric_type: MetricType = MetricType.
|
92
|
+
metric_type: MetricType = MetricType.L2
|
78
93
|
use_shuffled: bool = False
|
94
|
+
_size_label: dict = {
|
79
95
|
|
80
|
-
|
81
|
-
|
82
|
-
|
96
|
+
500_000: "SMALL",
|
97
|
+
5_000_000: "MEDIUM",
|
98
|
+
50_000_000: "LARGE",
|
99
|
+
}
|
83
100
|
|
84
|
-
|
85
|
-
class
|
86
|
-
label: str = "LARGE"
|
87
|
-
size: int = 100_000_000
|
88
|
-
|
89
|
-
@dataclass
|
90
|
-
class GIST_S(GIST):
|
91
|
-
label: str = "SMALL"
|
92
|
-
size: int = 100_000
|
93
|
-
|
94
|
-
@dataclass
|
95
|
-
class GIST_M(GIST):
|
96
|
-
label: str = "MEDIUM"
|
97
|
-
size: int = 1_000_000
|
98
|
-
|
99
|
-
@dataclass
|
100
|
-
class Cohere_S(Cohere):
|
101
|
-
label: str = "SMALL"
|
102
|
-
size: int = 100_000
|
103
|
-
|
104
|
-
@dataclass
|
105
|
-
class Cohere_M(Cohere):
|
106
|
-
label: str = "MEDIUM"
|
107
|
-
size: int = 1_000_000
|
108
|
-
|
109
|
-
@dataclass
|
110
|
-
class Cohere_L(Cohere):
|
111
|
-
label : str = "LARGE"
|
112
|
-
size : int = 10_000_000
|
113
|
-
|
114
|
-
@dataclass
|
115
|
-
class Glove_S(Glove):
|
116
|
-
label: str = "SMALL"
|
117
|
-
size : int = 100_000
|
118
|
-
|
119
|
-
@dataclass
|
120
|
-
class Glove_M(Glove):
|
121
|
-
label: str = "MEDIUM"
|
122
|
-
size : int = 1_000_000
|
123
|
-
|
124
|
-
@dataclass
|
125
|
-
class SIFT_S(SIFT):
|
126
|
-
label: str = "SMALL"
|
127
|
-
size : int = 500_000
|
128
|
-
|
129
|
-
@dataclass
|
130
|
-
class SIFT_M(SIFT):
|
131
|
-
label: str = "MEDIUM"
|
132
|
-
size : int = 5_000_000
|
133
|
-
|
134
|
-
@dataclass
|
135
|
-
class SIFT_L(SIFT):
|
136
|
-
label: str = "LARGE"
|
137
|
-
size : int = 50_000_000
|
138
|
-
|
139
|
-
|
140
|
-
class DataSet(BaseModel):
|
101
|
+
|
102
|
+
class DatasetManager(BaseModel):
|
141
103
|
"""Download dataset if not in the local directory. Provide data for cases.
|
142
104
|
|
143
105
|
DataSet is iterable, each iteration will return the next batch of data in pandas.DataFrame
|
@@ -147,12 +109,12 @@ class DataSet(BaseModel):
|
|
147
109
|
>>> for data in cohere_s:
|
148
110
|
>>> print(data.columns)
|
149
111
|
"""
|
150
|
-
data:
|
112
|
+
data: BaseDataset
|
151
113
|
test_data: pd.DataFrame | None = None
|
152
114
|
train_files : list[str] = []
|
153
115
|
|
154
116
|
def __eq__(self, obj):
|
155
|
-
if isinstance(obj,
|
117
|
+
if isinstance(obj, DatasetManager):
|
156
118
|
return self.data.name == obj.data.name and \
|
157
119
|
self.data.label == obj.data.label
|
158
120
|
return False
|
@@ -304,90 +266,66 @@ class DataSet(BaseModel):
|
|
304
266
|
|
305
267
|
def _read_file(self, file_name: str) -> pd.DataFrame:
|
306
268
|
"""read one file from disk into memory"""
|
307
|
-
|
308
|
-
|
269
|
+
log.info(f"Read the entire file into memory: {file_name}")
|
309
270
|
p = pathlib.Path(self.data_dir, file_name)
|
310
|
-
log.info(f"reading file into memory: {p}")
|
311
271
|
if not p.exists():
|
312
272
|
log.warning(f"No such file: {p}")
|
313
273
|
return pd.DataFrame()
|
314
|
-
|
315
|
-
|
316
|
-
return df
|
274
|
+
|
275
|
+
return pl.read_parquet(p)
|
317
276
|
|
318
277
|
|
319
278
|
class DataSetIterator:
|
320
|
-
def __init__(self, dataset:
|
279
|
+
def __init__(self, dataset: DatasetManager):
|
321
280
|
self._ds = dataset
|
322
281
|
self._idx = 0 # file number
|
323
|
-
self.
|
282
|
+
self._cur = None
|
324
283
|
self._sub_idx = [0 for i in range(len(self._ds.train_files))] # iter num for each file
|
325
284
|
|
285
|
+
def _get_iter(self, file_name: str):
|
286
|
+
p = pathlib.Path(self._ds.data_dir, file_name)
|
287
|
+
log.info(f"Get iterator for {p.name}")
|
288
|
+
if not p.exists():
|
289
|
+
raise IndexError(f"No such file {p}")
|
290
|
+
log.warning(f"No such file: {p}")
|
291
|
+
return ParquetFile(p).iter_batches(config.NUM_PER_BATCH)
|
292
|
+
|
326
293
|
def __next__(self) -> pd.DataFrame:
|
327
294
|
"""return the data in the next file of the training list"""
|
328
295
|
if self._idx < len(self._ds.train_files):
|
329
|
-
|
330
|
-
if _sub == 0 and self._idx == 0: # init
|
296
|
+
if self._cur is None:
|
331
297
|
file_name = self._ds.train_files[self._idx]
|
332
|
-
self.
|
333
|
-
self._iter_num = math.ceil(self._curr.shape[0]/100_000)
|
298
|
+
self._cur = self._get_iter(file_name)
|
334
299
|
|
335
|
-
|
300
|
+
try:
|
301
|
+
return next(self._cur).to_pandas()
|
302
|
+
except StopIteration:
|
336
303
|
if self._idx == len(self._ds.train_files) - 1:
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
self._curr = None
|
344
|
-
file_name = self._ds.train_files[self._idx]
|
345
|
-
self._curr = self._ds._read_file(file_name)
|
346
|
-
|
347
|
-
sub_df = self._curr[_sub*100_000: (_sub+1)*100_000]
|
348
|
-
self._sub_idx[self._idx] += 1
|
349
|
-
log.info(f"Get the [{_sub+1}/{self._iter_num}] batch of {self._idx+1}/{len(self._ds.train_files)} train file")
|
350
|
-
return sub_df
|
351
|
-
self._curr = None
|
304
|
+
raise StopIteration from None
|
305
|
+
|
306
|
+
self._idx += 1
|
307
|
+
file_name = self._ds.train_files[self._idx]
|
308
|
+
self._cur = self._get_iter(file_name)
|
309
|
+
return next(self._cur).to_pandas()
|
352
310
|
raise StopIteration
|
353
311
|
|
354
312
|
|
355
|
-
class
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
Label.SMALL: DataSet(data=Cohere_S()),
|
375
|
-
Label.MEDIUM: DataSet(data=Cohere_M()),
|
376
|
-
Label.LARGE: DataSet(data=Cohere_L()),
|
377
|
-
},
|
378
|
-
Name.Glove:{
|
379
|
-
Label.SMALL: DataSet(data=Glove_S()),
|
380
|
-
Label.MEDIUM: DataSet(data=Glove_M()),
|
381
|
-
},
|
382
|
-
Name.SIFT: {
|
383
|
-
Label.SMALL: DataSet(data=SIFT_S()),
|
384
|
-
Label.MEDIUM: DataSet(data=SIFT_M()),
|
385
|
-
Label.LARGE: DataSet(data=SIFT_L()),
|
386
|
-
},
|
387
|
-
Name.LAION: {
|
388
|
-
Label.LARGE: DataSet(data=LAION_L()),
|
389
|
-
},
|
390
|
-
}
|
391
|
-
|
392
|
-
def get(ds: Name, label: Label):
|
393
|
-
return _global_ds_mapping.get(ds, {}).get(label)
|
313
|
+
class Dataset(Enum):
|
314
|
+
"""
|
315
|
+
Values are dataset classes; DO NOT use them
|
316
|
+
Example:
|
317
|
+
>>> all_dataset = [ds.name for ds in Dataset]
|
318
|
+
>>> Dataset.COHERE.manager(100_000)
|
319
|
+
>>> Dataset.COHERE.get(100_000)
|
320
|
+
"""
|
321
|
+
LAION = LAION
|
322
|
+
GIST = GIST
|
323
|
+
COHERE = Cohere
|
324
|
+
GLOVE = Glove
|
325
|
+
SIFT = SIFT
|
326
|
+
|
327
|
+
def get(self, size: int) -> BaseDataset:
|
328
|
+
return self.value(size=size)
|
329
|
+
|
330
|
+
def manager(self, size: int) -> DatasetManager:
|
331
|
+
return DatasetManager(data=self.get(size))
|
@@ -6,10 +6,10 @@ class ResultCollector:
|
|
6
6
|
@classmethod
|
7
7
|
def collect(cls, result_dir: pathlib.Path) -> list[TestResult]:
|
8
8
|
results = []
|
9
|
-
if not result_dir.exists() or len(list(result_dir.glob("*.json"))) == 0:
|
9
|
+
if not result_dir.exists() or len(list(result_dir.glob("result_*.json"))) == 0:
|
10
10
|
return []
|
11
11
|
|
12
|
-
for json_file in result_dir.glob("*.json"):
|
12
|
+
for json_file in result_dir.glob("result_*.json"):
|
13
13
|
results.append(TestResult.read_file(json_file, trans_unit=True))
|
14
14
|
|
15
15
|
return results
|
@@ -40,7 +40,12 @@ class MultiProcessingSearchRunner:
|
|
40
40
|
self.test_data = utils.SharedNumpyArray(test_data)
|
41
41
|
log.debug(f"test dataset columns: {len(test_data)}")
|
42
42
|
|
43
|
-
def search(self, test_np: utils.SharedNumpyArray) -> tuple[int, float]:
|
43
|
+
def search(self, test_np: utils.SharedNumpyArray, q: mp.Queue, cond: mp.Condition) -> tuple[int, float]:
|
44
|
+
# sync all processes
|
45
|
+
q.put(1)
|
46
|
+
with cond:
|
47
|
+
cond.wait()
|
48
|
+
|
44
49
|
with self.db.init():
|
45
50
|
test_data = test_np.read().tolist()
|
46
51
|
num, idx = len(test_data), 0
|
@@ -77,7 +82,7 @@ class MultiProcessingSearchRunner:
|
|
77
82
|
|
78
83
|
@staticmethod
|
79
84
|
def get_mp_context():
|
80
|
-
mp_start_method = "
|
85
|
+
mp_start_method = "spawn"
|
81
86
|
log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
|
82
87
|
return mp.get_context(mp_start_method)
|
83
88
|
|
@@ -85,21 +90,32 @@ class MultiProcessingSearchRunner:
|
|
85
90
|
max_qps = 0
|
86
91
|
try:
|
87
92
|
for conc in self.concurrencies:
|
88
|
-
with
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
93
|
+
with mp.Manager() as m:
|
94
|
+
q, cond = m.Queue(), m.Condition()
|
95
|
+
with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
|
96
|
+
log.info(f"Start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
|
97
|
+
future_iter = [executor.submit(self.search, self.test_data, q, cond) for i in range(conc)]
|
98
|
+
# Sync all processes
|
99
|
+
while q.qsize() < conc:
|
100
|
+
sleep_t = conc if conc < 10 else 10
|
101
|
+
time.sleep(sleep_t)
|
102
|
+
|
103
|
+
with cond:
|
104
|
+
cond.notify_all()
|
105
|
+
log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
|
106
|
+
|
107
|
+
start = time.perf_counter()
|
108
|
+
all_count = sum([r.result()[0] for r in future_iter])
|
109
|
+
cost = time.perf_counter() - start
|
110
|
+
|
111
|
+
qps = round(all_count / cost, 4)
|
112
|
+
log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
|
97
113
|
|
98
114
|
if qps > max_qps:
|
99
115
|
max_qps = qps
|
100
|
-
log.info(f"
|
116
|
+
log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
|
101
117
|
except Exception as e:
|
102
|
-
log.warning(f"
|
118
|
+
log.warning(f"Fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
|
103
119
|
traceback.print_exc()
|
104
120
|
|
105
121
|
# No results available, raise exception
|
@@ -4,53 +4,99 @@ import traceback
|
|
4
4
|
import concurrent
|
5
5
|
import multiprocessing as mp
|
6
6
|
import math
|
7
|
+
import psutil
|
8
|
+
|
7
9
|
import numpy as np
|
8
10
|
import pandas as pd
|
9
11
|
|
10
12
|
from ..clients import api
|
11
13
|
from ...metric import calc_recall
|
12
|
-
from ...models import LoadTimeoutError
|
14
|
+
from ...models import LoadTimeoutError, PerformanceTimeoutError
|
13
15
|
from .. import utils
|
14
16
|
from ... import config
|
17
|
+
from vectordb_bench.backend.dataset import DatasetManager
|
15
18
|
|
16
19
|
NUM_PER_BATCH = config.NUM_PER_BATCH
|
17
|
-
|
20
|
+
LOAD_MAX_TRY_COUNT = 10
|
21
|
+
WAITTING_TIME = 60
|
18
22
|
|
19
23
|
log = logging.getLogger(__name__)
|
20
24
|
|
21
|
-
|
22
25
|
class SerialInsertRunner:
|
23
|
-
def __init__(self, db: api.VectorDB,
|
24
|
-
|
26
|
+
def __init__(self, db: api.VectorDB, dataset: DatasetManager, normalize: bool, timeout: float | None = None):
|
27
|
+
self.timeout = timeout if isinstance(timeout, (int, float)) else None
|
28
|
+
self.dataset = dataset
|
25
29
|
self.db = db
|
26
|
-
self.
|
27
|
-
self.train_id = train_id
|
30
|
+
self.normalize = normalize
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
def insert_data(self, left_id: int = 0) -> int:
|
32
|
+
def task(self) -> int:
|
33
|
+
count = 0
|
32
34
|
with self.db.init():
|
33
|
-
|
35
|
+
log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
|
36
|
+
start = time.perf_counter()
|
37
|
+
for data_df in self.dataset:
|
38
|
+
all_metadata = data_df['id'].tolist()
|
39
|
+
|
40
|
+
emb_np = np.stack(data_df['emb'])
|
41
|
+
if self.normalize:
|
42
|
+
log.debug("normalize the 100k train data")
|
43
|
+
all_embeddings = emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis].tolist()
|
44
|
+
else:
|
45
|
+
all_embeddings = emb_np.tolist()
|
46
|
+
del(emb_np)
|
47
|
+
log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")
|
48
|
+
|
49
|
+
last_batch = self.dataset.data.size - count == len(all_metadata)
|
50
|
+
insert_count, error = self.db.insert_embeddings(
|
51
|
+
embeddings=all_embeddings,
|
52
|
+
metadata=all_metadata,
|
53
|
+
last_batch=last_batch,
|
54
|
+
)
|
55
|
+
if error is not None:
|
56
|
+
raise error
|
57
|
+
|
58
|
+
assert insert_count == len(all_metadata)
|
59
|
+
count += insert_count
|
60
|
+
if count % 100_000 == 0:
|
61
|
+
log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
|
62
|
+
|
63
|
+
log.info(f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}")
|
64
|
+
return count
|
34
65
|
|
66
|
+
def endless_insert_data(self, all_embeddings, all_metadata, left_id: int = 0) -> int:
|
67
|
+
with self.db.init():
|
35
68
|
# unique id for endlessness insertion
|
36
|
-
all_metadata = [i+left_id for i in
|
69
|
+
all_metadata = [i+left_id for i in all_metadata]
|
37
70
|
|
38
|
-
|
71
|
+
NUM_BATCHES = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
|
39
72
|
log.info(f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
|
40
73
|
count = 0
|
41
|
-
for batch_id in range(
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
74
|
+
for batch_id in range(NUM_BATCHES):
|
75
|
+
retry_count = 0
|
76
|
+
already_insert_count = 0
|
77
|
+
metadata = all_metadata[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
|
78
|
+
embeddings = all_embeddings[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
|
79
|
+
|
80
|
+
log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Start inserting {len(metadata)} embeddings")
|
81
|
+
while retry_count < LOAD_MAX_TRY_COUNT:
|
82
|
+
insert_count, error = self.db.insert_embeddings(
|
83
|
+
embeddings=embeddings[already_insert_count :],
|
84
|
+
metadata=metadata[already_insert_count :],
|
85
|
+
)
|
86
|
+
already_insert_count += insert_count
|
87
|
+
if error is not None:
|
88
|
+
retry_count += 1
|
89
|
+
time.sleep(WAITTING_TIME)
|
90
|
+
|
91
|
+
log.info(f"Failed to insert data, try {retry_count} time")
|
92
|
+
if retry_count >= LOAD_MAX_TRY_COUNT:
|
93
|
+
raise error
|
94
|
+
else:
|
95
|
+
break
|
96
|
+
log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Finish inserting {len(metadata)} embeddings")
|
97
|
+
|
98
|
+
assert already_insert_count == len(metadata)
|
99
|
+
count += already_insert_count
|
54
100
|
log.info(f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
|
55
101
|
return count
|
56
102
|
|
@@ -58,30 +104,46 @@ class SerialInsertRunner:
|
|
58
104
|
def _insert_all_batches(self) -> int:
|
59
105
|
"""Performance case only"""
|
60
106
|
with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context('spawn'), max_workers=1) as executor:
|
61
|
-
future = executor.submit(self.
|
62
|
-
|
63
|
-
|
107
|
+
future = executor.submit(self.task)
|
108
|
+
try:
|
109
|
+
count = future.result(timeout=self.timeout)
|
110
|
+
except TimeoutError as e:
|
111
|
+
msg = f"VectorDB load dataset timeout in {self.timeout}"
|
112
|
+
log.warning(msg)
|
113
|
+
for pid, _ in executor._processes.items():
|
114
|
+
psutil.Process(pid).kill()
|
115
|
+
raise PerformanceTimeoutError(msg) from e
|
116
|
+
except Exception as e:
|
117
|
+
log.warning(f"VectorDB load dataset error: {e}")
|
118
|
+
raise e from e
|
119
|
+
else:
|
120
|
+
return count
|
64
121
|
|
65
122
|
def run_endlessness(self) -> int:
|
66
123
|
"""run forever until the DB raises an exception or crashes"""
|
124
|
+
# datasets for load tests are quite small, can fit into memory
|
125
|
+
# only 1 file
|
126
|
+
data_df = [data_df for data_df in self.dataset][0]
|
127
|
+
all_embeddings, all_metadata = np.stack(data_df["emb"]).tolist(), data_df['id'].tolist()
|
128
|
+
|
67
129
|
start_time = time.perf_counter()
|
68
130
|
max_load_count, times = 0, 0
|
69
131
|
try:
|
70
132
|
with self.db.init():
|
71
133
|
self.db.ready_to_load()
|
72
|
-
while time.perf_counter() - start_time <
|
73
|
-
count = self.
|
134
|
+
while time.perf_counter() - start_time < self.timeout:
|
135
|
+
count = self.endless_insert_data(all_embeddings, all_metadata, left_id=max_load_count)
|
74
136
|
max_load_count += count
|
75
137
|
times += 1
|
76
138
|
log.info(f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, {max_load_count}")
|
77
|
-
raise LoadTimeoutError("capacity case load timeout and stop")
|
78
|
-
except LoadTimeoutError as e:
|
79
|
-
log.info("load timetout, stop the load case")
|
80
|
-
raise e from None
|
81
139
|
except Exception as e:
|
82
140
|
log.info(f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, {max_load_count}, err={e}")
|
83
141
|
traceback.print_exc()
|
84
142
|
return max_load_count
|
143
|
+
else:
|
144
|
+
msg = f"capacity case load timeout in {self.timeout}s"
|
145
|
+
log.info(msg)
|
146
|
+
raise LoadTimeoutError(msg)
|
85
147
|
|
86
148
|
def run(self) -> int:
|
87
149
|
count, dur = self._insert_all_batches()
|
@@ -113,7 +175,7 @@ class SerialSearchRunner:
|
|
113
175
|
test_data, ground_truth = args
|
114
176
|
|
115
177
|
log.debug(f"test dataset size: {len(test_data)}")
|
116
|
-
log.
|
178
|
+
log.debug(f"ground truth size: {ground_truth.columns}, shape: {ground_truth.shape}")
|
117
179
|
|
118
180
|
latencies, recalls = [], []
|
119
181
|
for idx, emb in enumerate(test_data):
|