vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (60)
  1. vectordb_bench/__init__.py +14 -3
  2. vectordb_bench/backend/assembler.py +2 -2
  3. vectordb_bench/backend/cases.py +146 -57
  4. vectordb_bench/backend/clients/__init__.py +6 -1
  5. vectordb_bench/backend/clients/api.py +23 -11
  6. vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
  7. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +11 -9
  8. vectordb_bench/backend/clients/milvus/config.py +2 -3
  9. vectordb_bench/backend/clients/milvus/milvus.py +32 -19
  10. vectordb_bench/backend/clients/pgvector/config.py +49 -0
  11. vectordb_bench/backend/clients/pgvector/pgvector.py +171 -0
  12. vectordb_bench/backend/clients/pinecone/config.py +3 -3
  13. vectordb_bench/backend/clients/pinecone/pinecone.py +19 -13
  14. vectordb_bench/backend/clients/qdrant_cloud/config.py +23 -6
  15. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +12 -13
  16. vectordb_bench/backend/clients/weaviate_cloud/config.py +3 -3
  17. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +9 -8
  18. vectordb_bench/backend/clients/zilliz_cloud/config.py +5 -4
  19. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +3 -1
  20. vectordb_bench/backend/dataset.py +100 -162
  21. vectordb_bench/backend/result_collector.py +2 -2
  22. vectordb_bench/backend/runner/mp_runner.py +29 -13
  23. vectordb_bench/backend/runner/serial_runner.py +98 -36
  24. vectordb_bench/backend/task_runner.py +43 -48
  25. vectordb_bench/frontend/components/check_results/charts.py +10 -21
  26. vectordb_bench/frontend/components/check_results/data.py +31 -15
  27. vectordb_bench/frontend/components/check_results/expanderStyle.py +37 -0
  28. vectordb_bench/frontend/components/check_results/filters.py +61 -33
  29. vectordb_bench/frontend/components/check_results/footer.py +8 -0
  30. vectordb_bench/frontend/components/check_results/headerIcon.py +8 -4
  31. vectordb_bench/frontend/components/check_results/nav.py +7 -6
  32. vectordb_bench/frontend/components/check_results/priceTable.py +3 -2
  33. vectordb_bench/frontend/components/check_results/stPageConfig.py +18 -0
  34. vectordb_bench/frontend/components/get_results/saveAsImage.py +50 -0
  35. vectordb_bench/frontend/components/run_test/autoRefresh.py +1 -1
  36. vectordb_bench/frontend/components/run_test/caseSelector.py +19 -16
  37. vectordb_bench/frontend/components/run_test/dbConfigSetting.py +20 -7
  38. vectordb_bench/frontend/components/run_test/dbSelector.py +5 -5
  39. vectordb_bench/frontend/components/run_test/hideSidebar.py +4 -6
  40. vectordb_bench/frontend/components/run_test/submitTask.py +16 -10
  41. vectordb_bench/frontend/const/dbCaseConfigs.py +291 -0
  42. vectordb_bench/frontend/const/dbPrices.py +6 -0
  43. vectordb_bench/frontend/const/styles.py +58 -0
  44. vectordb_bench/frontend/pages/{qps_with_price.py → quries_per_dollar.py} +24 -17
  45. vectordb_bench/frontend/pages/run_test.py +17 -11
  46. vectordb_bench/frontend/vdb_benchmark.py +19 -12
  47. vectordb_bench/metric.py +19 -10
  48. vectordb_bench/models.py +14 -40
  49. vectordb_bench/results/dbPrices.json +32 -0
  50. vectordb_bench/results/getLeaderboardData.py +52 -0
  51. vectordb_bench/results/leaderboard.json +1 -0
  52. vectordb_bench/results/{result_20230609_standard.json → result_20230705_standard.json} +1910 -897
  53. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/METADATA +107 -27
  54. vectordb_bench-0.0.3.dist-info/RECORD +67 -0
  55. vectordb_bench/frontend/const.py +0 -391
  56. vectordb_bench-0.0.1.dist-info/RECORD +0 -56
  57. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/LICENSE +0 -0
  58. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/WHEEL +0 -0
  59. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/entry_points.txt +0 -0
  60. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/top_level.txt +0 -0

vectordb_bench/backend/dataset.py
@@ -1,23 +1,20 @@
 """
 Usage:
->>> from xxx import dataset as ds
->>> gist_s = ds.get(ds.Name.GIST, ds.Label.SMALL)
->>> gist_s.dict()
-dataset: {'data': {'name': 'GIST', 'dim': 128, 'metric_type': 'L2', 'label': 'SMALL', 'size': 50000000}, 'data_dir': 'xxx'}
+>>> from xxx.dataset import Dataset
+>>> Dataset.Cohere.get(100_000)
 """

 import os
 import logging
 import pathlib
-import math
 from hashlib import md5
-from enum import Enum, auto
-from typing import Any
-
+from enum import Enum
 import s3fs
 import pandas as pd
 from tqdm import tqdm
-from pydantic.dataclasses import dataclass
+from pydantic import validator, PrivateAttr
+import polars as pl
+from pyarrow.parquet import ParquetFile

 from ..base import BaseModel
 from .. import config
@@ -26,118 +23,83 @@ from . import utils

 log = logging.getLogger(__name__)

-@dataclass
-class LAION:
-    name: str = "LAION"
-    dim: int = 768
-    metric_type: MetricType = MetricType.COSINE
-    use_shuffled: bool = False
+
+class BaseDataset(BaseModel):
+    name: str
+    size: int
+    dim: int
+    metric_type: MetricType
+    use_shuffled: bool
+    _size_label: dict = PrivateAttr()
+
+    @validator("size")
+    def verify_size(cls, v):
+        if v not in cls._size_label:
+            raise ValueError(f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}")
+        return v
+
+    @property
+    def label(self) -> str:
+        return self._size_label.get(self.size)

     @property
     def dir_name(self) -> str:
         return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()

-@dataclass
-class GIST:
+
+class LAION(BaseDataset):
+    name: str = "LAION"
+    dim: int = 768
+    metric_type: MetricType = MetricType.L2
+    use_shuffled: bool = False
+    _size_label: dict = {100_000_000: "LARGE"}
+
+
+class GIST(BaseDataset):
     name: str = "GIST"
     dim: int = 960
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+    }

-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()

-@dataclass
-class Cohere:
+class Cohere(BaseDataset):
     name: str = "Cohere"
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+        10_000_000: "LARGE",
+    }

-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()

-@dataclass
-class Glove:
+class Glove(BaseDataset):
     name: str = "Glove"
     dim: int = 200
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = False
+    _size_label: dict = {1_000_000: "MEDIUM"}

-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()

-@dataclass
-class SIFT:
+class SIFT(BaseDataset):
     name: str = "SIFT"
     dim: int = 128
-    metric_type: MetricType = MetricType.COSINE
+    metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {

-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
+        500_000: "SMALL",
+        5_000_000: "MEDIUM",
+        50_000_000: "LARGE",
+    }

-@dataclass
-class LAION_L(LAION):
-    label: str = "LARGE"
-    size: int = 100_000_000
-
-@dataclass
-class GIST_S(GIST):
-    label: str = "SMALL"
-    size: int = 100_000
-
-@dataclass
-class GIST_M(GIST):
-    label: str = "MEDIUM"
-    size: int = 1_000_000
-
-@dataclass
-class Cohere_S(Cohere):
-    label: str = "SMALL"
-    size: int = 100_000
-
-@dataclass
-class Cohere_M(Cohere):
-    label: str = "MEDIUM"
-    size: int = 1_000_000
-
-@dataclass
-class Cohere_L(Cohere):
-    label : str = "LARGE"
-    size : int = 10_000_000
-
-@dataclass
-class Glove_S(Glove):
-    label: str = "SMALL"
-    size : int = 100_000
-
-@dataclass
-class Glove_M(Glove):
-    label: str = "MEDIUM"
-    size : int = 1_000_000
-
-@dataclass
-class SIFT_S(SIFT):
-    label: str = "SMALL"
-    size : int = 500_000
-
-@dataclass
-class SIFT_M(SIFT):
-    label: str = "MEDIUM"
-    size : int = 5_000_000
-
-@dataclass
-class SIFT_L(SIFT):
-    label: str = "LARGE"
-    size : int = 50_000_000
-
-
-class DataSet(BaseModel):
+
+class DatasetManager(BaseModel):
     """Download dataset if not int the local directory. Provide data for cases.

     DataSet is iterable, each iteration will return the next batch of data in pandas.DataFrame
@@ -147,12 +109,12 @@ class DataSet(BaseModel):
     >>> for data in cohere_s:
     >>> print(data.columns)
     """
-    data: GIST | Cohere | Glove | SIFT | Any
+    data: BaseDataset
     test_data: pd.DataFrame | None = None
     train_files : list[str] = []

     def __eq__(self, obj):
-        if isinstance(obj, DataSet):
+        if isinstance(obj, DatasetManager):
             return self.data.name == obj.data.name and \
                 self.data.label == obj.data.label
         return False
@@ -304,90 +266,66 @@

     def _read_file(self, file_name: str) -> pd.DataFrame:
         """read one file from disk into memory"""
-        import pyarrow.parquet as pq
-
+        log.info(f"Read the entire file into memory: {file_name}")
         p = pathlib.Path(self.data_dir, file_name)
-        log.info(f"reading file into memory: {p}")
         if not p.exists():
             log.warning(f"No such file: {p}")
             return pd.DataFrame()
-        data = pq.read_table(p)
-        df = data.to_pandas()
-        return df
+
+        return pl.read_parquet(p)


 class DataSetIterator:
-    def __init__(self, dataset: DataSet):
+    def __init__(self, dataset: DatasetManager):
         self._ds = dataset
         self._idx = 0 # file number
-        self._curr: pd.DataFrame | None = None
+        self._cur = None
         self._sub_idx = [0 for i in range(len(self._ds.train_files))] # iter num for each file

+    def _get_iter(self, file_name: str):
+        p = pathlib.Path(self._ds.data_dir, file_name)
+        log.info(f"Get iterator for {p.name}")
+        if not p.exists():
+            raise IndexError(f"No such file {p}")
+            log.warning(f"No such file: {p}")
+        return ParquetFile(p).iter_batches(config.NUM_PER_BATCH)
+
     def __next__(self) -> pd.DataFrame:
         """return the data in the next file of the training list"""
         if self._idx < len(self._ds.train_files):
-            _sub = self._sub_idx[self._idx]
-            if _sub == 0 and self._idx == 0: # init
+            if self._cur is None:
                 file_name = self._ds.train_files[self._idx]
-                self._curr = self._ds._read_file(file_name)
-                self._iter_num = math.ceil(self._curr.shape[0]/100_000)
+                self._cur = self._get_iter(file_name)

-            if _sub == self._iter_num:
+            try:
+                return next(self._cur).to_pandas()
+            except StopIteration:
                 if self._idx == len(self._ds.train_files) - 1:
-                    self._curr = None
-                    raise StopIteration
-                else:
-                    self._idx += 1
-                    _sub = self._sub_idx[self._idx]
-
-                    self._curr = None
-                    file_name = self._ds.train_files[self._idx]
-                    self._curr = self._ds._read_file(file_name)
-
-            sub_df = self._curr[_sub*100_000: (_sub+1)*100_000]
-            self._sub_idx[self._idx] += 1
-            log.info(f"Get the [{_sub+1}/{self._iter_num}] batch of {self._idx+1}/{len(self._ds.train_files)} train file")
-            return sub_df
-        self._curr = None
+                    raise StopIteration from None
+
+                self._idx += 1
+                file_name = self._ds.train_files[self._idx]
+                self._cur = self._get_iter(file_name)
+                return next(self._cur).to_pandas()
         raise StopIteration


-class Name(Enum):
-    GIST = auto()
-    Cohere = auto()
-    Glove = auto()
-    SIFT = auto()
-    LAION = auto()
-
-
-class Label(Enum):
-    SMALL = auto()
-    MEDIUM = auto()
-    LARGE = auto()
-
-_global_ds_mapping = {
-    Name.GIST: {
-        Label.SMALL: DataSet(data=GIST_S()),
-        Label.MEDIUM: DataSet(data=GIST_M()),
-    },
-    Name.Cohere: {
-        Label.SMALL: DataSet(data=Cohere_S()),
-        Label.MEDIUM: DataSet(data=Cohere_M()),
-        Label.LARGE: DataSet(data=Cohere_L()),
-    },
-    Name.Glove:{
-        Label.SMALL: DataSet(data=Glove_S()),
-        Label.MEDIUM: DataSet(data=Glove_M()),
-    },
-    Name.SIFT: {
-        Label.SMALL: DataSet(data=SIFT_S()),
-        Label.MEDIUM: DataSet(data=SIFT_M()),
-        Label.LARGE: DataSet(data=SIFT_L()),
-    },
-    Name.LAION: {
-        Label.LARGE: DataSet(data=LAION_L()),
-    },
-}
-
-def get(ds: Name, label: Label):
-    return _global_ds_mapping.get(ds, {}).get(label)
+class Dataset(Enum):
+    """
+    Value is Dataset classes, DO NOT use it
+    Example:
+        >>> all_dataset = [ds.name for ds in Dataset]
+        >>> Dataset.COHERE.manager(100_000)
+        >>> Dataset.COHERE.get(100_000)
+    """
+    LAION = LAION
+    GIST = GIST
+    COHERE = Cohere
+    GLOVE = Glove
+    SIFT = SIFT
+
+    def get(self, size: int) -> BaseDataset:
+        return self.value(size=size)
+
+    def manager(self, size: int) -> DatasetManager:
+        return DatasetManager(data=self.get(size))
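
The dataset.py hunks above replace the hand-written per-size dataclasses (GIST_S, Cohere_M, SIFT_L, ...) and the Name/Label lookup table with size-validated BaseDataset subclasses, a DatasetManager that streams parquet batches through pyarrow, and a Dataset enum as the single entry point. A minimal usage sketch of that new surface; the import path follows the file list above, and the download/prepare step lives in parts of the class this diff does not show:

from vectordb_bench.backend.dataset import Dataset

# get() returns the dataset descriptor after the @validator("size") check.
cohere_small = Dataset.COHERE.get(100_000)
print(cohere_small.label)      # "SMALL"
print(cohere_small.dir_name)   # e.g. "cohere_small_100k"

# manager() wraps it in a DatasetManager; iterating yields pandas DataFrames
# of config.NUM_PER_BATCH rows, read lazily via ParquetFile.iter_batches().
cohere = Dataset.COHERE.manager(100_000)
for batch in cohere:           # assumes the train files are already present locally
    print(batch.columns)

# Sizes not registered in _size_label are rejected:
# Dataset.GLOVE.get(500_000)   -> ValueError (only 1_000_000 is supported)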

vectordb_bench/backend/result_collector.py
@@ -6,10 +6,10 @@ class ResultCollector:
     @classmethod
     def collect(cls, result_dir: pathlib.Path) -> list[TestResult]:
         results = []
-        if not result_dir.exists() or len(list(result_dir.glob("*.json"))) == 0:
+        if not result_dir.exists() or len(list(result_dir.glob("result_*.json"))) == 0:
            return []

-        for json_file in result_dir.glob("*.json"):
+        for json_file in result_dir.glob("result_*.json"):
            results.append(TestResult.read_file(json_file, trans_unit=True))

        return results
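
This release also ships dbPrices.json and leaderboard.json alongside the result files (see the file list above), so ResultCollector now globs result_*.json instead of *.json to avoid parsing them as TestResult. A trivial illustration of the narrower pattern, using a hypothetical local checkout path:

import pathlib

result_dir = pathlib.Path("vectordb_bench/results")  # hypothetical location
# Picks up result_20230705_standard.json but skips dbPrices.json / leaderboard.json.
for json_file in sorted(result_dir.glob("result_*.json")):
    print(json_file.name)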

vectordb_bench/backend/runner/mp_runner.py
@@ -40,7 +40,12 @@ class MultiProcessingSearchRunner:
         self.test_data = utils.SharedNumpyArray(test_data)
         log.debug(f"test dataset columns: {len(test_data)}")

-    def search(self, test_np: utils.SharedNumpyArray) -> tuple[int, float]:
+    def search(self, test_np: utils.SharedNumpyArray, q: mp.Queue, cond: mp.Condition) -> tuple[int, float]:
+        # sync all process
+        q.put(1)
+        with cond:
+            cond.wait()
+
         with self.db.init():
             test_data = test_np.read().tolist()
             num, idx = len(test_data), 0
@@ -77,7 +82,7 @@

     @staticmethod
     def get_mp_context():
-        mp_start_method = "forkserver" if "forkserver" in mp.get_all_start_methods() else "spawn"
+        mp_start_method = "spawn"
         log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
         return mp.get_context(mp_start_method)

@@ -85,21 +90,32 @@
         max_qps = 0
         try:
             for conc in self.concurrencies:
-                with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
-                    start = time.perf_counter()
-                    log.info(f"start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
-                    future_iter = executor.map(self.search, [self.test_data for i in range(conc)])
-                    all_count = sum([r[0] for r in future_iter])
-
-                    cost = time.perf_counter() - start
-                    qps = round(all_count / cost, 4)
-                    log.info(f"end search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
+                with mp.Manager() as m:
+                    q, cond = m.Queue(), m.Condition()
+                    with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
+                        log.info(f"Start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
+                        future_iter = [executor.submit(self.search, self.test_data, q, cond) for i in range(conc)]
+                        # Sync all processes
+                        while q.qsize() < conc:
+                            sleep_t = conc if conc < 10 else 10
+                            time.sleep(sleep_t)
+
+                        with cond:
+                            cond.notify_all()
+                            log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
+
+                        start = time.perf_counter()
+                        all_count = sum([r.result()[0] for r in future_iter])
+                        cost = time.perf_counter() - start
+
+                        qps = round(all_count / cost, 4)
+                        log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")

                 if qps > max_qps:
                     max_qps = qps
-                    log.info(f"update largest qps with concurrency {conc}: current max_qps={max_qps}")
+                    log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
         except Exception as e:
-            log.warning(f"fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
+            log.warning(f"Fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
             traceback.print_exc()

         # No results available, raise exception
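
The mp_runner.py changes above remove a QPS-measurement skew: instead of executor.map() with the clock started before any worker had initialized its client, each search process now registers itself on a managed Queue and blocks on a Condition; the parent waits until qsize() equals the concurrency, calls notify_all(), and only then starts the timer. A self-contained sketch of that synchronization pattern, not the benchmark code itself (worker and the sleep interval are illustrative):

import concurrent.futures
import multiprocessing as mp
import time


def worker(q, cond) -> int:
    # Stand-in for MultiProcessingSearchRunner.search: register, then block
    # until every worker is released at the same moment.
    q.put(1)
    with cond:
        cond.wait()
    return 1  # pretend one query was served


if __name__ == "__main__":
    conc = 4
    ctx = mp.get_context("spawn")  # the diff pins "spawn" as the start method
    with mp.Manager() as m:
        q, cond = m.Queue(), m.Condition()
        with concurrent.futures.ProcessPoolExecutor(mp_context=ctx, max_workers=conc) as executor:
            futures = [executor.submit(worker, q, cond) for _ in range(conc)]
            while q.qsize() < conc:   # every worker has registered
                time.sleep(1)         # coarse polling, like the conc/10s sleep in the diff
            with cond:
                cond.notify_all()     # release all workers together
            start = time.perf_counter()
            total = sum(f.result() for f in futures)
            print(total, time.perf_counter() - start)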

vectordb_bench/backend/runner/serial_runner.py
@@ -4,53 +4,99 @@ import traceback
 import concurrent
 import multiprocessing as mp
 import math
+import psutil
+
 import numpy as np
 import pandas as pd

 from ..clients import api
 from ...metric import calc_recall
-from ...models import LoadTimeoutError
+from ...models import LoadTimeoutError, PerformanceTimeoutError
 from .. import utils
 from ... import config
+from vectordb_bench.backend.dataset import DatasetManager

 NUM_PER_BATCH = config.NUM_PER_BATCH
-LOAD_TIMEOUT = 24 * 60 * 60
+LOAD_MAX_TRY_COUNT = 10
+WAITTING_TIME = 60

 log = logging.getLogger(__name__)

-
 class SerialInsertRunner:
-    def __init__(self, db: api.VectorDB, train_emb: list[list[float]], train_id: list[int]):
-        log.debug(f"Dataset shape: {len(train_emb)}")
+    def __init__(self, db: api.VectorDB, dataset: DatasetManager, normalize: bool, timeout: float | None = None):
+        self.timeout = timeout if isinstance(timeout, (int, float)) else None
+        self.dataset = dataset
         self.db = db
-        self.shared_emb = train_emb
-        self.train_id = train_id
+        self.normalize = normalize

-        self.seq_batches = math.ceil(len(train_emb)/NUM_PER_BATCH)
-
-    def insert_data(self, left_id: int = 0) -> int:
+    def task(self) -> int:
+        count = 0
         with self.db.init():
-            all_embeddings = self.shared_emb
+            log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
+            start = time.perf_counter()
+            for data_df in self.dataset:
+                all_metadata = data_df['id'].tolist()
+
+                emb_np = np.stack(data_df['emb'])
+                if self.normalize:
+                    log.debug("normalize the 100k train data")
+                    all_embeddings = emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis].tolist()
+                else:
+                    all_embeddings = emb_np.tolist()
+                del(emb_np)
+                log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")
+
+                last_batch = self.dataset.data.size - count == len(all_metadata)
+                insert_count, error = self.db.insert_embeddings(
+                    embeddings=all_embeddings,
+                    metadata=all_metadata,
+                    last_batch=last_batch,
+                )
+                if error is not None:
+                    raise error
+
+                assert insert_count == len(all_metadata)
+                count += insert_count
+                if count % 100_000 == 0:
+                    log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
+
+            log.info(f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}")
+            return count

+    def endless_insert_data(self, all_embeddings, all_metadata, left_id: int = 0) -> int:
+        with self.db.init():
             # unique id for endlessness insertion
-            all_metadata = [i+left_id for i in self.train_id]
+            all_metadata = [i+left_id for i in all_metadata]

-            num_conc_batches = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
+            NUM_BATCHES = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
             log.info(f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
             count = 0
-            for batch_id in range(self.seq_batches):
-                metadata = all_metadata[batch_id*NUM_PER_BATCH: (batch_id+1)*NUM_PER_BATCH]
-                embeddings = all_embeddings[batch_id*NUM_PER_BATCH: (batch_id+1)*NUM_PER_BATCH]
-
-                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_conc_batches}], Start inserting {len(metadata)} embeddings")
-                insert_count = self.db.insert_embeddings(
-                    embeddings=embeddings,
-                    metadata=metadata,
-                )
-                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_conc_batches}], Finish inserting {len(metadata)} embeddings")
-
-                assert insert_count == len(metadata)
-                count += insert_count
+            for batch_id in range(NUM_BATCHES):
+                retry_count = 0
+                already_insert_count = 0
+                metadata = all_metadata[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+                embeddings = all_embeddings[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+
+                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Start inserting {len(metadata)} embeddings")
+                while retry_count < LOAD_MAX_TRY_COUNT:
+                    insert_count, error = self.db.insert_embeddings(
+                        embeddings=embeddings[already_insert_count :],
+                        metadata=metadata[already_insert_count :],
+                    )
+                    already_insert_count += insert_count
+                    if error is not None:
+                        retry_count += 1
+                        time.sleep(WAITTING_TIME)
+
+                        log.info(f"Failed to insert data, try {retry_count} time")
+                        if retry_count >= LOAD_MAX_TRY_COUNT:
+                            raise error
+                    else:
+                        break
+                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Finish inserting {len(metadata)} embeddings")
+
+                assert already_insert_count == len(metadata)
+                count += already_insert_count
             log.info(f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
             return count

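
In the hunk above, insert_embeddings() now returns an (insert_count, error) pair, and endless_insert_data() retries a failed batch up to LOAD_MAX_TRY_COUNT times, resuming from the rows already written instead of re-sending the whole batch. The same retry-and-resume pattern in isolation; insert_fn is a hypothetical stand-in for db.insert_embeddings():

import time

LOAD_MAX_TRY_COUNT = 10
WAITTING_TIME = 60  # seconds, the constant added in this diff


def insert_batch_with_retry(insert_fn, embeddings, metadata) -> int:
    """Retry a partially failed batch, skipping rows that already made it in.

    insert_fn(embeddings, metadata) -> (inserted_count, error_or_None),
    mirroring the new insert_embeddings() contract shown above.
    """
    retry_count, already = 0, 0
    while retry_count < LOAD_MAX_TRY_COUNT:
        insert_count, error = insert_fn(embeddings[already:], metadata[already:])
        already += insert_count
        if error is None:
            break
        retry_count += 1
        time.sleep(WAITTING_TIME)
        if retry_count >= LOAD_MAX_TRY_COUNT:
            raise error
    assert already == len(metadata)
    return already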

@@ -58,30 +104,46 @@ class SerialInsertRunner:
     def _insert_all_batches(self) -> int:
         """Performance case only"""
         with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context('spawn'), max_workers=1) as executor:
-            future = executor.submit(self.insert_data)
-            count = future.result()
-            return count
+            future = executor.submit(self.task)
+            try:
+                count = future.result(timeout=self.timeout)
+            except TimeoutError as e:
+                msg = f"VectorDB load dataset timeout in {self.timeout}"
+                log.warning(msg)
+                for pid, _ in executor._processes.items():
+                    psutil.Process(pid).kill()
+                raise PerformanceTimeoutError(msg) from e
+            except Exception as e:
+                log.warning(f"VectorDB load dataset error: {e}")
+                raise e from e
+            else:
+                return count

     def run_endlessness(self) -> int:
         """run forever util DB raises exception or crash"""
+        # datasets for load tests are quite small, can fit into memory
+        # only 1 file
+        data_df = [data_df for data_df in self.dataset][0]
+        all_embeddings, all_metadata = np.stack(data_df["emb"]).tolist(), data_df['id'].tolist()
+
         start_time = time.perf_counter()
         max_load_count, times = 0, 0
         try:
             with self.db.init():
                 self.db.ready_to_load()
-                while time.perf_counter() - start_time < config.CASE_TIMEOUT_IN_SECOND:
-                    count = self.insert_data(left_id=max_load_count)
+                while time.perf_counter() - start_time < self.timeout:
+                    count = self.endless_insert_data(all_embeddings, all_metadata, left_id=max_load_count)
                     max_load_count += count
                     times += 1
                     log.info(f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, {max_load_count}")
-                raise LoadTimeoutError("capacity case load timeout and stop")
-        except LoadTimeoutError as e:
-            log.info("load timetout, stop the load case")
-            raise e from None
         except Exception as e:
             log.info(f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, {max_load_count}, err={e}")
             traceback.print_exc()
             return max_load_count
+        else:
+            msg = f"capacity case load timeout in {self.timeout}s"
+            log.info(msg)
+            raise LoadTimeoutError(msg)

     def run(self) -> int:
         count, dur = self._insert_all_batches()
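
The second serial_runner.py hunk bounds the load phase with a caller-supplied timeout: the insert task runs in a single-worker ProcessPoolExecutor, future.result(timeout=...) raises when the budget is exceeded, the stuck child is killed through psutil (result() alone would leave it running), and a PerformanceTimeoutError is surfaced. A minimal sketch of that timeout-and-kill pattern, with a made-up slow_task and a 2-second budget:

import concurrent.futures
import multiprocessing as mp
import time

import psutil


def slow_task() -> int:
    time.sleep(60)  # stand-in for a dataset load that overruns its budget
    return 0


if __name__ == "__main__":
    with concurrent.futures.ProcessPoolExecutor(
            mp_context=mp.get_context("spawn"), max_workers=1) as executor:
        future = executor.submit(slow_task)
        try:
            future.result(timeout=2)
        except concurrent.futures.TimeoutError:  # the builtin TimeoutError on Python 3.11+
            # The worker is still alive after the timeout; kill it explicitly,
            # as the diff does via the executor's private _processes mapping.
            for pid in list(executor._processes):
                psutil.Process(pid).kill()
            print("worker killed after timeout")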

@@ -113,7 +175,7 @@ class SerialSearchRunner:
         test_data, ground_truth = args

         log.debug(f"test dataset size: {len(test_data)}")
-        log.info(f"ground truth size: {ground_truth.columns}, shape: {ground_truth.shape}")
+        log.debug(f"ground truth size: {ground_truth.columns}, shape: {ground_truth.shape}")

         latencies, recalls = [], []
         for idx, emb in enumerate(test_data):