vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. vectordb_bench/__init__.py +14 -3
  2. vectordb_bench/backend/assembler.py +2 -2
  3. vectordb_bench/backend/cases.py +146 -57
  4. vectordb_bench/backend/clients/__init__.py +6 -1
  5. vectordb_bench/backend/clients/api.py +23 -11
  6. vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
  7. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +11 -9
  8. vectordb_bench/backend/clients/milvus/config.py +2 -3
  9. vectordb_bench/backend/clients/milvus/milvus.py +32 -19
  10. vectordb_bench/backend/clients/pgvector/config.py +49 -0
  11. vectordb_bench/backend/clients/pgvector/pgvector.py +171 -0
  12. vectordb_bench/backend/clients/pinecone/config.py +3 -3
  13. vectordb_bench/backend/clients/pinecone/pinecone.py +19 -13
  14. vectordb_bench/backend/clients/qdrant_cloud/config.py +23 -6
  15. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +12 -13
  16. vectordb_bench/backend/clients/weaviate_cloud/config.py +3 -3
  17. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +9 -8
  18. vectordb_bench/backend/clients/zilliz_cloud/config.py +5 -4
  19. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +3 -1
  20. vectordb_bench/backend/dataset.py +100 -162
  21. vectordb_bench/backend/result_collector.py +2 -2
  22. vectordb_bench/backend/runner/mp_runner.py +29 -13
  23. vectordb_bench/backend/runner/serial_runner.py +98 -36
  24. vectordb_bench/backend/task_runner.py +43 -48
  25. vectordb_bench/frontend/components/check_results/charts.py +10 -21
  26. vectordb_bench/frontend/components/check_results/data.py +31 -15
  27. vectordb_bench/frontend/components/check_results/expanderStyle.py +37 -0
  28. vectordb_bench/frontend/components/check_results/filters.py +61 -33
  29. vectordb_bench/frontend/components/check_results/footer.py +8 -0
  30. vectordb_bench/frontend/components/check_results/headerIcon.py +8 -4
  31. vectordb_bench/frontend/components/check_results/nav.py +7 -6
  32. vectordb_bench/frontend/components/check_results/priceTable.py +3 -2
  33. vectordb_bench/frontend/components/check_results/stPageConfig.py +18 -0
  34. vectordb_bench/frontend/components/get_results/saveAsImage.py +50 -0
  35. vectordb_bench/frontend/components/run_test/autoRefresh.py +1 -1
  36. vectordb_bench/frontend/components/run_test/caseSelector.py +19 -16
  37. vectordb_bench/frontend/components/run_test/dbConfigSetting.py +20 -7
  38. vectordb_bench/frontend/components/run_test/dbSelector.py +5 -5
  39. vectordb_bench/frontend/components/run_test/hideSidebar.py +4 -6
  40. vectordb_bench/frontend/components/run_test/submitTask.py +16 -10
  41. vectordb_bench/frontend/const/dbCaseConfigs.py +291 -0
  42. vectordb_bench/frontend/const/dbPrices.py +6 -0
  43. vectordb_bench/frontend/const/styles.py +58 -0
  44. vectordb_bench/frontend/pages/{qps_with_price.py → quries_per_dollar.py} +24 -17
  45. vectordb_bench/frontend/pages/run_test.py +17 -11
  46. vectordb_bench/frontend/vdb_benchmark.py +19 -12
  47. vectordb_bench/metric.py +19 -10
  48. vectordb_bench/models.py +14 -40
  49. vectordb_bench/results/dbPrices.json +32 -0
  50. vectordb_bench/results/getLeaderboardData.py +52 -0
  51. vectordb_bench/results/leaderboard.json +1 -0
  52. vectordb_bench/results/{result_20230609_standard.json → result_20230705_standard.json} +1910 -897
  53. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/METADATA +107 -27
  54. vectordb_bench-0.0.3.dist-info/RECORD +67 -0
  55. vectordb_bench/frontend/const.py +0 -391
  56. vectordb_bench-0.0.1.dist-info/RECORD +0 -56
  57. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/LICENSE +0 -0
  58. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/WHEEL +0 -0
  59. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/entry_points.txt +0 -0
  60. {vectordb_bench-0.0.1.dist-info → vectordb_bench-0.0.3.dist-info}/top_level.txt +0 -0
--- a/vectordb_bench/__init__.py
+++ b/vectordb_bench/__init__.py
@@ -18,12 +18,23 @@ class config:
     USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
 
     RESULTS_LOCAL_DIR = pathlib.Path(__file__).parent.joinpath("results")
-    CASE_TIMEOUT_IN_SECOND = 24 * 60 * 60
+
+    CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600  # 24h
+    LOAD_TIMEOUT_1M = 2.5 * 3600    # 2.5h
+    LOAD_TIMEOUT_10M = 25 * 3600    # 25h
+    LOAD_TIMEOUT_100M = 250 * 3600  # 10.41d
+
+    OPTIMIZE_TIMEOUT_1M = 15 * 60      # 15min
+    OPTIMIZE_TIMEOUT_10M = 2.5 * 3600  # 2.5h
+    OPTIMIZE_TIMEOUT_100M = 25 * 3600  # 1.04d
 
 
     def display(self) -> str:
-        tmp = [i for i in inspect.getmembers(self)
-               if not inspect.ismethod(i[1]) and not i[0].startswith('_') \
+        tmp = [
+            i for i in inspect.getmembers(self)
+            if not inspect.ismethod(i[1])
+            and not i[0].startswith('_')
+            and "TIMEOUT" not in i[0]
         ]
         return tmp
 
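The single CASE_TIMEOUT_IN_SECOND budget is split into per-scale load and optimize budgets, which display() now filters out of the config dump. A minimal sketch of picking a budget by dataset size, using only the constants above (the lookup table itself is illustrative, not part of the package):

    from vectordb_bench import config

    # Illustrative mapping from dataset size to the (load, optimize) budgets
    # introduced above; the package attaches these per-case, not via a dict.
    TIMEOUTS = {
        1_000_000: (config.LOAD_TIMEOUT_1M, config.OPTIMIZE_TIMEOUT_1M),
        10_000_000: (config.LOAD_TIMEOUT_10M, config.OPTIMIZE_TIMEOUT_10M),
        100_000_000: (config.LOAD_TIMEOUT_100M, config.OPTIMIZE_TIMEOUT_100M),
    }

    load_t, optimize_t = TIMEOUTS[10_000_000]
    assert load_t == 25 * 3600       # 25 h to load 10M vectors
    assert optimize_t == 2.5 * 3600  # 2.5 h to optimize afterwards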
--- a/vectordb_bench/backend/assembler.py
+++ b/vectordb_bench/backend/assembler.py
@@ -1,4 +1,4 @@
-from .cases import type2case, CaseLabel
+from .cases import CaseLabel
 from .task_runner import CaseRunner, RunningStatus, TaskRunner
 from ..models import TaskConfig
 from ..backend.clients import EmptyDBCaseConfig
@@ -11,7 +11,7 @@ log = logging.getLogger(__name__)
 class Assembler:
     @classmethod
     def assemble(cls, run_id, task: TaskConfig) -> CaseRunner:
-        c_cls = type2case.get(task.case_config.case_id)
+        c_cls = task.case_config.case_id.case_cls
 
         c = c_cls()
         if type(task.db_case_config) != EmptyDBCaseConfig:
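The Assembler no longer reaches into cases.type2case; the case class now hangs off the CaseType enum (defined in cases.py below). The new lookup behaves like this sketch:

    from vectordb_bench.backend.cases import CaseType

    # What assemble() now does with task.case_config.case_id:
    c_cls = CaseType.Performance1M.case_cls  # enum property, see cases.py
    c = c_cls()                              # instantiated exactly as in assemble()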
--- a/vectordb_bench/backend/cases.py
+++ b/vectordb_bench/backend/cases.py
@@ -1,13 +1,59 @@
+import typing
 import logging
 from enum import Enum, auto
 
-from . import dataset as ds
-from ..base import BaseModel
-from ..models import CaseType
+from vectordb_bench import config
+from vectordb_bench.base import BaseModel
+
+from .dataset import Dataset, DatasetManager
 
 
 log = logging.getLogger(__name__)
 
+Case = typing.TypeVar("Case")
+
+
+class CaseType(Enum):
+    """
+    Example:
+        >>> case_cls = CaseType.CapacityDim128.case_cls
+        >>> assert c is not None
+        >>> CaseType.CapacityDim128.case_name
+        "Capacity Test (128 Dim Repeated)"
+    """
+
+    CapacityDim128 = 1
+    CapacityDim960 = 2
+
+    Performance100M = 3
+    Performance10M = 4
+    Performance1M = 5
+
+    Performance10M1P = 6
+    Performance1M1P = 7
+    Performance10M99P = 8
+    Performance1M99P = 9
+
+    Custom = 100
+
+    @property
+    def case_cls(self, custom_configs: dict | None = None) -> Case:
+        return type2case.get(self)
+
+    @property
+    def case_name(self) -> str:
+        c = self.case_cls
+        if c is not None:
+            return c().name
+        raise ValueError("Case unsupported")
+
+    @property
+    def case_description(self) -> str:
+        c = self.case_cls
+        if c is not None:
+            return c().description
+        raise ValueError("Case unsupported")
+
 
 class CaseLabel(Enum):
     Load = auto()
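CaseType now doubles as the case registry. The docstring's `assert c is not None` appears to mean the `case_cls` it just bound; the intended behavior, per the properties above:

    from vectordb_bench.backend.cases import CaseType

    case_cls = CaseType.CapacityDim128.case_cls
    assert case_cls is not None                # the docstring's `c`, presumably

    print(CaseType.CapacityDim128.case_name)   # "Capacity Test (128 Dim Repeated)"
    print(CaseType.Custom.case_cls)            # None: Custom has no type2case entry,
                                               # so case_name raises ValueError for it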
@@ -15,10 +61,10 @@ class CaseLabel(Enum):
 
 
 class Case(BaseModel):
-    """ Undifined case
+    """Undifined case
 
     Fields:
-        case_id(CaseType): default 11 case type plus one custom cases.
+        case_id(CaseType): default 9 case type plus one custom cases.
         label(CaseLabel): performance or load.
         dataset(DataSet): dataset for this case runner.
         filter_rate(float | None): one of 99% | 1% | None
@@ -27,7 +73,12 @@ class Case(BaseModel):
 
     case_id: CaseType
     label: CaseLabel
-    dataset: ds.DataSet
+    name: str
+    description: str
+    dataset: DatasetManager
+
+    load_timeout: float | int
+    optimize_timeout: float | int | None
 
     filter_rate: float | None
 
@@ -45,80 +96,118 @@ class Case(BaseModel):
 
 class CapacityCase(Case, BaseModel):
     label: CaseLabel = CaseLabel.Load
-    filter_rate: float | int | None = None
+    filter_rate: float | None = None
+    load_timeout: float | int = config.CAPACITY_TIMEOUT_IN_SECONDS
+    optimize_timeout: float | int | None = None
+
 
 class PerformanceCase(Case, BaseModel):
     label: CaseLabel = CaseLabel.Performance
-    filter_rate: float | int | None = None
+    filter_rate: float | None = None
 
-class CapacityLDimCase(CapacityCase):
-    case_id: CaseType = CaseType.CapacityLDim
-    dataset: ds.DataSet = ds.get(ds.Name.GIST, ds.Label.SMALL)
 
-class CapacitySDimCase(CapacityCase):
-    case_id: CaseType = CaseType.CapacitySDim
-    dataset: ds.DataSet = ds.get(ds.Name.SIFT, ds.Label.SMALL)
+class CapacityDim960(CapacityCase):
+    case_id: CaseType = CaseType.CapacityDim960
+    dataset: DatasetManager = Dataset.GIST.manager(100_000)
+    name: str = "Capacity Test (960 Dim Repeated)"
+    description: str = """This case tests the vector database's loading capacity by repeatedly inserting large-dimension vectors (GIST 100K vectors, <b>960 dimensions</b>) until it is fully loaded.
+    Number of inserted vectors will be reported."""
 
-class PerformanceLZero(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceLZero
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
 
-class PerformanceMZero(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceMZero
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+class CapacityDim128(CapacityCase):
+    case_id: CaseType = CaseType.CapacityDim128
+    dataset: DatasetManager = Dataset.SIFT.manager(500_000)
+    name: str = "Capacity Test (128 Dim Repeated)"
+    description: str = """This case tests the vector database's loading capacity by repeatedly inserting small-dimension vectors (SIFT 100K vectors, <b>128 dimensions</b>) until it is fully loaded.
+    Number of inserted vectors will be reported."""
 
-class PerformanceSZero(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceSZero
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.SMALL)
 
-class PerformanceLLow(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceLLow
-    filter_rate: float | int | None = 0.01
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
+class Performance10M(PerformanceCase):
+    case_id: CaseType = CaseType.Performance10M
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
+    name: str = "Search Performance Test (10M Dataset, 768 Dim)"
+    description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
+
+
+class Performance1M(PerformanceCase):
+    case_id: CaseType = CaseType.Performance1M
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
+    name: str = "Search Performance Test (1M Dataset, 768 Dim)"
+    description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
+
 
-class PerformanceMLow(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceMLow
+class Performance10M1P(PerformanceCase):
+    case_id: CaseType = CaseType.Performance10M1P
     filter_rate: float | int | None = 0.01
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
+    name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 1%)"
+    description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
-class PerformanceSLow(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceSLow
+
+class Performance1M1P(PerformanceCase):
+    case_id: CaseType = CaseType.Performance1M1P
     filter_rate: float | int | None = 0.01
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
+    name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 1%)"
+    description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
-class PerformanceLHigh(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceLHigh
-    filter_rate: float | int | None = 0.99
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
 
-class PerformanceMHigh(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceMHigh
+class Performance10M99P(PerformanceCase):
+    case_id: CaseType = CaseType.Performance10M99P
     filter_rate: float | int | None = 0.99
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
+    name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 99%)"
+    description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
-class PerformanceSHigh(PerformanceCase):
-    case_id: CaseType = CaseType.PerformanceSLow
+
+class Performance1M99P(PerformanceCase):
+    case_id: CaseType = CaseType.Performance1M99P
     filter_rate: float | int | None = 0.99
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
+    name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 99%)"
+    description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
+
 
 class Performance100M(PerformanceCase):
     case_id: CaseType = CaseType.Performance100M
     filter_rate: float | int | None = None
-    dataset: ds.DataSet = ds.get(ds.Name.LAION, ds.Label.LARGE)
+    dataset: DatasetManager = Dataset.LAION.manager(100_000_000)
+    name: str = "Search Performance Test (100M Dataset, 768 Dim)"
+    description: str = """This case tests the search performance of a vector database with a large 100M dataset (<b>LAION 100M vectors</b>, 768 dimensions), at varying parallel levels.
+    Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_100M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_100M
 
 
 type2case = {
-    CaseType.CapacityLDim: CapacityLDimCase,
-    CaseType.CapacitySDim: CapacitySDimCase,
-
-    CaseType.PerformanceLZero: PerformanceLZero,
-    CaseType.PerformanceMZero: PerformanceMZero,
-    CaseType.PerformanceSZero: PerformanceSZero,
-
-    CaseType.PerformanceLLow: PerformanceLLow,
-    CaseType.PerformanceMLow: PerformanceMLow,
-    CaseType.PerformanceSLow: PerformanceSLow,
-    CaseType.PerformanceLHigh: PerformanceLHigh,
-    CaseType.PerformanceMHigh: PerformanceMHigh,
-    CaseType.PerformanceSHigh: PerformanceSHigh,
+    CaseType.CapacityDim960: CapacityDim960,
+    CaseType.CapacityDim128: CapacityDim128,
+
     CaseType.Performance100M: Performance100M,
+    CaseType.Performance10M: Performance10M,
+    CaseType.Performance1M: Performance1M,
+
+    CaseType.Performance10M1P: Performance10M1P,
+    CaseType.Performance1M1P: Performance1M1P,
+    CaseType.Performance10M99P: Performance10M99P,
+    CaseType.Performance1M99P: Performance1M99P,
 }
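Every concrete case now carries its dataset handle, display strings, and time budgets as pydantic fields, so runners read budgets off the instance rather than from config directly. A quick check using only fields defined above:

    from vectordb_bench import config
    from vectordb_bench.backend.cases import CaseType, Performance1M

    case = CaseType.Performance1M.case_cls()
    assert isinstance(case, Performance1M)
    assert case.load_timeout == config.LOAD_TIMEOUT_1M          # 2.5 h
    assert case.optimize_timeout == config.OPTIMIZE_TIMEOUT_1M  # 15 min
    assert case.filter_rate is None                             # unfiltered search case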
--- a/vectordb_bench/backend/clients/__init__.py
+++ b/vectordb_bench/backend/clients/__init__.py
@@ -15,7 +15,7 @@ from .pinecone.pinecone import Pinecone
 from .weaviate_cloud.weaviate_cloud import WeaviateCloud
 from .qdrant_cloud.qdrant_cloud import QdrantCloud
 from .zilliz_cloud.zilliz_cloud import ZillizCloud
-
+from .pgvector.pgvector import PgVector
 
 class DB(Enum):
     """Database types
@@ -35,6 +35,7 @@ class DB(Enum):
     ElasticCloud = "ElasticCloud"
     QdrantCloud = "QdrantCloud"
     WeaviateCloud = "WeaviateCloud"
+    PgVector = "PgVector"
 
 
     @property
@@ -49,8 +50,12 @@ db2client = {
     DB.ElasticCloud: ElasticCloud,
     DB.QdrantCloud: QdrantCloud,
     DB.Pinecone: Pinecone,
+    DB.PgVector: PgVector
 }
 
+for db in DB:
+    assert issubclass(db.init_cls, VectorDB)
+
 
 __all__ = [
     "DB", "VectorDB", "DBConfig", "DBCaseConfig", "IndexType", "MetricType", "EmptyDBCaseConfig",
--- a/vectordb_bench/backend/clients/api.py
+++ b/vectordb_bench/backend/clients/api.py
@@ -3,7 +3,7 @@ from enum import Enum
 from typing import Any, Type
 from contextlib import contextmanager
 
-from pydantic import BaseModel
+from pydantic import BaseModel, validator, SecretStr
 
 
 class MetricType(str, Enum):
@@ -32,12 +32,20 @@ class DBConfig(ABC, BaseModel):
        ZillizCloudConfig.db_label = 1cu-perf
     """
 
-    db_label: str | None = None
+    db_label: str = ""
 
     @abstractmethod
     def to_dict(self) -> dict:
         raise NotImplementedError
 
+    @validator("*")
+    def not_empty_field(cls, v, field):
+        if field.name == "db_label":
+            return v
+        if isinstance(v, (str, SecretStr)) and len(v) == 0:
+            raise ValueError("Empty string!")
+        return v
+
 
 class DBCaseConfig(ABC):
     """Case specific vector database configs, usually uesed for index params like HNSW"""
@@ -65,7 +73,7 @@ class VectorDB(ABC):
 
     In each process, the benchmark cases ensure VectorDB.init() calls before any other methods operations
 
-    insert_embeddings, search_embedding, and, ready_to_search will be timed for each call.
+    insert_embeddings, search_embedding, and, optimize will be timed for each call.
 
     Examples:
         >>> milvus = Milvus()
@@ -82,9 +90,12 @@ class VectorDB(ABC):
         db_case_config: DBCaseConfig | None,
         collection_name: str,
         drop_old: bool = False,
-        **kwargs
+        **kwargs,
     ) -> None:
-        """Initialize wrapper around the vector database client
+        """Initialize wrapper around the vector database client.
+
+        Please drop the existing collection if drop_old is True. And create collection
+        if collection not in the Vector Database
 
         Args:
             dim(int): the dimension of the dataset
@@ -122,15 +133,15 @@
         self,
         embeddings: list[list[float]],
         metadata: list[int],
-        kwargs: Any,
-    ) -> int:
+        **kwargs,
+    ) -> (int, Exception):
         """Insert the embeddings to the vector database. The default number of embeddings for
         each insert_embeddings is 5000.
 
         Args:
             embeddings(list[list[float]]): list of embedding to add to the vector database.
             metadatas(list[int]): metadata associated with the embeddings, for filtering.
-            kwargs(Any): vector database specific parameters.
+            **kwargs(Any): vector database specific parameters.
 
         Returns:
             int: inserted data count
@@ -158,13 +169,14 @@
 
     # TODO: remove
     @abstractmethod
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases.
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases.
 
         Should be blocked until the vectorDB is ready to be tested on
         heavy performance cases.
 
-        Time(insert the dataset) + Time(ready_to_search) will be recorded as "load_duration" metric
+        Time(insert the dataset) + Time(optimize) will be recorded as "load_duration" metric
+        Optimize's execution time is limited, the limited time is based on cases.
         """
         raise NotImplementedError
 
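With insert_embeddings now returning a (count, error) pair instead of raising, callers track partial progress as a value. A hypothetical driver under that contract (insert_all and its arguments are illustrative, not part of the package):

    import logging

    log = logging.getLogger(__name__)

    def insert_all(db, vectors: list[list[float]], ids: list[int], batch: int = 5000) -> int:
        """Hypothetical helper: feed a VectorDB in 5000-row batches and stop
        at the first reported error instead of catching an exception."""
        total = 0
        for start in range(0, len(vectors), batch):
            count, err = db.insert_embeddings(vectors[start:start + batch], ids[start:start + batch])
            total += count
            if err is not None:
                log.warning(f"insert stopped after {total} rows: {err!r}")
                break
        return total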
--- a/vectordb_bench/backend/clients/elastic_cloud/config.py
+++ b/vectordb_bench/backend/clients/elastic_cloud/config.py
@@ -4,9 +4,9 @@ from pydantic import SecretStr, BaseModel
 from ..api import DBConfig, DBCaseConfig, MetricType, IndexType
 
 
-class ElasticsearchConfig(DBConfig, BaseModel):
+class ElasticCloudConfig(DBConfig, BaseModel):
     cloud_id: SecretStr
-    password: SecretStr | None = None
+    password: SecretStr
 
     def to_dict(self) -> dict:
         return {
@@ -20,7 +20,7 @@ class ESElementType(str, Enum):
     byte = "byte"  # 1 byte, -128 to 127
 
 
-class ElasticsearchIndexConfig(BaseModel, DBCaseConfig):
+class ElasticCloudIndexConfig(BaseModel, DBCaseConfig):
     element_type: ESElementType = ESElementType.float
     index: IndexType = IndexType.ES_HNSW  # ES only support 'hnsw'
 
@@ -45,8 +45,8 @@ class ElasticsearchIndexConfig(BaseModel, DBCaseConfig):
             "index_options": {
                 "type": self.index.value,
                 "m": self.M,
-                "ef_construction": self.efConstruction
-            }
+                "ef_construction": self.efConstruction,
+            },
         }
         return params
 
--- a/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
+++ b/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
@@ -2,7 +2,7 @@ import logging
 from contextlib import contextmanager
 from typing import Iterable, Type
 from ..api import VectorDB, DBCaseConfig, DBConfig, IndexType
-from .config import ElasticsearchIndexConfig, ElasticsearchConfig
+from .config import ElasticCloudIndexConfig, ElasticCloudConfig
 from elasticsearch.helpers import bulk
 
 
@@ -16,11 +16,12 @@ class ElasticCloud(VectorDB):
         self,
         dim: int,
         db_config: dict,
-        db_case_config: ElasticsearchIndexConfig,
+        db_case_config: ElasticCloudIndexConfig,
         indice: str = "vdb_bench_indice",  # must be lowercase
         id_col_name: str = "id",
         vector_col_name: str = "vector",
         drop_old: bool = False,
+        **kwargs,
     ):
         self.dim = dim
         self.db_config = db_config
@@ -43,12 +44,12 @@
 
     @classmethod
     def config_cls(cls) -> Type[DBConfig]:
-        return ElasticsearchConfig
+        return ElasticCloudConfig
 
 
     @classmethod
     def case_config_cls(cls, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
-        return ElasticsearchIndexConfig
+        return ElasticCloudIndexConfig
 
 
     @contextmanager
@@ -83,7 +84,8 @@ class ElasticCloud(VectorDB):
         self,
         embeddings: Iterable[list[float]],
         metadata: list[int],
-    ) -> int:
+        **kwargs,
+    ) -> (int, Exception):
         """Insert the embeddings to the elasticsearch."""
         assert self.client is not None, "should self.init() first"
 
@@ -99,10 +101,10 @@ class ElasticCloud(VectorDB):
         ]
         try:
             bulk_insert_res = bulk(self.client, insert_data)
-            return bulk_insert_res[0]
+            return (bulk_insert_res[0], None)
         except Exception as e:
             log.warning(f"Failed to insert data: {self.indice} error: {str(e)}")
-            raise e from None
+            return (0, e)
 
     def search_embedding(
         self,
@@ -143,8 +145,8 @@ class ElasticCloud(VectorDB):
             log.warning(f"Failed to search: {self.indice} error: {str(e)}")
             raise e from None
 
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases."""
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases."""
         pass
 
     def ready_to_load(self):
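elasticsearch.helpers.bulk already returns a (success_count, errors) pair, which is why the wrapper reads bulk_insert_res[0] and maps it directly onto the new (count, error) contract. A rough sketch (connection details are placeholders; parameter names follow elasticsearch-py 8.x, and the action shape is assumed since the diff elides the insert_data construction):

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import bulk

    # Placeholder connection: fill in your own cloud_id / credentials.
    client = Elasticsearch(cloud_id="<cloud-id>", basic_auth=("elastic", "<password>"))

    # Assumed action shape; the real insert_data lines are elided above.
    docs = ({"_index": "vdb_bench_indice", "_id": i, "vector": [0.0] * 4} for i in range(8))
    success_count, errors = bulk(client, docs)  # wrapper returns (count, None) or (0, e)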
--- a/vectordb_bench/backend/clients/milvus/config.py
+++ b/vectordb_bench/backend/clients/milvus/config.py
@@ -2,14 +2,13 @@ from pydantic import BaseModel, SecretStr
 from ..api import DBConfig, DBCaseConfig, MetricType, IndexType
 
 
-class MilvusConfig(DBConfig, BaseModel):
-    uri: SecretStr | None = "http://localhost:19530"
+class MilvusConfig(DBConfig):
+    uri: SecretStr = "http://localhost:19530"
 
     def to_dict(self) -> dict:
         return {"uri": self.uri.get_secret_value()}
 
 
-
 class MilvusIndexConfig(BaseModel):
     """Base config for milvus"""
 
--- a/vectordb_bench/backend/clients/milvus/milvus.py
+++ b/vectordb_bench/backend/clients/milvus/milvus.py
@@ -2,7 +2,7 @@
 
 import logging
 from contextlib import contextmanager
-from typing import Any, Iterable, Type
+from typing import Iterable, Type
 
 from pymilvus import Collection, utility
 from pymilvus import CollectionSchema, DataType, FieldSchema, MilvusException
@@ -13,6 +13,7 @@ from .config import MilvusConfig, _milvus_case_config
 
 log = logging.getLogger(__name__)
 
+MILVUS_LOAD_REQS_SIZE = 1.5 * 1024 * 1024
 
 class Milvus(VectorDB):
     def __init__(
@@ -23,12 +24,14 @@ class Milvus(VectorDB):
         collection_name: str = "VectorDBBenchCollection",
         drop_old: bool = False,
         name: str = "Milvus",
+        **kwargs,
     ):
         """Initialize wrapper around the milvus vector database."""
         self.name = name
         self.db_config = db_config
         self.case_config = db_case_config
         self.collection_name = collection_name
+        self.batch_size = int(MILVUS_LOAD_REQS_SIZE / (dim * 4))
 
         self._primary_field = "pk"
         self._scalar_field = "id"
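MILVUS_LOAD_REQS_SIZE caps each insert request at roughly 1.5 MiB of float32 vector payload (4 bytes per dimension). Checking the arithmetic for the 768-dim Cohere cases:

    MILVUS_LOAD_REQS_SIZE = 1.5 * 1024 * 1024  # ~1.5 MiB request budget

    dim = 768  # Cohere embedding dimension used by the Performance cases
    batch_size = int(MILVUS_LOAD_REQS_SIZE / (dim * 4))  # 4 bytes per float32

    assert batch_size == 512                          # rows per insert request
    assert batch_size * dim * 4 == 1.5 * 1024 * 1024  # exactly the budget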
@@ -51,7 +54,7 @@ class Milvus(VectorDB):
             log.info(f"{self.name} create collection: {self.collection_name}")
 
             # Create the collection
-            coll = Collection(
+            Collection(
                 name=self.collection_name,
                 schema=CollectionSchema(fields),
                 consistency_level="Session",
@@ -105,6 +108,14 @@ class Milvus(VectorDB):
 
     def _optimize(self):
         log.info(f"{self.name} optimizing before search")
+        try:
+            self.col.load()
+        except Exception as e:
+            log.warning(f"{self.name} optimize error: {e}")
+            raise e from None
+
+    def _post_insert(self):
+        log.info(f"{self.name} post insert before optimize")
         try:
             self.col.flush()
             self.col.compact()
@@ -117,10 +128,6 @@ class Milvus(VectorDB):
                 index_name=self._index_name,
             )
             utility.wait_for_index_building_complete(self.collection_name)
-            self.col.load()
-            # self.col.load(_refresh=True)
-            # utility.wait_for_loading_complete(self.collection_name)
-            # import time; time.sleep(10)
         except Exception as e:
             log.warning(f"{self.name} optimize error: {e}")
             raise e from None
@@ -130,7 +137,7 @@ class Milvus(VectorDB):
         self._pre_load(self.col)
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         assert self.col, "Please call self.init() before"
         self._optimize()
 
@@ -138,23 +145,29 @@ class Milvus(VectorDB):
         self,
         embeddings: Iterable[list[float]],
         metadata: list[int],
-        **kwargs: Any,
-    ) -> int:
+        **kwargs,
+    ) -> (int, Exception):
         """Insert embeddings into Milvus. should call self.init() first"""
         # use the first insert_embeddings to init collection
         assert self.col is not None
-        insert_data = [
-            metadata,
-            metadata,
-            embeddings,
-        ]
-
+        assert len(embeddings) == len(metadata)
+        insert_count = 0
         try:
-            res = self.col.insert(insert_data, **kwargs)
-            return len(res.primary_keys)
+            for batch_start_offset in range(0, len(embeddings), self.batch_size):
+                batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
+                insert_data = [
+                    metadata[batch_start_offset : batch_end_offset],
+                    metadata[batch_start_offset : batch_end_offset],
+                    embeddings[batch_start_offset : batch_end_offset],
+                ]
+                res = self.col.insert(insert_data)
+                insert_count += len(res.primary_keys)
+            if kwargs.get("last_batch"):
+                self._post_insert()
         except MilvusException as e:
-            log.warning("Failed to insert data")
-            raise e from None
+            log.info(f"Failed to insert data: {e}")
+            return (insert_count, e)
+        return (insert_count, None)
 
     def search_embedding(
         self,
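Taken together: the wrapper re-batches whatever chunk it receives into 512-row requests, and the caller flags the final chunk with last_batch so _post_insert (flush, compact, index build) runs once. A hypothetical driver loop (milvus, all_vectors, and all_ids are placeholders, not package code):

    # Hypothetical driver: `milvus` is an initialized Milvus wrapper and the
    # dataset is fed in the runner's usual 5000-row chunks.
    CHUNK = 5000
    total = 0
    for start in range(0, len(all_vectors), CHUNK):
        end = min(start + CHUNK, len(all_vectors))
        count, err = milvus.insert_embeddings(
            all_vectors[start:end],
            all_ids[start:end],
            last_batch=(end == len(all_vectors)),  # triggers _post_insert once
        )
        total += count
        if err is not None:
            break  # (total, err) tells the runner how far the load got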