vectordb-bench 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. vectordb_bench/__init__.py +1 -0
  2. vectordb_bench/backend/cases.py +45 -1
  3. vectordb_bench/backend/clients/__init__.py +47 -0
  4. vectordb_bench/backend/clients/api.py +2 -0
  5. vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +104 -40
  6. vectordb_bench/backend/clients/aws_opensearch/cli.py +52 -15
  7. vectordb_bench/backend/clients/aws_opensearch/config.py +27 -7
  8. vectordb_bench/backend/clients/hologres/cli.py +50 -0
  9. vectordb_bench/backend/clients/hologres/config.py +121 -0
  10. vectordb_bench/backend/clients/hologres/hologres.py +365 -0
  11. vectordb_bench/backend/clients/lancedb/lancedb.py +1 -0
  12. vectordb_bench/backend/clients/milvus/cli.py +29 -9
  13. vectordb_bench/backend/clients/milvus/config.py +2 -0
  14. vectordb_bench/backend/clients/milvus/milvus.py +1 -1
  15. vectordb_bench/backend/clients/oceanbase/cli.py +1 -0
  16. vectordb_bench/backend/clients/oceanbase/config.py +3 -1
  17. vectordb_bench/backend/clients/oceanbase/oceanbase.py +20 -4
  18. vectordb_bench/backend/clients/oss_opensearch/cli.py +155 -0
  19. vectordb_bench/backend/clients/oss_opensearch/config.py +157 -0
  20. vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +582 -0
  21. vectordb_bench/backend/clients/oss_opensearch/run.py +166 -0
  22. vectordb_bench/backend/clients/pgdiskann/cli.py +45 -0
  23. vectordb_bench/backend/clients/pgdiskann/config.py +16 -0
  24. vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +94 -26
  25. vectordb_bench/backend/clients/s3_vectors/config.py +41 -0
  26. vectordb_bench/backend/clients/s3_vectors/s3_vectors.py +171 -0
  27. vectordb_bench/backend/clients/tidb/cli.py +0 -4
  28. vectordb_bench/backend/clients/tidb/config.py +22 -2
  29. vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -1
  30. vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -1
  31. vectordb_bench/backend/dataset.py +70 -0
  32. vectordb_bench/backend/filter.py +17 -0
  33. vectordb_bench/backend/runner/mp_runner.py +4 -0
  34. vectordb_bench/backend/runner/rate_runner.py +23 -11
  35. vectordb_bench/backend/runner/read_write_runner.py +10 -9
  36. vectordb_bench/backend/runner/serial_runner.py +23 -7
  37. vectordb_bench/backend/task_runner.py +5 -4
  38. vectordb_bench/cli/cli.py +36 -0
  39. vectordb_bench/cli/vectordbbench.py +4 -0
  40. vectordb_bench/fig/custom_case_run_test.png +0 -0
  41. vectordb_bench/fig/custom_dataset.png +0 -0
  42. vectordb_bench/fig/homepage/bar-chart.png +0 -0
  43. vectordb_bench/fig/homepage/concurrent.png +0 -0
  44. vectordb_bench/fig/homepage/custom.png +0 -0
  45. vectordb_bench/fig/homepage/label_filter.png +0 -0
  46. vectordb_bench/fig/homepage/qp$.png +0 -0
  47. vectordb_bench/fig/homepage/run_test.png +0 -0
  48. vectordb_bench/fig/homepage/streaming.png +0 -0
  49. vectordb_bench/fig/homepage/table.png +0 -0
  50. vectordb_bench/fig/run_test_select_case.png +0 -0
  51. vectordb_bench/fig/run_test_select_db.png +0 -0
  52. vectordb_bench/fig/run_test_submit.png +0 -0
  53. vectordb_bench/frontend/components/check_results/filters.py +1 -4
  54. vectordb_bench/frontend/components/check_results/nav.py +2 -1
  55. vectordb_bench/frontend/components/concurrent/charts.py +5 -0
  56. vectordb_bench/frontend/components/int_filter/charts.py +60 -0
  57. vectordb_bench/frontend/components/streaming/data.py +7 -0
  58. vectordb_bench/frontend/components/welcome/welcomePrams.py +42 -4
  59. vectordb_bench/frontend/config/dbCaseConfigs.py +142 -16
  60. vectordb_bench/frontend/config/styles.py +4 -0
  61. vectordb_bench/frontend/pages/concurrent.py +1 -1
  62. vectordb_bench/frontend/pages/custom.py +1 -1
  63. vectordb_bench/frontend/pages/int_filter.py +56 -0
  64. vectordb_bench/frontend/pages/streaming.py +16 -3
  65. vectordb_bench/interface.py +5 -1
  66. vectordb_bench/metric.py +7 -0
  67. vectordb_bench/models.py +39 -4
  68. vectordb_bench/results/S3Vectors/result_20250722_standard_s3vectors.json +2509 -0
  69. vectordb_bench/results/getLeaderboardDataV2.py +23 -2
  70. vectordb_bench/results/leaderboard_v2.json +200 -0
  71. vectordb_bench/results/leaderboard_v2_streaming.json +128 -0
  72. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/METADATA +40 -8
  73. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/RECORD +77 -51
  74. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/WHEEL +0 -0
  75. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/entry_points.txt +0 -0
  76. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/licenses/LICENSE +0 -0
  77. {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/top_level.txt +0 -0
@@ -85,6 +85,7 @@ class OceanBaseHNSWConfig(OceanBaseIndexConfig, DBCaseConfig):
85
85
  class OceanBaseIVFConfig(OceanBaseIndexConfig, DBCaseConfig):
86
86
  m: int
87
87
  sample_per_nlist: int
88
+ nbits: int | None = None
88
89
  nlist: int
89
90
  index: IndexType
90
91
  ivf_nprobes: int | None = None
@@ -96,8 +97,9 @@ class OceanBaseIVFConfig(OceanBaseIndexConfig, DBCaseConfig):
96
97
  "metric_type": self.parse_metric(),
97
98
  "index_type": self.index.value,
98
99
  "params": {
99
- "m": self.M,
100
+ "m": self.m,
100
101
  "sample_per_nlist": self.sample_per_nlist,
102
+ "nbits": self.nbits,
101
103
  "nlist": self.nlist,
102
104
  },
103
105
  }
@@ -7,6 +7,8 @@ from typing import Any
7
7
 
8
8
  import mysql.connector as mysql
9
9
 
10
+ from vectordb_bench.backend.filter import Filter, FilterOp
11
+
10
12
  from ..api import IndexType, VectorDB
11
13
  from .config import OceanBaseConfigDict, OceanBaseHNSWConfig
12
14
 
@@ -16,6 +18,12 @@ OCEANBASE_DEFAULT_LOAD_BATCH_SIZE = 256
16
18
 
17
19
 
18
20
  class OceanBase(VectorDB):
21
+ supported_filter_types: list[FilterOp] = [
22
+ FilterOp.NonFilter,
23
+ FilterOp.NumGE,
24
+ FilterOp.StrEqual,
25
+ ]
26
+
19
27
  def __init__(
20
28
  self,
21
29
  dim: int,
@@ -187,22 +195,30 @@ class OceanBase(VectorDB):
187
195
 
188
196
  return insert_count, None
189
197
 
198
def prepare_filter(self, filters: Filter):
    """Store the SQL ``WHERE`` clause for ``filters`` in ``self.expr``.

    ``search_embedding`` interpolates ``self.expr`` into its ``SELECT``
    statement, so this must be called before searching.

    Args:
        filters: The filter to apply. Only ``FilterOp.NonFilter``,
            ``FilterOp.NumGE`` and ``FilterOp.StrEqual`` are handled.

    Raises:
        ValueError: If ``filters.type`` is any other filter type.
    """
    if filters.type == FilterOp.NonFilter:
        # No filtering: the query gets no WHERE clause at all.
        self.expr = ""
    elif filters.type == FilterOp.NumGE:
        self.expr = f"WHERE id >= {filters.int_value}"
    elif filters.type == FilterOp.StrEqual:
        # SQL equality is a single "="; "==" is not a comparison operator in
        # MySQL-compatible SQL and makes the statement fail to parse.
        # NOTE(review): this compares the label value against the `id`
        # column - confirm whether a dedicated label column should be used.
        self.expr = f"WHERE id = '{filters.label_value}'"
    else:
        msg = f"Not support Filter for Oceanbase - {filters}"
        raise ValueError(msg)
208
+
190
209
  def search_embedding(
191
210
  self,
192
211
  query: list[float],
193
212
  k: int = 100,
194
- filters: dict[str, Any] | None = None,
195
- timeout: int | None = None,
196
213
  ) -> list[int]:
197
214
  if not self._cursor:
198
215
  raise ValueError("Cursor is not initialized")
199
216
 
200
217
  packed = struct.pack(f"<{len(query)}f", *query)
201
218
  hex_vec = packed.hex()
202
- filter_clause = f"WHERE id >= {filters['id']}" if filters else ""
203
219
  query_str = (
204
220
  f"SELECT id FROM {self.table_name} " # noqa: S608
205
- f"{filter_clause} ORDER BY "
221
+ f"{self.expr} ORDER BY "
206
222
  f"{self.db_case_config.parse_metric_func_str()}(embedding, X'{hex_vec}') "
207
223
  f"APPROXIMATE LIMIT {k}"
208
224
  )
@@ -0,0 +1,155 @@
1
+ import logging
2
+ from typing import Annotated, TypedDict, Unpack
3
+
4
+ import click
5
+ from pydantic import SecretStr
6
+
7
+ from ....cli.cli import (
8
+ CommonTypedDict,
9
+ HNSWFlavor1,
10
+ cli,
11
+ click_parameter_decorators_from_typed_dict,
12
+ run,
13
+ )
14
+ from .. import DB
15
+ from .config import OSSOpenSearchQuantization, OSSOS_Engine
16
+
17
+ log = logging.getLogger(__name__)
18
+
19
+
20
class OSSOpenSearchTypedDict(TypedDict):
    """Command-line options specific to the OSS OpenSearch client.

    Every key is annotated with the ``click.option`` that declares its flag;
    the command is decorated with
    ``click_parameter_decorators_from_typed_dict`` over a TypedDict that
    inherits from this one.
    """

    # --- connection ---
    host: Annotated[str, click.option("--host", type=str, help="Db host", required=True)]
    port: Annotated[int, click.option("--port", type=int, default=80, help="Db Port")]
    user: Annotated[str, click.option("--user", type=str, help="Db User")]
    password: Annotated[str, click.option("--password", type=str, help="Db password")]

    # --- index layout ---
    number_of_shards: Annotated[
        int,
        click.option("--number-of-shards", type=int, help="Number of primary shards for the index", default=1),
    ]
    number_of_replicas: Annotated[
        int,
        click.option(
            "--number-of-replicas",
            type=int,
            help="Number of replica copies for each primary shard",
            default=1,
        ),
    ]
    index_thread_qty: Annotated[
        int,
        click.option(
            "--index-thread-qty",
            type=int,
            help="Thread count for native engine indexing",
            default=4,
        ),
    ]
    # The command reads parameters["index_thread_qty_during_force_merge"], so
    # the key has to be declared as an option or that lookup raises KeyError.
    # The default mirrors OSSOpenSearchIndexConfig's default of 8.
    index_thread_qty_during_force_merge: Annotated[
        int,
        click.option(
            "--index-thread-qty-during-force-merge",
            type=int,
            help="Thread count for native engine indexing during force merge",
            default=8,
        ),
    ]

    # `engine` used to be declared twice in this class; the later declaration
    # silently replaced the earlier one and carried help text copied from
    # --quantization-type. It is declared once here. "nmslib" is not offered
    # because OSSOS_Engine only defines "faiss" and "lucene", and the command
    # converts this value with OSSOS_Engine(...).
    engine: Annotated[
        str,
        click.option(
            "--engine",
            type=click.Choice(["faiss", "lucene"], case_sensitive=False),
            help="HNSW algorithm implementation to use",
            default="faiss",
        ),
    ]

    metric_type: Annotated[
        str,
        click.option(
            "--metric-type",
            type=click.Choice(["l2", "cosine", "ip"], case_sensitive=False),
            help="Distance metric type for vector similarity",
            default="l2",
        ),
    ]

    # --- segment / refresh / merge behaviour ---
    number_of_segments: Annotated[
        int,
        click.option("--number-of-segments", type=int, help="Target number of segments after merging", default=1),
    ]

    refresh_interval: Annotated[
        str,
        click.option(
            "--refresh-interval",
            type=str,
            help="How often to make new data available for search",
            default="60s",
        ),
    ]

    force_merge_enabled: Annotated[
        bool,
        click.option("--force-merge-enabled", type=bool, help="Whether to perform force merge operation", default=True),
    ]

    flush_threshold_size: Annotated[
        str,
        click.option(
            "--flush-threshold-size",
            type=str,
            help="Size threshold for flushing the transaction log",
            default="5120mb",
        ),
    ]

    cb_threshold: Annotated[
        str,
        click.option(
            "--cb-threshold",
            type=str,
            help="k-NN Memory circuit breaker threshold",
            default="50%",
        ),
    ]

    # --- quantization ---
    quantization_type: Annotated[
        str | None,
        click.option(
            "--quantization-type",
            type=click.Choice(["fp32", "fp16"]),
            help="quantization type for vectors (in index)",
            default="fp32",
            required=False,
        ),
    ]
120
+
121
+
122
class OSSOpenSearchHNSWTypedDict(CommonTypedDict, OSSOpenSearchTypedDict, HNSWFlavor1):
    """Union of the common, OSS OpenSearch and HNSW option sets; adds no keys of its own."""
123
+
124
+
125
@cli.command()
@click_parameter_decorators_from_typed_dict(OSSOpenSearchHNSWTypedDict)
def OSSOpenSearch(**parameters: Unpack[OSSOpenSearchHNSWTypedDict]):
    """Run the benchmark against OSS OpenSearch with the parsed command-line options."""
    # Imported lazily, as in the original, so the config module is only
    # loaded when this command is actually invoked.
    from .config import OSSOpenSearchConfig, OSSOpenSearchIndexConfig

    # --password has no default, so it is None when omitted. Wrapping None in
    # SecretStr would yield a secret whose length check in
    # OSSOpenSearchConfig.to_dict fails; pass None through instead.
    raw_password = parameters["password"]
    password = SecretStr(raw_password) if raw_password is not None else None

    run(
        db=DB.OSSOpenSearch,
        db_config=OSSOpenSearchConfig(
            host=parameters["host"],
            port=parameters["port"],
            user=parameters["user"],
            password=password,
        ),
        db_case_config=OSSOpenSearchIndexConfig(
            number_of_shards=parameters["number_of_shards"],
            number_of_replicas=parameters["number_of_replicas"],
            index_thread_qty=parameters["index_thread_qty"],
            number_of_segments=parameters["number_of_segments"],
            refresh_interval=parameters["refresh_interval"],
            force_merge_enabled=parameters["force_merge_enabled"],
            flush_threshold_size=parameters["flush_threshold_size"],
            index_thread_qty_during_force_merge=parameters["index_thread_qty_during_force_merge"],
            cb_threshold=parameters["cb_threshold"],
            efConstruction=parameters["ef_construction"],
            efSearch=parameters["ef_runtime"],
            M=parameters["m"],
            engine=OSSOS_Engine(parameters["engine"]),
            # --metric-type was parsed but never forwarded; the index config
            # resolves its space type from `metric_type_name`.
            metric_type_name=parameters["metric_type"],
            quantization_type=OSSOpenSearchQuantization(parameters["quantization_type"]),
        ),
        **parameters,
    )
@@ -0,0 +1,157 @@
1
+ import logging
2
+ from enum import Enum
3
+
4
+ from pydantic import BaseModel, SecretStr, root_validator, validator
5
+
6
+ from ..api import DBCaseConfig, DBConfig, MetricType
7
+
8
+ log = logging.getLogger(__name__)
9
+
10
+
11
class OSSOpenSearchConfig(DBConfig, BaseModel):
    """Connection settings for an OSS OpenSearch endpoint."""

    host: str = ""
    port: int = 80
    user: str | None = None
    password: SecretStr | None = None

    def to_dict(self) -> dict:
        """Return the connection settings as a dict of client keyword arguments.

        SSL and certificate verification are switched on only for port 443;
        ``http_auth`` is an empty tuple unless both user and password are
        non-empty.
        """
        is_https = self.port == 443

        credentials = ()
        missing_credentials = (
            self.user is None or self.password is None or len(self.user) == 0 or len(self.password) == 0
        )
        if not missing_credentials:
            credentials = (self.user, self.password.get_secret_value())

        endpoint = {"host": self.host, "port": self.port}
        return {
            "hosts": [endpoint],
            "http_auth": credentials,
            "use_ssl": is_https,
            "http_compress": True,
            "verify_certs": is_https,
            "ssl_assert_hostname": False,
            "ssl_show_warn": False,
            "timeout": 600,
        }

    @validator("*")
    def not_empty_field(cls, v: any, field: any):
        """Reject empty strings on every field except the exempt ones."""
        exempt = (
            field.name in cls.common_short_configs()
            or field.name in cls.common_long_configs()
            or field.name in ["user", "password", "host"]
        )
        if not exempt and isinstance(v, str | SecretStr) and len(v) == 0:
            raise ValueError("Empty string!")
        return v
46
+
47
+
48
class OSSOS_Engine(Enum):
    """Engine identifiers accepted by the OSS OpenSearch index configuration."""

    # Each member's value is the same string as its name.
    faiss = "faiss"
    lucene = "lucene"
51
+
52
+
53
class OSSOpenSearchQuantization(Enum):
    """Quantization type identifiers accepted by the OSS OpenSearch index configuration."""

    # Each member's value is the same string as its name.
    fp32 = "fp32"
    fp16 = "fp16"
56
+
57
+
58
class OSSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
    """HNSW index and search settings for the OSS OpenSearch client."""

    metric_type: MetricType = MetricType.L2
    engine: OSSOS_Engine = OSSOS_Engine.faiss
    efConstruction: int = 256
    efSearch: int = 100
    # String counterparts of `engine` / `metric_type`; when set they override
    # the enum fields (see validate_engine_name and parse_metric).
    engine_name: str | None = None
    metric_type_name: str | None = None
    M: int = 16
    index_thread_qty: int | None = 4
    number_of_shards: int | None = 1
    number_of_replicas: int | None = 0
    number_of_segments: int | None = 1
    refresh_interval: str | None = "60s"
    force_merge_enabled: bool | None = True
    flush_threshold_size: str | None = "5120mb"
    index_thread_qty_during_force_merge: int = 8
    cb_threshold: str | None = "50%"
    number_of_indexing_clients: int | None = 1
    use_routing: bool = False  # for label-filter cases
    oversample_factor: float = 1.0
    quantization_type: OSSOpenSearchQuantization = OSSOpenSearchQuantization.fp32

    @root_validator
    def validate_engine_name(cls, values: dict):
        """Map engine_name string from UI to engine enum"""
        if values.get("engine_name"):
            engine_name = values["engine_name"].lower()
            try:
                values["engine"] = OSSOS_Engine(engine_name)
            except ValueError:
                # Unknown names fall back to faiss rather than failing validation.
                log.warning(f"Unknown engine_name: {engine_name}, defaulting to faiss")
                values["engine"] = OSSOS_Engine.faiss
        return values

    def _identity(self) -> tuple:
        """Return the fields that define equality and hashing for this config."""
        return (
            self.engine,
            self.M,
            self.efConstruction,
            self.number_of_shards,
            self.number_of_replicas,
            self.number_of_segments,
            self.use_routing,
            self.quantization_type,
        )

    def __eq__(self, obj: object):
        """Compare on the identity fields; defer to the other operand for foreign types."""
        if not isinstance(obj, OSSOpenSearchIndexConfig):
            # Previously this raised AttributeError for unrelated objects.
            return NotImplemented
        return self._identity() == obj._identity()

    def __hash__(self) -> int:
        """Hash the same fields that ``__eq__`` compares."""
        return hash(self._identity())

    def parse_metric(self) -> str:
        """Return the space type string for the configured metric.

        When ``metric_type_name`` is set it is looked up in ``MetricType``
        (case-insensitively) and stored in ``self.metric_type``; otherwise
        the current ``self.metric_type`` is used unchanged.

        Raises:
            KeyError: If ``metric_type_name`` is set but names no ``MetricType`` member.
        """
        log.info(f"User specified metric_type: {self.metric_type_name}")
        # metric_type_name defaults to None; calling .upper() on it
        # unconditionally raised AttributeError whenever it was not supplied.
        if self.metric_type_name:
            self.metric_type = MetricType[self.metric_type_name.upper()]
        if self.metric_type == MetricType.IP:
            return "innerproduct"
        if self.metric_type == MetricType.COSINE:
            return "cosinesimil"
        if self.metric_type == MetricType.L2:
            log.info("Using l2 as specified by user")
            return "l2"
        # Any other metric falls back to l2.
        return "l2"

    @property
    def use_quant(self) -> bool:
        """True when a quantization type other than fp32 is selected."""
        return self.quantization_type is not OSSOpenSearchQuantization.fp32

    def index_param(self) -> dict:
        """Return the HNSW method definition used for index creation."""
        # Resolve the space type once; parse_metric logs and assigns
        # self.metric_type, so calling it repeatedly only duplicates that work.
        space_type = self.parse_metric()
        log.info(f"Using engine: {self.engine} for index creation")
        log.info(f"Using metric_type: {self.metric_type_name} for index creation")
        log.info(f"Resulting space_type: {space_type} for index creation")

        parameters = {
            "ef_construction": self.efConstruction,
            "m": self.M,
        }
        if self.use_quant:
            # The encoder entry is added only for non-fp32 quantization.
            parameters["encoder"] = {"name": "sq", "parameters": {"type": self.quantization_type.value}}

        return {
            "name": "hnsw",
            "engine": self.engine.value,
            "space_type": space_type,
            "parameters": parameters,
        }

    def search_param(self) -> dict:
        """Return the search-time parameters (``ef_search``)."""
        return {"ef_search": self.efSearch}