vectordb-bench 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. vectordb_bench/backend/clients/__init__.py +4 -4
  2. vectordb_bench/backend/clients/api.py +1 -0
  3. vectordb_bench/backend/clients/chroma/chroma.py +2 -14
  4. vectordb_bench/backend/clients/milvus/config.py +19 -0
  5. vectordb_bench/backend/clients/pgvecto_rs/config.py +44 -32
  6. vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +16 -16
  7. vectordb_bench/backend/clients/pgvector/config.py +63 -12
  8. vectordb_bench/backend/clients/pgvector/pgvector.py +105 -77
  9. vectordb_bench/backend/clients/qdrant_cloud/config.py +19 -6
  10. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +11 -7
  11. vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -0
  12. vectordb_bench/backend/data_source.py +13 -64
  13. vectordb_bench/backend/dataset.py +45 -67
  14. vectordb_bench/backend/runner/serial_runner.py +1 -1
  15. vectordb_bench/backend/task_runner.py +2 -2
  16. vectordb_bench/backend/utils.py +30 -0
  17. vectordb_bench/frontend/components/run_test/caseSelector.py +1 -1
  18. vectordb_bench/frontend/const/dbCaseConfigs.py +41 -77
  19. vectordb_bench/models.py +1 -0
  20. vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +8 -0
  21. vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +9 -3
  22. vectordb_bench/results/ZillizCloud/{result_20240105_beta_202401_zillizcloud.json → result_20240105_standard_202401_zillizcloud.json} +365 -41
  23. vectordb_bench/results/getLeaderboardData.py +1 -1
  24. vectordb_bench/results/leaderboard.json +1 -1
  25. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/METADATA +15 -2
  26. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/RECORD +30 -30
  27. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/WHEEL +1 -1
  28. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/LICENSE +0 -0
  29. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/entry_points.txt +0 -0
  30. {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/top_level.txt +0 -0
@@ -54,8 +54,8 @@ class DB(Enum):
54
54
  return ElasticCloud
55
55
 
56
56
  if self == DB.QdrantCloud:
57
- from .qdrant_cloud.qdrant_cloud import QdrantClient
58
- return QdrantClient
57
+ from .qdrant_cloud.qdrant_cloud import QdrantCloud
58
+ return QdrantCloud
59
59
 
60
60
  if self == DB.WeaviateCloud:
61
61
  from .weaviate_cloud.weaviate_cloud import WeaviateCloud
@@ -142,8 +142,8 @@ class DB(Enum):
142
142
  return WeaviateIndexConfig
143
143
 
144
144
  if self == DB.PgVector:
145
- from .pgvector.config import PgVectorIndexConfig
146
- return PgVectorIndexConfig
145
+ from .pgvector.config import _pgvector_case_config
146
+ return _pgvector_case_config.get(index_type)
147
147
 
148
148
  if self == DB.PgVectoRS:
149
149
  from .pgvecto_rs.config import _pgvecto_rs_case_config
@@ -16,6 +16,7 @@ class IndexType(str, Enum):
16
16
  HNSW = "HNSW"
17
17
  DISKANN = "DISKANN"
18
18
  IVFFlat = "IVF_FLAT"
19
+ IVFSQ8 = "IVF_SQ8"
19
20
  Flat = "FLAT"
20
21
  AUTOINDEX = "AUTOINDEX"
21
22
  ES_HNSW = "hnsw"
@@ -106,21 +106,9 @@ class ChromaClient(VectorDB):
106
106
  """
107
107
  if filters:
108
108
  # assumes benchmark test filters of format: {'metadata': '>=10000', 'id': 10000}
109
- metadata_value = filters.get("metadata")
110
109
  id_value = filters.get("id")
111
- if metadata_value and id_value:
112
- results = self.collection.query(
113
- query_embeddings=query, n_results=k,
114
- where={"$and": [{"id": {"$eq": id_value}},
115
- {"id": {"$gt": metadata_value}}
116
- ]}
117
- )
118
- elif metadata_value:
119
- results = self.collection.query(query_embeddings=query, n_results=k,
120
- where={"id": {"$gt": metadata_value}})
121
- else:
122
- results = self.collection.query(query_embeddings=query, n_results=k,
123
- where={"id": {"$eq": id_value}})
110
+ results = self.collection.query(query_embeddings=query, n_results=k,
111
+ where={"id": {"$gt": id_value}})
124
112
  #return list of id's in results
125
113
  return [int(i) for i in results.get('ids')[0]]
126
114
  results = self.collection.query(query_embeddings=query, n_results=k)
@@ -95,6 +95,24 @@ class IVFFlatConfig(MilvusIndexConfig, DBCaseConfig):
95
95
  "metric_type": self.parse_metric(),
96
96
  "params": {"nprobe": self.nprobe},
97
97
  }
98
+
99
+ class IVFSQ8Config(MilvusIndexConfig, DBCaseConfig):
100
+ nlist: int
101
+ nprobe: int | None = None
102
+ index: IndexType = IndexType.IVFSQ8
103
+
104
+ def index_param(self) -> dict:
105
+ return {
106
+ "metric_type": self.parse_metric(),
107
+ "index_type": self.index.value,
108
+ "params": {"nlist": self.nlist},
109
+ }
110
+
111
+ def search_param(self) -> dict:
112
+ return {
113
+ "metric_type": self.parse_metric(),
114
+ "params": {"nprobe": self.nprobe},
115
+ }
98
116
 
99
117
 
100
118
  class FLATConfig(MilvusIndexConfig, DBCaseConfig):
@@ -210,6 +228,7 @@ _milvus_case_config = {
210
228
  IndexType.HNSW: HNSWConfig,
211
229
  IndexType.DISKANN: DISKANNConfig,
212
230
  IndexType.IVFFlat: IVFFlatConfig,
231
+ IndexType.IVFSQ8: IVFSQ8Config,
213
232
  IndexType.Flat: FLATConfig,
214
233
  IndexType.GPU_IVF_FLAT: GPUIVFFlatConfig,
215
234
  IndexType.GPU_IVF_PQ: GPUIVFPQConfig,
@@ -8,42 +8,30 @@ POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
8
8
  class PgVectoRSConfig(DBConfig):
9
9
  user_name: SecretStr = "postgres"
10
10
  password: SecretStr
11
- url: SecretStr
11
+ host: str = "localhost"
12
+ port: int = 5432
12
13
  db_name: str
13
14
 
14
15
  def to_dict(self) -> dict:
15
16
  user_str = self.user_name.get_secret_value()
16
17
  pwd_str = self.password.get_secret_value()
17
- url_str = self.url.get_secret_value()
18
- host, port = url_str.split(":")
19
18
  return {
20
- "host": host,
21
- "port": port,
19
+ "host": self.host,
20
+ "port": self.port,
22
21
  "dbname": self.db_name,
23
22
  "user": user_str,
24
- "password": pwd_str,
23
+ "password": pwd_str
25
24
  }
26
25
 
27
-
28
26
  class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
29
27
  metric_type: MetricType | None = None
30
- quantizationType: Literal["trivial", "scalar", "product"]
31
- quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
32
-
33
- def parse_quantization(self) -> str:
34
- if self.quantizationType == "trivial":
35
- return "quantization = { trivial = { } }"
36
- elif self.quantizationType == "scalar":
37
- return "quantization = { scalar = { } }"
38
- else:
39
- return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
40
28
 
41
29
  def parse_metric(self) -> str:
42
30
  if self.metric_type == MetricType.L2:
43
- return "l2_ops"
31
+ return "vector_l2_ops"
44
32
  elif self.metric_type == MetricType.IP:
45
- return "dot_ops"
46
- return "cosine_ops"
33
+ return "vector_dot_ops"
34
+ return "vector_cos_ops"
47
35
 
48
36
  def parse_metric_fun_op(self) -> str:
49
37
  if self.metric_type == MetricType.L2:
@@ -52,16 +40,27 @@ class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
52
40
  return "<#>"
53
41
  return "<=>"
54
42
 
43
+ class PgVectoRSQuantConfig(PgVectoRSIndexConfig):
44
+ quantizationType: Literal["trivial", "scalar", "product"]
45
+ quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
55
46
 
56
- class HNSWConfig(PgVectoRSIndexConfig):
47
+ def parse_quantization(self) -> str:
48
+ if self.quantizationType == "trivial":
49
+ return "quantization = { trivial = { } }"
50
+ elif self.quantizationType == "scalar":
51
+ return "quantization = { scalar = { } }"
52
+ else:
53
+ return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
54
+
55
+
56
+ class HNSWConfig(PgVectoRSQuantConfig):
57
57
  M: int
58
58
  efConstruction: int
59
59
  index: IndexType = IndexType.HNSW
60
60
 
61
61
  def index_param(self) -> dict:
62
62
  options = f"""
63
- capacity = 1048576
64
- [algorithm.hnsw]
63
+ [indexing.hnsw]
65
64
  m = {self.M}
66
65
  ef_construction = {self.efConstruction}
67
66
  {self.parse_quantization()}
@@ -72,17 +71,16 @@ ef_construction = {self.efConstruction}
72
71
  return {"metrics_op": self.parse_metric_fun_op()}
73
72
 
74
73
 
75
- class IVFFlatConfig(PgVectoRSIndexConfig):
74
+ class IVFFlatConfig(PgVectoRSQuantConfig):
76
75
  nlist: int
77
76
  nprobe: int | None = None
78
77
  index: IndexType = IndexType.IVFFlat
79
78
 
80
79
  def index_param(self) -> dict:
81
80
  options = f"""
82
- capacity = 1048576
83
- [algorithm.ivf]
81
+ [indexing.ivf]
84
82
  nlist = {self.nlist}
85
- nprob = {self.nprobe if self.nprobe else 10}
83
+ nsample = {self.nprobe if self.nprobe else 10}
86
84
  {self.parse_quantization()}
87
85
  """
88
86
  return {"options": options, "metric": self.parse_metric()}
@@ -90,14 +88,29 @@ nprob = {self.nprobe if self.nprobe else 10}
90
88
  def search_param(self) -> dict:
91
89
  return {"metrics_op": self.parse_metric_fun_op()}
92
90
 
91
+ class IVFFlatSQ8Config(PgVectoRSIndexConfig):
92
+ nlist: int
93
+ nprobe: int | None = None
94
+ index: IndexType = IndexType.IVFSQ8
95
+
96
+ def index_param(self) -> dict:
97
+ options = f"""
98
+ [indexing.ivf]
99
+ nlist = {self.nlist}
100
+ nsample = {self.nprobe if self.nprobe else 10}
101
+ quantization = {{ scalar = {{ }} }}
102
+ """
103
+ return {"options": options, "metric": self.parse_metric()}
104
+
105
+ def search_param(self) -> dict:
106
+ return {"metrics_op": self.parse_metric_fun_op()}
93
107
 
94
- class FLATConfig(PgVectoRSIndexConfig):
108
+ class FLATConfig(PgVectoRSQuantConfig):
95
109
  index: IndexType = IndexType.Flat
96
110
 
97
111
  def index_param(self) -> dict:
98
112
  options = f"""
99
- capacity = 1048576
100
- [algorithm.flat]
113
+ [indexing.flat]
101
114
  {self.parse_quantization()}
102
115
  """
103
116
  return {"options": options, "metric": self.parse_metric()}
@@ -107,9 +120,8 @@ capacity = 1048576
107
120
 
108
121
 
109
122
  _pgvecto_rs_case_config = {
110
- IndexType.AUTOINDEX: HNSWConfig,
111
123
  IndexType.HNSW: HNSWConfig,
112
- IndexType.DISKANN: HNSWConfig,
113
124
  IndexType.IVFFlat: IVFFlatConfig,
125
+ IndexType.IVFSQ8: IVFFlatSQ8Config,
114
126
  IndexType.Flat: FLATConfig,
115
127
  }
@@ -1,18 +1,17 @@
1
- """Wrapper around the Pgvector vector database over VectorDB"""
1
+ """Wrapper around the Pgvecto.rs vector database over VectorDB"""
2
2
 
3
3
  import io
4
4
  import logging
5
5
  from contextlib import contextmanager
6
6
  from typing import Any
7
7
  import pandas as pd
8
-
9
8
  import psycopg2
9
+ import psycopg2.extras
10
10
 
11
11
  from ..api import VectorDB, DBCaseConfig
12
12
 
13
13
  log = logging.getLogger(__name__)
14
14
 
15
-
16
15
  class PgVectoRS(VectorDB):
17
16
  """Use SQLAlchemy instructions"""
18
17
 
@@ -66,6 +65,8 @@ class PgVectoRS(VectorDB):
66
65
  self.conn = psycopg2.connect(**self.db_config)
67
66
  self.conn.autocommit = False
68
67
  self.cursor = self.conn.cursor()
68
+ self.cursor.execute('SET search_path = "$user", public, vectors')
69
+ self.conn.commit()
69
70
 
70
71
  try:
71
72
  yield
@@ -113,7 +114,7 @@ class PgVectoRS(VectorDB):
113
114
  self.conn.commit()
114
115
  except Exception as e:
115
116
  log.warning(
116
- f"Failed to create pgvector table: {self.table_name} error: {e}"
117
+ f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
117
118
  )
118
119
  raise e from None
119
120
 
@@ -127,13 +128,10 @@ class PgVectoRS(VectorDB):
127
128
  f'CREATE TABLE IF NOT EXISTS public."{self.table_name}" \
128
129
  (id Integer PRIMARY KEY, embedding vector({dim}));'
129
130
  )
130
- self.cursor.execute(
131
- f'ALTER TABLE public."{self.table_name}" ALTER COLUMN embedding SET STORAGE PLAIN;'
132
- )
133
131
  self.conn.commit()
134
132
  except Exception as e:
135
133
  log.warning(
136
- f"Failed to create pgvector table: {self.table_name} error: {e}"
134
+ f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
137
135
  )
138
136
  raise e from None
139
137
 
@@ -146,22 +144,24 @@ class PgVectoRS(VectorDB):
146
144
  assert self.conn is not None, "Connection is not initialized"
147
145
  assert self.cursor is not None, "Cursor is not initialized"
148
146
 
147
+ assert self.conn is not None, "Connection is not initialized"
148
+ assert self.cursor is not None, "Cursor is not initialized"
149
+
149
150
  try:
150
- items = {"id": metadata, "embedding": embeddings}
151
+ items = {
152
+ "id": metadata,
153
+ "embedding": embeddings
154
+ }
151
155
  df = pd.DataFrame(items)
152
156
  csv_buffer = io.StringIO()
153
157
  df.to_csv(csv_buffer, index=False, header=False)
154
158
  csv_buffer.seek(0)
155
- self.cursor.copy_expert(
156
- f'COPY public."{self.table_name}" FROM STDIN WITH (FORMAT CSV)',
157
- csv_buffer,
158
- )
159
+ self.cursor.copy_expert(f"COPY public.\"{self.table_name}\" FROM STDIN WITH (FORMAT CSV)", csv_buffer)
159
160
  self.conn.commit()
160
161
  return len(metadata), None
161
162
  except Exception as e:
162
- log.warning(
163
- f"Failed to insert data into pgvector table ({self.table_name}), error: {e}"
164
- )
163
+ log.warning(f"Failed to insert data into pgvecto.rs table ({self.table_name}), error: {e}")
164
+ return 0, e
165
165
 
166
166
  def search_embedding(
167
167
  self,
@@ -1,49 +1,100 @@
1
1
  from pydantic import BaseModel, SecretStr
2
- from ..api import DBConfig, DBCaseConfig, MetricType
2
+ from ..api import DBConfig, DBCaseConfig, IndexType, MetricType
3
3
 
4
4
  POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
5
5
 
6
6
  class PgVectorConfig(DBConfig):
7
7
  user_name: SecretStr = "postgres"
8
8
  password: SecretStr
9
- url: SecretStr
9
+ host: str = "localhost"
10
+ port: int = 5432
10
11
  db_name: str
11
12
 
12
13
  def to_dict(self) -> dict:
13
14
  user_str = self.user_name.get_secret_value()
14
15
  pwd_str = self.password.get_secret_value()
15
- url_str = self.url.get_secret_value()
16
16
  return {
17
- "url" : POSTGRE_URL_PLACEHOLDER%(user_str, pwd_str, url_str, self.db_name)
17
+ "host" : self.host,
18
+ "port" : self.port,
19
+ "dbname" : self.db_name,
20
+ "user" : user_str,
21
+ "password" : pwd_str
18
22
  }
19
23
 
20
24
  class PgVectorIndexConfig(BaseModel, DBCaseConfig):
21
25
  metric_type: MetricType | None = None
22
- lists: int | None = 1000
23
- probes: int | None = 10
26
+ index: IndexType
24
27
 
25
- def parse_metric(self) -> str:
28
+ def parse_metric(self) -> str:
26
29
  if self.metric_type == MetricType.L2:
27
30
  return "vector_l2_ops"
28
31
  elif self.metric_type == MetricType.IP:
29
32
  return "vector_ip_ops"
30
33
  return "vector_cosine_ops"
31
-
32
- def parse_metric_fun_str(self) -> str:
34
+
35
+ def parse_metric_fun_op(self) -> str:
36
+ if self.metric_type == MetricType.L2:
37
+ return "<->"
38
+ elif self.metric_type == MetricType.IP:
39
+ return "<#>"
40
+ return "<=>"
41
+
42
+ def parse_metric_fun_str(self) -> str:
33
43
  if self.metric_type == MetricType.L2:
34
44
  return "l2_distance"
35
45
  elif self.metric_type == MetricType.IP:
36
46
  return "max_inner_product"
37
47
  return "cosine_distance"
38
48
 
49
+
50
+
51
+ class HNSWConfig(PgVectorIndexConfig):
52
+ M: int
53
+ efConstruction: int
54
+ ef: int | None = None
55
+ index: IndexType = IndexType.HNSW
56
+
57
+ def index_param(self) -> dict:
58
+ return {
59
+ "metric_type": self.parse_metric(),
60
+ "index_type": self.index.value,
61
+ "params": {"M": self.M, "efConstruction": self.efConstruction},
62
+ }
63
+
64
+ def index_param(self) -> dict:
65
+ return {
66
+ "m" : self.M,
67
+ "efConstruction" : self.efConstruction,
68
+ "metric" : self.parse_metric()
69
+ }
70
+
71
+ def search_param(self) -> dict:
72
+ return {
73
+ "ef" : self.ef,
74
+ "metric_fun" : self.parse_metric_fun_str(),
75
+ "metric_fun_op" : self.parse_metric_fun_op(),
76
+ }
77
+
78
+
79
+ class IVFFlatConfig(PgVectorIndexConfig):
80
+ lists: int | None = 1000
81
+ probes: int | None = 10
82
+ index: IndexType = IndexType.IVFFlat
83
+
39
84
  def index_param(self) -> dict:
40
85
  return {
41
86
  "lists" : self.lists,
42
87
  "metric" : self.parse_metric()
43
88
  }
44
-
89
+
45
90
  def search_param(self) -> dict:
46
91
  return {
47
92
  "probes" : self.probes,
48
- "metric_fun" : self.parse_metric_fun_str()
49
- }
93
+ "metric_fun" : self.parse_metric_fun_str(),
94
+ "metric_fun_op" : self.parse_metric_fun_op(),
95
+ }
96
+
97
+ _pgvector_case_config = {
98
+ IndexType.HNSW: HNSWConfig,
99
+ IndexType.IVFFlat: IVFFlatConfig,
100
+ }