vectordb-bench 0.0.6__py3-none-any.whl → 0.0.8__py3-none-any.whl
This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
- vectordb_bench/backend/clients/__init__.py +4 -4
- vectordb_bench/backend/clients/api.py +1 -0
- vectordb_bench/backend/clients/chroma/chroma.py +2 -14
- vectordb_bench/backend/clients/milvus/config.py +19 -0
- vectordb_bench/backend/clients/pgvecto_rs/config.py +44 -32
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +16 -16
- vectordb_bench/backend/clients/pgvector/config.py +63 -12
- vectordb_bench/backend/clients/pgvector/pgvector.py +105 -77
- vectordb_bench/backend/clients/qdrant_cloud/config.py +19 -6
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +11 -7
- vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -0
- vectordb_bench/backend/data_source.py +13 -64
- vectordb_bench/backend/dataset.py +45 -67
- vectordb_bench/backend/runner/serial_runner.py +1 -1
- vectordb_bench/backend/task_runner.py +2 -2
- vectordb_bench/backend/utils.py +30 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +1 -1
- vectordb_bench/frontend/const/dbCaseConfigs.py +41 -77
- vectordb_bench/models.py +1 -0
- vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +8 -0
- vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +9 -3
- vectordb_bench/results/ZillizCloud/{result_20240105_beta_202401_zillizcloud.json → result_20240105_standard_202401_zillizcloud.json} +365 -41
- vectordb_bench/results/getLeaderboardData.py +1 -1
- vectordb_bench/results/leaderboard.json +1 -1
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/METADATA +15 -2
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/RECORD +30 -30
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/WHEEL +1 -1
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.6.dist-info → vectordb_bench-0.0.8.dist-info}/top_level.txt +0 -0
@@ -54,8 +54,8 @@ class DB(Enum):
|
|
54
54
|
return ElasticCloud
|
55
55
|
|
56
56
|
if self == DB.QdrantCloud:
|
57
|
-
from .qdrant_cloud.qdrant_cloud import
|
58
|
-
return
|
57
|
+
from .qdrant_cloud.qdrant_cloud import QdrantCloud
|
58
|
+
return QdrantCloud
|
59
59
|
|
60
60
|
if self == DB.WeaviateCloud:
|
61
61
|
from .weaviate_cloud.weaviate_cloud import WeaviateCloud
|
@@ -142,8 +142,8 @@ class DB(Enum):
|
|
142
142
|
return WeaviateIndexConfig
|
143
143
|
|
144
144
|
if self == DB.PgVector:
|
145
|
-
from .pgvector.config import
|
146
|
-
return
|
145
|
+
from .pgvector.config import _pgvector_case_config
|
146
|
+
return _pgvector_case_config.get(index_type)
|
147
147
|
|
148
148
|
if self == DB.PgVectoRS:
|
149
149
|
from .pgvecto_rs.config import _pgvecto_rs_case_config
|
@@ -106,21 +106,9 @@ class ChromaClient(VectorDB):
|
|
106
106
|
"""
|
107
107
|
if filters:
|
108
108
|
# assumes benchmark test filters of format: {'metadata': '>=10000', 'id': 10000}
|
109
|
-
metadata_value = filters.get("metadata")
|
110
109
|
id_value = filters.get("id")
|
111
|
-
|
112
|
-
|
113
|
-
query_embeddings=query, n_results=k,
|
114
|
-
where={"$and": [{"id": {"$eq": id_value}},
|
115
|
-
{"id": {"$gt": metadata_value}}
|
116
|
-
]}
|
117
|
-
)
|
118
|
-
elif metadata_value:
|
119
|
-
results = self.collection.query(query_embeddings=query, n_results=k,
|
120
|
-
where={"id": {"$gt": metadata_value}})
|
121
|
-
else:
|
122
|
-
results = self.collection.query(query_embeddings=query, n_results=k,
|
123
|
-
where={"id": {"$eq": id_value}})
|
110
|
+
results = self.collection.query(query_embeddings=query, n_results=k,
|
111
|
+
where={"id": {"$gt": id_value}})
|
124
112
|
#return list of id's in results
|
125
113
|
return [int(i) for i in results.get('ids')[0]]
|
126
114
|
results = self.collection.query(query_embeddings=query, n_results=k)
|
@@ -95,6 +95,24 @@ class IVFFlatConfig(MilvusIndexConfig, DBCaseConfig):
|
|
95
95
|
"metric_type": self.parse_metric(),
|
96
96
|
"params": {"nprobe": self.nprobe},
|
97
97
|
}
|
98
|
+
|
99
|
+
class IVFSQ8Config(MilvusIndexConfig, DBCaseConfig):
|
100
|
+
nlist: int
|
101
|
+
nprobe: int | None = None
|
102
|
+
index: IndexType = IndexType.IVFSQ8
|
103
|
+
|
104
|
+
def index_param(self) -> dict:
|
105
|
+
return {
|
106
|
+
"metric_type": self.parse_metric(),
|
107
|
+
"index_type": self.index.value,
|
108
|
+
"params": {"nlist": self.nlist},
|
109
|
+
}
|
110
|
+
|
111
|
+
def search_param(self) -> dict:
|
112
|
+
return {
|
113
|
+
"metric_type": self.parse_metric(),
|
114
|
+
"params": {"nprobe": self.nprobe},
|
115
|
+
}
|
98
116
|
|
99
117
|
|
100
118
|
class FLATConfig(MilvusIndexConfig, DBCaseConfig):
|
@@ -210,6 +228,7 @@ _milvus_case_config = {
|
|
210
228
|
IndexType.HNSW: HNSWConfig,
|
211
229
|
IndexType.DISKANN: DISKANNConfig,
|
212
230
|
IndexType.IVFFlat: IVFFlatConfig,
|
231
|
+
IndexType.IVFSQ8: IVFSQ8Config,
|
213
232
|
IndexType.Flat: FLATConfig,
|
214
233
|
IndexType.GPU_IVF_FLAT: GPUIVFFlatConfig,
|
215
234
|
IndexType.GPU_IVF_PQ: GPUIVFPQConfig,
|
@@ -8,42 +8,30 @@ POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
|
|
8
8
|
class PgVectoRSConfig(DBConfig):
|
9
9
|
user_name: SecretStr = "postgres"
|
10
10
|
password: SecretStr
|
11
|
-
|
11
|
+
host: str = "localhost"
|
12
|
+
port: int = 5432
|
12
13
|
db_name: str
|
13
14
|
|
14
15
|
def to_dict(self) -> dict:
|
15
16
|
user_str = self.user_name.get_secret_value()
|
16
17
|
pwd_str = self.password.get_secret_value()
|
17
|
-
url_str = self.url.get_secret_value()
|
18
|
-
host, port = url_str.split(":")
|
19
18
|
return {
|
20
|
-
"host": host,
|
21
|
-
"port": port,
|
19
|
+
"host": self.host,
|
20
|
+
"port": self.port,
|
22
21
|
"dbname": self.db_name,
|
23
22
|
"user": user_str,
|
24
|
-
"password": pwd_str
|
23
|
+
"password": pwd_str
|
25
24
|
}
|
26
25
|
|
27
|
-
|
28
26
|
class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
|
29
27
|
metric_type: MetricType | None = None
|
30
|
-
quantizationType: Literal["trivial", "scalar", "product"]
|
31
|
-
quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
|
32
|
-
|
33
|
-
def parse_quantization(self) -> str:
|
34
|
-
if self.quantizationType == "trivial":
|
35
|
-
return "quantization = { trivial = { } }"
|
36
|
-
elif self.quantizationType == "scalar":
|
37
|
-
return "quantization = { scalar = { } }"
|
38
|
-
else:
|
39
|
-
return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
|
40
28
|
|
41
29
|
def parse_metric(self) -> str:
|
42
30
|
if self.metric_type == MetricType.L2:
|
43
|
-
return "
|
31
|
+
return "vector_l2_ops"
|
44
32
|
elif self.metric_type == MetricType.IP:
|
45
|
-
return "
|
46
|
-
return "
|
33
|
+
return "vector_dot_ops"
|
34
|
+
return "vector_cos_ops"
|
47
35
|
|
48
36
|
def parse_metric_fun_op(self) -> str:
|
49
37
|
if self.metric_type == MetricType.L2:
|
@@ -52,16 +40,27 @@ class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
|
|
52
40
|
return "<#>"
|
53
41
|
return "<=>"
|
54
42
|
|
43
|
+
class PgVectoRSQuantConfig(PgVectoRSIndexConfig):
|
44
|
+
quantizationType: Literal["trivial", "scalar", "product"]
|
45
|
+
quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
|
55
46
|
|
56
|
-
|
47
|
+
def parse_quantization(self) -> str:
|
48
|
+
if self.quantizationType == "trivial":
|
49
|
+
return "quantization = { trivial = { } }"
|
50
|
+
elif self.quantizationType == "scalar":
|
51
|
+
return "quantization = { scalar = { } }"
|
52
|
+
else:
|
53
|
+
return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
|
54
|
+
|
55
|
+
|
56
|
+
class HNSWConfig(PgVectoRSQuantConfig):
|
57
57
|
M: int
|
58
58
|
efConstruction: int
|
59
59
|
index: IndexType = IndexType.HNSW
|
60
60
|
|
61
61
|
def index_param(self) -> dict:
|
62
62
|
options = f"""
|
63
|
-
|
64
|
-
[algorithm.hnsw]
|
63
|
+
[indexing.hnsw]
|
65
64
|
m = {self.M}
|
66
65
|
ef_construction = {self.efConstruction}
|
67
66
|
{self.parse_quantization()}
|
@@ -72,17 +71,16 @@ ef_construction = {self.efConstruction}
|
|
72
71
|
return {"metrics_op": self.parse_metric_fun_op()}
|
73
72
|
|
74
73
|
|
75
|
-
class IVFFlatConfig(
|
74
|
+
class IVFFlatConfig(PgVectoRSQuantConfig):
|
76
75
|
nlist: int
|
77
76
|
nprobe: int | None = None
|
78
77
|
index: IndexType = IndexType.IVFFlat
|
79
78
|
|
80
79
|
def index_param(self) -> dict:
|
81
80
|
options = f"""
|
82
|
-
|
83
|
-
[algorithm.ivf]
|
81
|
+
[indexing.ivf]
|
84
82
|
nlist = {self.nlist}
|
85
|
-
|
83
|
+
nsample = {self.nprobe if self.nprobe else 10}
|
86
84
|
{self.parse_quantization()}
|
87
85
|
"""
|
88
86
|
return {"options": options, "metric": self.parse_metric()}
|
@@ -90,14 +88,29 @@ nprob = {self.nprobe if self.nprobe else 10}
|
|
90
88
|
def search_param(self) -> dict:
|
91
89
|
return {"metrics_op": self.parse_metric_fun_op()}
|
92
90
|
|
91
|
+
class IVFFlatSQ8Config(PgVectoRSIndexConfig):
|
92
|
+
nlist: int
|
93
|
+
nprobe: int | None = None
|
94
|
+
index: IndexType = IndexType.IVFSQ8
|
95
|
+
|
96
|
+
def index_param(self) -> dict:
|
97
|
+
options = f"""
|
98
|
+
[indexing.ivf]
|
99
|
+
nlist = {self.nlist}
|
100
|
+
nsample = {self.nprobe if self.nprobe else 10}
|
101
|
+
quantization = {{ scalar = {{ }} }}
|
102
|
+
"""
|
103
|
+
return {"options": options, "metric": self.parse_metric()}
|
104
|
+
|
105
|
+
def search_param(self) -> dict:
|
106
|
+
return {"metrics_op": self.parse_metric_fun_op()}
|
93
107
|
|
94
|
-
class FLATConfig(
|
108
|
+
class FLATConfig(PgVectoRSQuantConfig):
|
95
109
|
index: IndexType = IndexType.Flat
|
96
110
|
|
97
111
|
def index_param(self) -> dict:
|
98
112
|
options = f"""
|
99
|
-
|
100
|
-
[algorithm.flat]
|
113
|
+
[indexing.flat]
|
101
114
|
{self.parse_quantization()}
|
102
115
|
"""
|
103
116
|
return {"options": options, "metric": self.parse_metric()}
|
@@ -107,9 +120,8 @@ capacity = 1048576
|
|
107
120
|
|
108
121
|
|
109
122
|
_pgvecto_rs_case_config = {
|
110
|
-
IndexType.AUTOINDEX: HNSWConfig,
|
111
123
|
IndexType.HNSW: HNSWConfig,
|
112
|
-
IndexType.DISKANN: HNSWConfig,
|
113
124
|
IndexType.IVFFlat: IVFFlatConfig,
|
125
|
+
IndexType.IVFSQ8: IVFFlatSQ8Config,
|
114
126
|
IndexType.Flat: FLATConfig,
|
115
127
|
}
|
@@ -1,18 +1,17 @@
|
|
1
|
-
"""Wrapper around the
|
1
|
+
"""Wrapper around the Pgvecto.rs vector database over VectorDB"""
|
2
2
|
|
3
3
|
import io
|
4
4
|
import logging
|
5
5
|
from contextlib import contextmanager
|
6
6
|
from typing import Any
|
7
7
|
import pandas as pd
|
8
|
-
|
9
8
|
import psycopg2
|
9
|
+
import psycopg2.extras
|
10
10
|
|
11
11
|
from ..api import VectorDB, DBCaseConfig
|
12
12
|
|
13
13
|
log = logging.getLogger(__name__)
|
14
14
|
|
15
|
-
|
16
15
|
class PgVectoRS(VectorDB):
|
17
16
|
"""Use SQLAlchemy instructions"""
|
18
17
|
|
@@ -66,6 +65,8 @@ class PgVectoRS(VectorDB):
|
|
66
65
|
self.conn = psycopg2.connect(**self.db_config)
|
67
66
|
self.conn.autocommit = False
|
68
67
|
self.cursor = self.conn.cursor()
|
68
|
+
self.cursor.execute('SET search_path = "$user", public, vectors')
|
69
|
+
self.conn.commit()
|
69
70
|
|
70
71
|
try:
|
71
72
|
yield
|
@@ -113,7 +114,7 @@ class PgVectoRS(VectorDB):
|
|
113
114
|
self.conn.commit()
|
114
115
|
except Exception as e:
|
115
116
|
log.warning(
|
116
|
-
f"Failed to create
|
117
|
+
f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
|
117
118
|
)
|
118
119
|
raise e from None
|
119
120
|
|
@@ -127,13 +128,10 @@ class PgVectoRS(VectorDB):
|
|
127
128
|
f'CREATE TABLE IF NOT EXISTS public."{self.table_name}" \
|
128
129
|
(id Integer PRIMARY KEY, embedding vector({dim}));'
|
129
130
|
)
|
130
|
-
self.cursor.execute(
|
131
|
-
f'ALTER TABLE public."{self.table_name}" ALTER COLUMN embedding SET STORAGE PLAIN;'
|
132
|
-
)
|
133
131
|
self.conn.commit()
|
134
132
|
except Exception as e:
|
135
133
|
log.warning(
|
136
|
-
f"Failed to create
|
134
|
+
f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
|
137
135
|
)
|
138
136
|
raise e from None
|
139
137
|
|
@@ -146,22 +144,24 @@ class PgVectoRS(VectorDB):
|
|
146
144
|
assert self.conn is not None, "Connection is not initialized"
|
147
145
|
assert self.cursor is not None, "Cursor is not initialized"
|
148
146
|
|
147
|
+
assert self.conn is not None, "Connection is not initialized"
|
148
|
+
assert self.cursor is not None, "Cursor is not initialized"
|
149
|
+
|
149
150
|
try:
|
150
|
-
items = {
|
151
|
+
items = {
|
152
|
+
"id": metadata,
|
153
|
+
"embedding": embeddings
|
154
|
+
}
|
151
155
|
df = pd.DataFrame(items)
|
152
156
|
csv_buffer = io.StringIO()
|
153
157
|
df.to_csv(csv_buffer, index=False, header=False)
|
154
158
|
csv_buffer.seek(0)
|
155
|
-
self.cursor.copy_expert(
|
156
|
-
f'COPY public."{self.table_name}" FROM STDIN WITH (FORMAT CSV)',
|
157
|
-
csv_buffer,
|
158
|
-
)
|
159
|
+
self.cursor.copy_expert(f"COPY public.\"{self.table_name}\" FROM STDIN WITH (FORMAT CSV)", csv_buffer)
|
159
160
|
self.conn.commit()
|
160
161
|
return len(metadata), None
|
161
162
|
except Exception as e:
|
162
|
-
log.warning(
|
163
|
-
|
164
|
-
)
|
163
|
+
log.warning(f"Failed to insert data into pgvecto.rs table ({self.table_name}), error: {e}")
|
164
|
+
return 0, e
|
165
165
|
|
166
166
|
def search_embedding(
|
167
167
|
self,
|
@@ -1,49 +1,100 @@
|
|
1
1
|
from pydantic import BaseModel, SecretStr
|
2
|
-
from ..api import DBConfig, DBCaseConfig, MetricType
|
2
|
+
from ..api import DBConfig, DBCaseConfig, IndexType, MetricType
|
3
3
|
|
4
4
|
POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
|
5
5
|
|
6
6
|
class PgVectorConfig(DBConfig):
|
7
7
|
user_name: SecretStr = "postgres"
|
8
8
|
password: SecretStr
|
9
|
-
|
9
|
+
host: str = "localhost"
|
10
|
+
port: int = 5432
|
10
11
|
db_name: str
|
11
12
|
|
12
13
|
def to_dict(self) -> dict:
|
13
14
|
user_str = self.user_name.get_secret_value()
|
14
15
|
pwd_str = self.password.get_secret_value()
|
15
|
-
url_str = self.url.get_secret_value()
|
16
16
|
return {
|
17
|
-
"
|
17
|
+
"host" : self.host,
|
18
|
+
"port" : self.port,
|
19
|
+
"dbname" : self.db_name,
|
20
|
+
"user" : user_str,
|
21
|
+
"password" : pwd_str
|
18
22
|
}
|
19
23
|
|
20
24
|
class PgVectorIndexConfig(BaseModel, DBCaseConfig):
|
21
25
|
metric_type: MetricType | None = None
|
22
|
-
|
23
|
-
probes: int | None = 10
|
26
|
+
index: IndexType
|
24
27
|
|
25
|
-
def parse_metric(self) -> str:
|
28
|
+
def parse_metric(self) -> str:
|
26
29
|
if self.metric_type == MetricType.L2:
|
27
30
|
return "vector_l2_ops"
|
28
31
|
elif self.metric_type == MetricType.IP:
|
29
32
|
return "vector_ip_ops"
|
30
33
|
return "vector_cosine_ops"
|
31
|
-
|
32
|
-
def
|
34
|
+
|
35
|
+
def parse_metric_fun_op(self) -> str:
|
36
|
+
if self.metric_type == MetricType.L2:
|
37
|
+
return "<->"
|
38
|
+
elif self.metric_type == MetricType.IP:
|
39
|
+
return "<#>"
|
40
|
+
return "<=>"
|
41
|
+
|
42
|
+
def parse_metric_fun_str(self) -> str:
|
33
43
|
if self.metric_type == MetricType.L2:
|
34
44
|
return "l2_distance"
|
35
45
|
elif self.metric_type == MetricType.IP:
|
36
46
|
return "max_inner_product"
|
37
47
|
return "cosine_distance"
|
38
48
|
|
49
|
+
|
50
|
+
|
51
|
+
class HNSWConfig(PgVectorIndexConfig):
|
52
|
+
M: int
|
53
|
+
efConstruction: int
|
54
|
+
ef: int | None = None
|
55
|
+
index: IndexType = IndexType.HNSW
|
56
|
+
|
57
|
+
def index_param(self) -> dict:
|
58
|
+
return {
|
59
|
+
"metric_type": self.parse_metric(),
|
60
|
+
"index_type": self.index.value,
|
61
|
+
"params": {"M": self.M, "efConstruction": self.efConstruction},
|
62
|
+
}
|
63
|
+
|
64
|
+
def index_param(self) -> dict:
|
65
|
+
return {
|
66
|
+
"m" : self.M,
|
67
|
+
"efConstruction" : self.efConstruction,
|
68
|
+
"metric" : self.parse_metric()
|
69
|
+
}
|
70
|
+
|
71
|
+
def search_param(self) -> dict:
|
72
|
+
return {
|
73
|
+
"ef" : self.ef,
|
74
|
+
"metric_fun" : self.parse_metric_fun_str(),
|
75
|
+
"metric_fun_op" : self.parse_metric_fun_op(),
|
76
|
+
}
|
77
|
+
|
78
|
+
|
79
|
+
class IVFFlatConfig(PgVectorIndexConfig):
|
80
|
+
lists: int | None = 1000
|
81
|
+
probes: int | None = 10
|
82
|
+
index: IndexType = IndexType.IVFFlat
|
83
|
+
|
39
84
|
def index_param(self) -> dict:
|
40
85
|
return {
|
41
86
|
"lists" : self.lists,
|
42
87
|
"metric" : self.parse_metric()
|
43
88
|
}
|
44
|
-
|
89
|
+
|
45
90
|
def search_param(self) -> dict:
|
46
91
|
return {
|
47
92
|
"probes" : self.probes,
|
48
|
-
"metric_fun" : self.parse_metric_fun_str()
|
49
|
-
|
93
|
+
"metric_fun" : self.parse_metric_fun_str(),
|
94
|
+
"metric_fun_op" : self.parse_metric_fun_op(),
|
95
|
+
}
|
96
|
+
|
97
|
+
_pgvector_case_config = {
|
98
|
+
IndexType.HNSW: HNSWConfig,
|
99
|
+
IndexType.IVFFlat: IVFFlatConfig,
|
100
|
+
}
|