PyPI - vectordb-bench - Versions diffs - 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl - Mend

vectordb-bench 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

vectordb_bench/backend/clients/__init__.py CHANGED Viewed

@@ -54,8 +54,8 @@ class DB(Enum):
             return ElasticCloud
         if self == DB.QdrantCloud:
-            from .qdrant_cloud.qdrant_cloud import QdrantClient
-            return QdrantClient
+            from .qdrant_cloud.qdrant_cloud import QdrantCloud
+            return QdrantCloud
         if self == DB.WeaviateCloud:
             from .weaviate_cloud.weaviate_cloud import WeaviateCloud
@@ -142,8 +142,8 @@ class DB(Enum):
             return WeaviateIndexConfig
         if self == DB.PgVector:
-            from .pgvector.config import PgVectorIndexConfig
-            return PgVectorIndexConfig
+            from .pgvector.config import _pgvector_case_config
+            return _pgvector_case_config.get(index_type)
         if self == DB.PgVectoRS:
             from .pgvecto_rs.config import _pgvecto_rs_case_config

vectordb_bench/backend/clients/pgvecto_rs/config.py CHANGED Viewed

@@ -8,42 +8,30 @@ POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
 class PgVectoRSConfig(DBConfig):
     user_name: SecretStr = "postgres"
     password: SecretStr
-    url: SecretStr
+    host: str = "localhost"
+    port: int = 5432
     db_name: str
     def to_dict(self) -> dict:
         user_str = self.user_name.get_secret_value()
         pwd_str = self.password.get_secret_value()
-        url_str = self.url.get_secret_value()
-        host, port = url_str.split(":")
         return {
-            "host": host,
-            "port": port,
+            "host": self.host,
+            "port": self.port,
             "dbname": self.db_name,
             "user": user_str,
-            "password": pwd_str,
+            "password": pwd_str
         }
 class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
-    quantizationType: Literal["trivial", "scalar", "product"]
-    quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
-    def parse_quantization(self) -> str:
-        if self.quantizationType == "trivial":
-            return "quantization = { trivial = { } }"
-        elif self.quantizationType == "scalar":
-            return "quantization = { scalar = { } }"
-        else:
-            return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
-            return "l2_ops"
+            return "vector_l2_ops"
         elif self.metric_type == MetricType.IP:
-            return "dot_ops"
-        return "cosine_ops"
+            return "vector_dot_ops"
+        return "vector_cos_ops"
     def parse_metric_fun_op(self) -> str:
         if self.metric_type == MetricType.L2:
@@ -52,16 +40,27 @@ class PgVectoRSIndexConfig(BaseModel, DBCaseConfig):
             return "<#>"
         return "<=>"
+class PgVectoRSQuantConfig(PgVectoRSIndexConfig):
+    quantizationType: Literal["trivial", "scalar", "product"]
+    quantizationRatio: None | Literal["x4", "x8", "x16", "x32", "x64"]
-class HNSWConfig(PgVectoRSIndexConfig):
+    def parse_quantization(self) -> str:
+        if self.quantizationType == "trivial":
+            return "quantization = { trivial = { } }"
+        elif self.quantizationType == "scalar":
+            return "quantization = { scalar = { } }"
+        else:
+            return f'quantization = {{ product = {{ ratio = "{self.quantizationRatio}" }} }}'
+class HNSWConfig(PgVectoRSQuantConfig):
     M: int
     efConstruction: int
     index: IndexType = IndexType.HNSW
     def index_param(self) -> dict:
         options = f"""
-capacity = 1048576
-[algorithm.hnsw]
+[indexing.hnsw]
 m = {self.M}
 ef_construction = {self.efConstruction}
 {self.parse_quantization()}
@@ -72,17 +71,16 @@ ef_construction = {self.efConstruction}
         return {"metrics_op": self.parse_metric_fun_op()}
-class IVFFlatConfig(PgVectoRSIndexConfig):
+class IVFFlatConfig(PgVectoRSQuantConfig):
     nlist: int
     nprobe: int | None = None
     index: IndexType = IndexType.IVFFlat
     def index_param(self) -> dict:
         options = f"""
-capacity = 1048576
-[algorithm.ivf]
+[indexing.ivf]
 nlist = {self.nlist}
-nprob = {self.nprobe if self.nprobe else 10}
+nsample = {self.nprobe if self.nprobe else 10}
 {self.parse_quantization()}
 """
         return {"options": options, "metric": self.parse_metric()}
@@ -90,14 +88,29 @@ nprob = {self.nprobe if self.nprobe else 10}
     def search_param(self) -> dict:
         return {"metrics_op": self.parse_metric_fun_op()}
+class IVFFlatSQ8Config(PgVectoRSIndexConfig):
+    nlist: int
+    nprobe: int | None = None
+    index: IndexType = IndexType.IVFSQ8
+    def index_param(self) -> dict:
+        options = f"""
+[indexing.ivf]
+nlist = {self.nlist}
+nsample = {self.nprobe if self.nprobe else 10}
+quantization = {{ scalar = {{ }} }}
+"""
+        return {"options": options, "metric": self.parse_metric()}
+    def search_param(self) -> dict:
+        return {"metrics_op": self.parse_metric_fun_op()}
-class FLATConfig(PgVectoRSIndexConfig):
+class FLATConfig(PgVectoRSQuantConfig):
     index: IndexType = IndexType.Flat
     def index_param(self) -> dict:
         options = f"""
-capacity = 1048576
-[algorithm.flat]
+[indexing.flat]
 {self.parse_quantization()}
 """
         return {"options": options, "metric": self.parse_metric()}
@@ -107,9 +120,8 @@ capacity = 1048576
 _pgvecto_rs_case_config = {
-    IndexType.AUTOINDEX: HNSWConfig,
     IndexType.HNSW: HNSWConfig,
-    IndexType.DISKANN: HNSWConfig,
     IndexType.IVFFlat: IVFFlatConfig,
+    IndexType.IVFSQ8: IVFFlatSQ8Config,
     IndexType.Flat: FLATConfig,
 }

vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py CHANGED Viewed

@@ -1,18 +1,17 @@
-"""Wrapper around the Pgvector vector database over VectorDB"""
+"""Wrapper around the Pgvecto.rs vector database over VectorDB"""
 import io
 import logging
 from contextlib import contextmanager
 from typing import Any
 import pandas as pd
 import psycopg2
+import psycopg2.extras
 from ..api import VectorDB, DBCaseConfig
 log = logging.getLogger(__name__)
 class PgVectoRS(VectorDB):
     """Use SQLAlchemy instructions"""
@@ -66,6 +65,8 @@ class PgVectoRS(VectorDB):
         self.conn = psycopg2.connect(**self.db_config)
         self.conn.autocommit = False
         self.cursor = self.conn.cursor()
+        self.cursor.execute('SET search_path = "$user", public, vectors')
+        self.conn.commit()
         try:
             yield
@@ -113,7 +114,7 @@ class PgVectoRS(VectorDB):
             self.conn.commit()
         except Exception as e:
             log.warning(
-                f"Failed to create pgvector table: {self.table_name} error: {e}"
+                f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
             )
             raise e from None
@@ -127,13 +128,10 @@ class PgVectoRS(VectorDB):
                 f'CREATE TABLE IF NOT EXISTS public."{self.table_name}" \
                     (id Integer PRIMARY KEY, embedding vector({dim}));'
             )
-            self.cursor.execute(
-                f'ALTER TABLE public."{self.table_name}" ALTER COLUMN embedding SET STORAGE PLAIN;'
-            )
             self.conn.commit()
         except Exception as e:
             log.warning(
-                f"Failed to create pgvector table: {self.table_name} error: {e}"
+                f"Failed to create pgvecto.rs table: {self.table_name} error: {e}"
             )
             raise e from None
@@ -146,22 +144,24 @@ class PgVectoRS(VectorDB):
         assert self.conn is not None, "Connection is not initialized"
         assert self.cursor is not None, "Cursor is not initialized"
+        assert self.conn is not None, "Connection is not initialized"
+        assert self.cursor is not None, "Cursor is not initialized"
         try:
-            items = {"id": metadata, "embedding": embeddings}
+            items = {
+                "id": metadata,
+                "embedding": embeddings
+            }
             df = pd.DataFrame(items)
             csv_buffer = io.StringIO()
             df.to_csv(csv_buffer, index=False, header=False)
             csv_buffer.seek(0)
-            self.cursor.copy_expert(
-                f'COPY public."{self.table_name}" FROM STDIN WITH (FORMAT CSV)',
-                csv_buffer,
-            )
+            self.cursor.copy_expert(f"COPY public.\"{self.table_name}\" FROM STDIN WITH (FORMAT CSV)", csv_buffer)
             self.conn.commit()
             return len(metadata), None
         except Exception as e:
-            log.warning(
-                f"Failed to insert data into pgvector table ({self.table_name}), error: {e}"
-            )
+            log.warning(f"Failed to insert data into pgvecto.rs table ({self.table_name}), error: {e}")
+            return 0, e
     def search_embedding(
         self,

vectordb_bench/backend/clients/pgvector/config.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from pydantic import BaseModel, SecretStr
-from ..api import DBConfig, DBCaseConfig, MetricType
+from ..api import DBConfig, DBCaseConfig, IndexType, MetricType
 POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
@@ -23,39 +23,78 @@ class PgVectorConfig(DBConfig):
 class PgVectorIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None
-    lists: int | None = 1000
-    probes: int | None = 10
+    index: IndexType
-    def parse_metric(self) -> str:
+    def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
             return "vector_l2_ops"
         elif self.metric_type == MetricType.IP:
             return "vector_ip_ops"
         return "vector_cosine_ops"
     def parse_metric_fun_op(self) -> str:
         if self.metric_type == MetricType.L2:
             return "<->"
         elif self.metric_type == MetricType.IP:
             return "<#>"
         return "<=>"
-    def parse_metric_fun_str(self) -> str:
+    def parse_metric_fun_str(self) -> str:
         if self.metric_type == MetricType.L2:
             return "l2_distance"
         elif self.metric_type == MetricType.IP:
             return "max_inner_product"
         return "cosine_distance"
+class HNSWConfig(PgVectorIndexConfig):
+    M: int
+    efConstruction: int
+    ef: int | None = None
+    index: IndexType = IndexType.HNSW
+    def index_param(self) -> dict:
+        return {
+            "metric_type": self.parse_metric(),
+            "index_type": self.index.value,
+            "params": {"M": self.M, "efConstruction": self.efConstruction},
+        }
+    def index_param(self) -> dict:
+        return {
+            "m" : self.M,
+            "efConstruction" : self.efConstruction,
+            "metric" : self.parse_metric()
+        }
+    def search_param(self) -> dict:
+        return {
+            "ef" : self.ef,
+            "metric_fun" : self.parse_metric_fun_str(),
+            "metric_fun_op" : self.parse_metric_fun_op(),
+        }
+class IVFFlatConfig(PgVectorIndexConfig):
+    lists: int | None = 1000
+    probes: int | None = 10
+    index: IndexType = IndexType.IVFFlat
     def index_param(self) -> dict:
         return {
             "lists" : self.lists,
             "metric" : self.parse_metric()
         }
     def search_param(self) -> dict:
         return {
             "probes" : self.probes,
             "metric_fun" : self.parse_metric_fun_str(),
             "metric_fun_op" : self.parse_metric_fun_op(),
-        }
+        }
+_pgvector_case_config = {
+    IndexType.HNSW: HNSWConfig,
+    IndexType.IVFFlat: IVFFlatConfig,
+}

vectordb_bench/backend/clients/pgvector/pgvector.py CHANGED Viewed

@@ -8,7 +8,7 @@ import pandas as pd
 import psycopg2
 import psycopg2.extras
-from ..api import VectorDB, DBCaseConfig
+from ..api import IndexType, VectorDB, DBCaseConfig
 log = logging.getLogger(__name__)
@@ -108,7 +108,14 @@ class PgVector(VectorDB):
         assert self.cursor is not None, "Cursor is not initialized"
         index_param = self.case_config.index_param()
-        self.cursor.execute(f'CREATE INDEX IF NOT EXISTS {self._index_name} ON public."{self.table_name}" USING ivfflat (embedding {index_param["metric"]}) WITH (lists={index_param["lists"]});')
+        if self.case_config.index == IndexType.HNSW:
+            log.debug(f'Creating HNSW index. m={index_param["m"]}, ef_construction={index_param["ef_construction"]}')
+            self.cursor.execute(f'CREATE INDEX IF NOT EXISTS {self._index_name} ON public."{self.table_name}" USING hnsw (embedding {index_param["metric"]}) WITH (m={index_param["m"]}, ef_construction={index_param["ef_construction"]});')
+        elif self.case_config.index == IndexType.IVFFlat:
+            log.debug(f'Creating IVFFLAT index. list={index_param["lists"]}')
+            self.cursor.execute(f'CREATE INDEX IF NOT EXISTS {self._index_name} ON public."{self.table_name}" USING ivfflat (embedding {index_param["metric"]}) WITH (lists={index_param["lists"]});')
+        else:
+            assert "Invalid index type {self.case_config.index}"
         self.conn.commit()
     def _create_table(self, dim : int):
@@ -164,8 +171,15 @@ class PgVector(VectorDB):
         assert self.cursor is not None, "Cursor is not initialized"
         search_param =self.case_config.search_param()
-        self.cursor.execute(f'SET ivfflat.probes = {search_param["probes"]}')
-        self.cursor.execute(f"SELECT id FROM public.\"{self.table_name}\" ORDER BY embedding {search_param['metric_fun_op']} '{query}' LIMIT {k};")
+        if self.case_config.index == IndexType.HNSW:
+            self.cursor.execute(f'SET hnsw.ef_search = {search_param["ef"]}')
+            self.cursor.execute(f"SELECT id FROM public.\"{self.table_name}\" ORDER BY embedding {search_param['metric_fun_op']} '{query}' LIMIT {k};")
+        elif self.case_config.index == IndexType.IVFFlat:
+            self.cursor.execute(f'SET ivfflat.probes = {search_param["probes"]}')
+            self.cursor.execute(f"SELECT id FROM public.\"{self.table_name}\" ORDER BY embedding {search_param['metric_fun_op']} '{query}' LIMIT {k};")
+        else:
+            assert "Invalid index type {self.case_config.index}"
         self.conn.commit()
         result = self.cursor.fetchall()

vectordb_bench/backend/clients/qdrant_cloud/config.py CHANGED Viewed

@@ -1,18 +1,31 @@
 from pydantic import BaseModel, SecretStr
 from ..api import DBConfig, DBCaseConfig, MetricType
+from pydantic import validator
+# Allowing `api_key` to be left empty, to ensure compatibility with the open-source Qdrant.
 class QdrantConfig(DBConfig):
     url: SecretStr
     api_key: SecretStr
     def to_dict(self) -> dict:
-        return {
-            "url": self.url.get_secret_value(),
-            "api_key": self.api_key.get_secret_value(),
-            "prefer_grpc": True,
-        }
+        api_key = self.api_key.get_secret_value()
+        if len(api_key) > 0:
+            return {
+                "url": self.url.get_secret_value(),
+                "api_key": self.api_key.get_secret_value(),
+                "prefer_grpc": True,
+            }
+        else:
+            return {"url": self.url.get_secret_value(),}
+    @validator("*")
+    def not_empty_field(cls, v, field):
+        if field.name in ["api_key", "db_label"]:
+            return v
+        if isinstance(v, (str, SecretStr)) and len(v) == 0:
+            raise ValueError("Empty string!")
+        return v
 class QdrantIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType | None = None

vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py CHANGED Viewed

@@ -43,8 +43,7 @@ class QdrantCloud(VectorDB):
         if drop_old:
             log.info(f"QdrantCloud client drop_old collection: {self.collection_name}")
             tmp_client.delete_collection(self.collection_name)
-        self._create_collection(dim, tmp_client)
+            self._create_collection(dim, tmp_client)
         tmp_client = None
     @contextmanager
@@ -110,13 +109,18 @@ class QdrantCloud(VectorDB):
     ) -> (int, Exception):
         """Insert embeddings into Milvus. should call self.init() first"""
         assert self.qdrant_client is not None
+        QDRANT_BATCH_SIZE = 500
         try:
             # TODO: counts
-            _ = self.qdrant_client.upsert(
-                collection_name=self.collection_name,
-                wait=True,
-                points=Batch(ids=metadata, payloads=[{self._primary_field: v} for v in metadata], vectors=embeddings)
-            )
+            for offset in range(0, len(embeddings), QDRANT_BATCH_SIZE):
+                vectors = embeddings[offset: offset + QDRANT_BATCH_SIZE]
+                ids = metadata[offset: offset + QDRANT_BATCH_SIZE]
+                payloads=[{self._primary_field: v} for v in ids]
+                _ = self.qdrant_client.upsert(
+                    collection_name=self.collection_name,
+                    wait=True,
+                    points=Batch(ids=ids, payloads=payloads, vectors=vectors),
+                )
         except Exception as e:
             log.info(f"Failed to insert data, {e}")
             return 0, e

vectordb_bench/frontend/const/dbCaseConfigs.py CHANGED Viewed

@@ -397,6 +397,11 @@ CaseConfigParamInput_QuantizationType_PgVectoRS = CaseConfigInput(
     inputConfig={
         "options": ["trivial", "scalar", "product"],
     },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
+    in [
+        IndexType.HNSW.value,
+        IndexType.IVFFlat.value,
+    ],
 )
 CaseConfigParamInput_QuantizationRatio_PgVectoRS = CaseConfigInput(
@@ -406,7 +411,11 @@ CaseConfigParamInput_QuantizationRatio_PgVectoRS = CaseConfigInput(
         "options": ["x4", "x8", "x16", "x32", "x64"],
     },
     isDisplayed=lambda config: config.get(CaseConfigParamType.quantizationType, None)
-    == "product",
+    == "product" and config.get(CaseConfigParamType.IndexType, None)
+    in [
+        IndexType.HNSW.value,
+        IndexType.IVFFlat.value,
+    ],
 )
 CaseConfigParamInput_ZillizLevel = CaseConfigInput(

vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json CHANGED Viewed

@@ -20,6 +20,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -49,6 +50,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -78,6 +80,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2
@@ -107,6 +110,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2
@@ -136,6 +140,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2
@@ -165,6 +170,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2
@@ -194,6 +200,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2
@@ -223,6 +230,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "COSINE",
           "lists": 10,
           "probes": 2

vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json CHANGED Viewed

@@ -20,6 +20,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -51,7 +52,8 @@
         "db_case_config": {
           "metric_type": "L2",
           "lists": 10,
-          "probes": 2
+          "probes": 2,
+          "index": "IVF_FLAT"
         },
         "case_config": {
           "case_id": 11,
@@ -80,7 +82,8 @@
         "db_case_config": {
           "metric_type": "L2",
           "lists": 10,
-          "probes": 2
+          "probes": 2,
+          "index": "IVF_FLAT"
         },
         "case_config": {
           "case_id": 12,
@@ -107,6 +110,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -136,6 +140,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -165,6 +170,7 @@
           "db_name": "**********"
         },
         "db_case_config": {
+          "index": "IVF_FLAT",
           "metric_type": "L2",
           "lists": 10,
           "probes": 2
@@ -178,4 +184,4 @@
     }
   ],
   "file_fmt": "result_{}_{}_{}.json"
-}
+}

vectordb-bench 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl

vectordb-bench 0.0.7__py3-none-any.whl → 0.0.8__py3-none-any.whl