vectordb-bench 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (34)
  1. vectordb_bench/__init__.py +14 -3
  2. vectordb_bench/backend/cases.py +34 -13
  3. vectordb_bench/backend/clients/__init__.py +6 -1
  4. vectordb_bench/backend/clients/api.py +12 -8
  5. vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +4 -2
  6. vectordb_bench/backend/clients/milvus/milvus.py +17 -10
  7. vectordb_bench/backend/clients/pgvector/config.py +49 -0
  8. vectordb_bench/backend/clients/pgvector/pgvector.py +171 -0
  9. vectordb_bench/backend/clients/pinecone/pinecone.py +4 -3
  10. vectordb_bench/backend/clients/qdrant_cloud/config.py +20 -2
  11. vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +11 -11
  12. vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +5 -5
  13. vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +3 -1
  14. vectordb_bench/backend/dataset.py +99 -149
  15. vectordb_bench/backend/result_collector.py +2 -2
  16. vectordb_bench/backend/runner/mp_runner.py +29 -13
  17. vectordb_bench/backend/runner/serial_runner.py +69 -51
  18. vectordb_bench/backend/task_runner.py +43 -48
  19. vectordb_bench/frontend/components/get_results/saveAsImage.py +4 -2
  20. vectordb_bench/frontend/const/dbCaseConfigs.py +35 -4
  21. vectordb_bench/frontend/const/dbPrices.py +5 -33
  22. vectordb_bench/frontend/const/styles.py +9 -3
  23. vectordb_bench/metric.py +0 -1
  24. vectordb_bench/models.py +12 -8
  25. vectordb_bench/results/dbPrices.json +32 -0
  26. vectordb_bench/results/getLeaderboardData.py +52 -0
  27. vectordb_bench/results/leaderboard.json +1 -0
  28. vectordb_bench/results/{result_20230609_standard.json → result_20230705_standard.json} +670 -214
  29. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/METADATA +98 -13
  30. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/RECORD +34 -29
  31. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/LICENSE +0 -0
  32. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/WHEEL +0 -0
  33. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/entry_points.txt +0 -0
  34. {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/top_level.txt +0 -0
vectordb_bench/__init__.py
@@ -18,12 +18,23 @@ class config:
     USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
 
     RESULTS_LOCAL_DIR = pathlib.Path(__file__).parent.joinpath("results")
-    CASE_TIMEOUT_IN_SECOND = 24 * 60 * 60
+
+    CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
+    LOAD_TIMEOUT_1M = 2.5 * 3600 # 2.5h
+    LOAD_TIMEOUT_10M = 25 * 3600 # 25h
+    LOAD_TIMEOUT_100M = 250 * 3600 # 10.41d
+
+    OPTIMIZE_TIMEOUT_1M = 15 * 60 # 15min
+    OPTIMIZE_TIMEOUT_10M = 2.5 * 3600 # 2.5h
+    OPTIMIZE_TIMEOUT_100M = 25 * 3600 # 1.04d
 
 
     def display(self) -> str:
-        tmp = [i for i in inspect.getmembers(self)
-               if not inspect.ismethod(i[1]) and not i[0].startswith('_') \
+        tmp = [
+            i for i in inspect.getmembers(self)
+            if not inspect.ismethod(i[1])
+            and not i[0].startswith('_')
+            and "TIMEOUT" not in i[0]
         ]
         return tmp
 
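The release replaces the single CASE_TIMEOUT_IN_SECOND with per-stage, per-scale budgets. A minimal sketch of how a caller might select one of them, assuming the 1M/10M/100M tiers above (the helper pick_load_timeout is hypothetical, not part of the package):

    from vectordb_bench import config

    def pick_load_timeout(dataset_size: int) -> float:
        # Hypothetical helper: map a dataset size to the matching load budget.
        if dataset_size <= 1_000_000:
            return config.LOAD_TIMEOUT_1M    # 2.5h
        if dataset_size <= 10_000_000:
            return config.LOAD_TIMEOUT_10M   # 25h
        return config.LOAD_TIMEOUT_100M      # 250h

Note that display() now filters out any member whose name contains "TIMEOUT", so these budgets stay out of the printed config summary.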
vectordb_bench/backend/cases.py
@@ -2,8 +2,10 @@ import typing
 import logging
 from enum import Enum, auto
 
-from . import dataset as ds
-from ..base import BaseModel
+from vectordb_bench import config
+from vectordb_bench.base import BaseModel
+
+from .dataset import Dataset, DatasetManager
 
 
 log = logging.getLogger(__name__)
@@ -44,7 +46,7 @@ class CaseType(Enum):
         if c is not None:
             return c().name
         raise ValueError("Case unsupported")
-
+
     @property
     def case_description(self) -> str:
         c = self.case_cls
@@ -73,7 +75,10 @@ class Case(BaseModel):
     label: CaseLabel
     name: str
     description: str
-    dataset: ds.DataSet
+    dataset: DatasetManager
+
+    load_timeout: float | int
+    optimize_timeout: float | int | None
 
     filter_rate: float | None
 
@@ -92,6 +97,8 @@ class Case(BaseModel):
 class CapacityCase(Case, BaseModel):
     label: CaseLabel = CaseLabel.Load
     filter_rate: float | None = None
+    load_timeout: float | int = config.CAPACITY_TIMEOUT_IN_SECONDS
+    optimize_timeout: float | int | None = None
 
 
 class PerformanceCase(Case, BaseModel):
@@ -101,7 +108,7 @@ class PerformanceCase(Case, BaseModel):
 
 class CapacityDim960(CapacityCase):
     case_id: CaseType = CaseType.CapacityDim960
-    dataset: ds.DataSet = ds.get(ds.Name.GIST, ds.Label.SMALL)
+    dataset: DatasetManager = Dataset.GIST.manager(100_000)
     name: str = "Capacity Test (960 Dim Repeated)"
     description: str = """This case tests the vector database's loading capacity by repeatedly inserting large-dimension vectors (GIST 100K vectors, <b>960 dimensions</b>) until it is fully loaded.
 Number of inserted vectors will be reported."""
@@ -109,7 +116,7 @@ Number of inserted vectors will be reported."""
 
 class CapacityDim128(CapacityCase):
     case_id: CaseType = CaseType.CapacityDim128
-    dataset: ds.DataSet = ds.get(ds.Name.SIFT, ds.Label.SMALL)
+    dataset: DatasetManager = Dataset.SIFT.manager(500_000)
     name: str = "Capacity Test (128 Dim Repeated)"
     description: str = """This case tests the vector database's loading capacity by repeatedly inserting small-dimension vectors (SIFT 100K vectors, <b>128 dimensions</b>) until it is fully loaded.
 Number of inserted vectors will be reported."""
@@ -117,64 +124,78 @@ Number of inserted vectors will be reported."""
 
 class Performance10M(PerformanceCase):
     case_id: CaseType = CaseType.Performance10M
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
     name: str = "Search Performance Test (10M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M(PerformanceCase):
     case_id: CaseType = CaseType.Performance1M
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
     name: str = "Search Performance Test (1M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 class Performance10M1P(PerformanceCase):
     case_id: CaseType = CaseType.Performance10M1P
     filter_rate: float | int | None = 0.01
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
     name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 1%)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M1P(PerformanceCase):
     case_id: CaseType = CaseType.Performance1M1P
     filter_rate: float | int | None = 0.01
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
     name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 1%)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 class Performance10M99P(PerformanceCase):
     case_id: CaseType = CaseType.Performance10M99P
     filter_rate: float | int | None = 0.99
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.LARGE)
+    dataset: DatasetManager = Dataset.COHERE.manager(10_000_000)
     name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 99%)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M99P(PerformanceCase):
     case_id: CaseType = CaseType.Performance1M99P
     filter_rate: float | int | None = 0.99
-    dataset: ds.DataSet = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+    dataset: DatasetManager = Dataset.COHERE.manager(1_000_000)
     name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 99%)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 
 class Performance100M(PerformanceCase):
     case_id: CaseType = CaseType.Performance100M
     filter_rate: float | int | None = None
-    dataset: ds.DataSet = ds.get(ds.Name.LAION, ds.Label.LARGE)
+    dataset: DatasetManager = Dataset.LAION.manager(100_000_000)
     name: str = "Search Performance Test (100M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a large 100M dataset (<b>LAION 100M vectors</b>, 768 dimensions), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_100M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_100M
 
 
 type2case = {
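Every case now declares its own load and optimize budgets alongside its dataset manager, so the task runner can enforce per-case deadlines. A small sketch reading them off a built-in case (printed values follow from the config constants above; assumes the case's remaining fields all carry defaults, as the diff suggests):

    from vectordb_bench.backend.cases import Performance1M

    case = Performance1M()
    print(case.load_timeout)      # 9000.0 (LOAD_TIMEOUT_1M, 2.5h)
    print(case.optimize_timeout)  # 900 (OPTIMIZE_TIMEOUT_1M, 15min)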
vectordb_bench/backend/clients/__init__.py
@@ -15,7 +15,7 @@ from .pinecone.pinecone import Pinecone
 from .weaviate_cloud.weaviate_cloud import WeaviateCloud
 from .qdrant_cloud.qdrant_cloud import QdrantCloud
 from .zilliz_cloud.zilliz_cloud import ZillizCloud
-
+from .pgvector.pgvector import PgVector
 
 class DB(Enum):
     """Database types
@@ -35,6 +35,7 @@ class DB(Enum):
     ElasticCloud = "ElasticCloud"
     QdrantCloud = "QdrantCloud"
     WeaviateCloud = "WeaviateCloud"
+    PgVector = "PgVector"
 
 
     @property
@@ -49,8 +50,12 @@ db2client = {
     DB.ElasticCloud: ElasticCloud,
     DB.QdrantCloud: QdrantCloud,
     DB.Pinecone: Pinecone,
+    DB.PgVector: PgVector
 }
 
+for db in DB:
+    assert issubclass(db.init_cls, VectorDB)
+
 
 __all__ = [
     "DB", "VectorDB", "DBConfig", "DBCaseConfig", "IndexType", "MetricType", "EmptyDBCaseConfig",
vectordb_bench/backend/clients/api.py
@@ -73,7 +73,7 @@ class VectorDB(ABC):
 
     In each process, the benchmark cases ensure VectorDB.init() calls before any other methods operations
 
-    insert_embeddings, search_embedding, and, ready_to_search will be timed for each call.
+    insert_embeddings, search_embedding, and, optimize will be timed for each call.
 
     Examples:
         >>> milvus = Milvus()
@@ -90,9 +90,12 @@ class VectorDB(ABC):
         db_case_config: DBCaseConfig | None,
         collection_name: str,
         drop_old: bool = False,
-        **kwargs
+        **kwargs,
     ) -> None:
-        """Initialize wrapper around the vector database client
+        """Initialize wrapper around the vector database client.
+
+        Please drop the existing collection if drop_old is True. And create collection
+        if collection not in the Vector Database
 
         Args:
             dim(int): the dimension of the dataset
@@ -130,7 +133,7 @@ class VectorDB(ABC):
         self,
         embeddings: list[list[float]],
         metadata: list[int],
-        kwargs: Any,
+        **kwargs,
     ) -> (int, Exception):
         """Insert the embeddings to the vector database. The default number of embeddings for
         each insert_embeddings is 5000.
@@ -138,7 +141,7 @@ class VectorDB(ABC):
         Args:
             embeddings(list[list[float]]): list of embedding to add to the vector database.
             metadatas(list[int]): metadata associated with the embeddings, for filtering.
-            kwargs(Any): vector database specific parameters.
+            **kwargs(Any): vector database specific parameters.
 
         Returns:
             int: inserted data count
@@ -166,13 +169,14 @@ class VectorDB(ABC):
 
     # TODO: remove
     @abstractmethod
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases.
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases.
 
         Should be blocked until the vectorDB is ready to be tested on
         heavy performance cases.
 
-        Time(insert the dataset) + Time(ready_to_search) will be recorded as "load_duration" metric
+        Time(insert the dataset) + Time(optimize) will be recorded as "load_duration" metric
+        Optimize's execution time is limited, the limited time is based on cases.
        """
        raise NotImplementedError
 
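The rename from ready_to_search to optimize keeps the timing contract: insert time plus optimize time is reported as load_duration, and optimize is now bounded by the per-case optimize_timeout. A hedged sketch of that composition (measure_load is illustrative, not the package's actual runner):

    import time

    def measure_load(db, embeddings, metadata) -> float:
        # Illustrative only: load_duration = Time(insert) + Time(optimize).
        start = time.perf_counter()
        with db.init():
            db.insert_embeddings(embeddings, metadata)
            db.optimize()
        return time.perf_counter() - start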
vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
@@ -21,6 +21,7 @@ class ElasticCloud(VectorDB):
         id_col_name: str = "id",
         vector_col_name: str = "vector",
         drop_old: bool = False,
+        **kwargs,
     ):
         self.dim = dim
         self.db_config = db_config
@@ -83,6 +84,7 @@ class ElasticCloud(VectorDB):
         self,
         embeddings: Iterable[list[float]],
         metadata: list[int],
+        **kwargs,
     ) -> (int, Exception):
         """Insert the embeddings to the elasticsearch."""
         assert self.client is not None, "should self.init() first"
@@ -143,8 +145,8 @@ class ElasticCloud(VectorDB):
             log.warning(f"Failed to search: {self.indice} error: {str(e)}")
             raise e from None
 
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases."""
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases."""
         pass
 
     def ready_to_load(self):
vectordb_bench/backend/clients/milvus/milvus.py
@@ -2,7 +2,7 @@
 
 import logging
 from contextlib import contextmanager
-from typing import Any, Iterable, Type
+from typing import Iterable, Type
 
 from pymilvus import Collection, utility
 from pymilvus import CollectionSchema, DataType, FieldSchema, MilvusException
@@ -24,6 +24,7 @@ class Milvus(VectorDB):
         collection_name: str = "VectorDBBenchCollection",
         drop_old: bool = False,
         name: str = "Milvus",
+        **kwargs,
     ):
         """Initialize wrapper around the milvus vector database."""
         self.name = name
@@ -53,7 +54,7 @@ class Milvus(VectorDB):
             log.info(f"{self.name} create collection: {self.collection_name}")
 
             # Create the collection
-            coll = Collection(
+            Collection(
                 name=self.collection_name,
                 schema=CollectionSchema(fields),
                 consistency_level="Session",
@@ -107,6 +108,14 @@ class Milvus(VectorDB):
 
     def _optimize(self):
         log.info(f"{self.name} optimizing before search")
+        try:
+            self.col.load()
+        except Exception as e:
+            log.warning(f"{self.name} optimize error: {e}")
+            raise e from None
+
+    def _post_insert(self):
+        log.info(f"{self.name} post insert before optimize")
         try:
             self.col.flush()
             self.col.compact()
@@ -119,10 +128,6 @@ class Milvus(VectorDB):
                 index_name=self._index_name,
             )
             utility.wait_for_index_building_complete(self.collection_name)
-            self.col.load()
-            # self.col.load(_refresh=True)
-            # utility.wait_for_loading_complete(self.collection_name)
-            # import time; time.sleep(10)
         except Exception as e:
             log.warning(f"{self.name} optimize error: {e}")
             raise e from None
@@ -132,7 +137,7 @@ class Milvus(VectorDB):
         self._pre_load(self.col)
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         assert self.col, "Please call self.init() before"
         self._optimize()
 
@@ -140,7 +145,7 @@ class Milvus(VectorDB):
         self,
         embeddings: Iterable[list[float]],
         metadata: list[int],
-        **kwargs: Any,
+        **kwargs,
    ) -> (int, Exception):
         """Insert embeddings into Milvus. should call self.init() first"""
         # use the first insert_embeddings to init collection
@@ -155,10 +160,12 @@ class Milvus(VectorDB):
                 metadata[batch_start_offset : batch_end_offset],
                 embeddings[batch_start_offset : batch_end_offset],
             ]
-            res = self.col.insert(insert_data, **kwargs)
+            res = self.col.insert(insert_data)
             insert_count += len(res.primary_keys)
+            if kwargs.get("last_batch"):
+                self._post_insert()
         except MilvusException as e:
-            log.warning("Failed to insert data")
+            log.info(f"Failed to insert data: {e}")
             return (insert_count, e)
         return (insert_count, None)
 
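Milvus now defers the flush/compact/index-wait sequence to _post_insert(), which runs only when the caller flags the final batch. A hedged sketch of the expected calling convention (the batches list is illustrative; the runner's real loop may differ):

    batches = [(embeddings_chunk, metadata_chunk)]  # illustrative data
    for i, (emb, meta) in enumerate(batches):
        count, err = milvus.insert_embeddings(
            emb,
            meta,
            last_batch=(i == len(batches) - 1),  # triggers _post_insert() exactly once
        )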
vectordb_bench/backend/clients/pgvector/config.py (new file)
@@ -0,0 +1,49 @@
+from pydantic import BaseModel, SecretStr
+from ..api import DBConfig, DBCaseConfig, MetricType
+
+POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
+
+class PgVectorConfig(DBConfig):
+    user_name: SecretStr = "postgres"
+    password: SecretStr
+    url: SecretStr
+    db_name: str
+
+    def to_dict(self) -> dict:
+        user_str = self.user_name.get_secret_value()
+        pwd_str = self.password.get_secret_value()
+        url_str = self.url.get_secret_value()
+        return {
+            "url" : POSTGRE_URL_PLACEHOLDER%(user_str, pwd_str, url_str, self.db_name)
+        }
+
+class PgVectorIndexConfig(BaseModel, DBCaseConfig):
+    metric_type: MetricType | None = None
+    lists: int | None = 1000
+    probes: int | None = 10
+
+    def parse_metric(self) -> str:
+        if self.metric_type == MetricType.L2:
+            return "vector_l2_ops"
+        elif self.metric_type == MetricType.IP:
+            return "vector_ip_ops"
+        return "vector_cosine_ops"
+
+    def parse_metric_fun_str(self) -> str:
+        if self.metric_type == MetricType.L2:
+            return "l2_distance"
+        elif self.metric_type == MetricType.IP:
+            return "max_inner_product"
+        return "cosine_distance"
+
+    def index_param(self) -> dict:
+        return {
+            "lists" : self.lists,
+            "metric" : self.parse_metric()
+        }
+
+    def search_param(self) -> dict:
+        return {
+            "probes" : self.probes,
+            "metric_fun" : self.parse_metric_fun_str()
+        }
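These two methods feed pgvector's ivfflat index creation and its query-time settings. For example, an L2 case yields (values follow directly from the defaults above):

    from vectordb_bench.backend.clients.api import MetricType
    from vectordb_bench.backend.clients.pgvector.config import PgVectorIndexConfig

    cfg = PgVectorIndexConfig(metric_type=MetricType.L2)
    print(cfg.index_param())   # {'lists': 1000, 'metric': 'vector_l2_ops'}
    print(cfg.search_param())  # {'probes': 10, 'metric_fun': 'l2_distance'}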
vectordb_bench/backend/clients/pgvector/pgvector.py (new file)
@@ -0,0 +1,171 @@
+"""Wrapper around the Pgvector vector database over VectorDB"""
+
+import logging
+import time
+from contextlib import contextmanager
+from typing import Any, Type
+from functools import wraps
+
+from ..api import VectorDB, DBConfig, DBCaseConfig, IndexType
+from pgvector.sqlalchemy import Vector
+from .config import PgVectorConfig, PgVectorIndexConfig
+from sqlalchemy import (
+    MetaData,
+    create_engine,
+    insert,
+    select,
+    Index,
+    Table,
+    text,
+    Column,
+    Float,
+    Integer
+)
+from sqlalchemy.orm import (
+    declarative_base,
+    mapped_column,
+    Session
+)
+
+log = logging.getLogger(__name__)
+
+class PgVector(VectorDB):
+    """ Use SQLAlchemy instructions"""
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: DBCaseConfig,
+        collection_name: str = "PgVectorCollection",
+        drop_old: bool = False,
+        **kwargs,
+    ):
+        self.db_config = db_config
+        self.case_config = db_case_config
+        self.table_name = collection_name
+        self.dim = dim
+
+        self._index_name = "pqvector_index"
+        self._primary_field = "id"
+        self._vector_field = "embedding"
+
+        # construct basic units
+        pg_engine = create_engine(**self.db_config)
+        Base = declarative_base()
+        pq_metadata = Base.metadata
+        pq_metadata.reflect(pg_engine)
+
+        # create vector extension
+        with pg_engine.connect() as conn:
+            conn.execute(text('CREATE EXTENSION IF NOT EXISTS vector'))
+            conn.commit()
+
+        self.pg_table = self._get_table_schema(pq_metadata)
+        if drop_old and self.table_name in pq_metadata.tables:
+            log.info(f"Pgvector client drop table : {self.table_name}")
+            # self.pg_table.drop(pg_engine, checkfirst=True)
+            pq_metadata.drop_all(pg_engine)
+        self._create_table(dim, pg_engine)
+
+
+    @classmethod
+    def config_cls(cls) -> Type[DBConfig]:
+        return PgVectorConfig
+
+    @classmethod
+    def case_config_cls(cls, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
+        return PgVectorIndexConfig
+
+    @contextmanager
+    def init(self) -> None:
+        """
+        Examples:
+            >>> with self.init():
+            >>>     self.insert_embeddings()
+            >>>     self.search_embedding()
+        """
+        self.pg_engine = create_engine(**self.db_config)
+
+        Base = declarative_base()
+        pq_metadata = Base.metadata
+        pq_metadata.reflect(self.pg_engine)
+        self.pg_session = Session(self.pg_engine)
+        self.pg_table = self._get_table_schema(pq_metadata)
+        yield
+        self.pg_session = None
+        self.pg_engine = None
+        del (self.pg_session)
+        del (self.pg_engine)
+
+    def ready_to_load(self):
+        pass
+
+    def optimize(self):
+        pass
+
+    def ready_to_search(self):
+        pass
+
+    def _get_table_schema(self, pq_metadata):
+        return Table(
+            self.table_name,
+            pq_metadata,
+            Column(self._primary_field, Integer, primary_key=True),
+            Column(self._vector_field, Vector(self.dim)),
+            extend_existing=True
+        )
+
+    def _create_index(self, pg_engine):
+        index_param = self.case_config.index_param()
+        index = Index(self._index_name, self.pg_table.c.embedding,
+            postgresql_using='ivfflat',
+            postgresql_with={'lists': index_param["lists"]},
+            postgresql_ops={'embedding': index_param["metric"]}
+        )
+        index.drop(pg_engine, checkfirst = True)
+        index.create(pg_engine)
+
+    def _create_table(self, dim, pg_engine : int):
+        try:
+            # create table
+            self.pg_table.create(bind = pg_engine, checkfirst = True)
+            # create vec index
+            self._create_index(pg_engine)
+        except Exception as e:
+            log.warning(f"Failed to create pgvector table: {self.table_name} error: {e}")
+            raise e from None
+
+    def insert_embeddings(
+        self,
+        embeddings: list[list[float]],
+        metadata: list[int],
+        **kwargs: Any,
+    ) -> (int, Exception):
+        try:
+            items = [dict(id = metadata[i], embedding=embeddings[i]) for i in range(len(metadata))]
+            self.pg_session.execute(insert(self.pg_table), items)
+            self.pg_session.commit()
+            return len(metadata), None
+        except Exception as e:
+            log.warning(f"Failed to insert data into pgvector table ({self.table_name}), error: {e}")
+            return 0, e
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+        timeout: int | None = None,
+    ) -> list[int]:
+        assert self.pg_table is not None
+        search_param =self.case_config.search_param()
+        with self.pg_engine.connect() as conn:
+            conn.execute(text(f'SET ivfflat.probes = {search_param["probes"]}'))
+            conn.commit()
+        op_fun = getattr(self.pg_table.c.embedding, search_param["metric_fun"])
+        if filters:
+            res = self.pg_session.scalars(select(self.pg_table).order_by(op_fun(query)).filter(self.pg_table.c.id > filters.get('id')).limit(k))
+        else:
+            res = self.pg_session.scalars(select(self.pg_table).order_by(op_fun(query)).limit(k))
+        return list(res)
+
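Tying the pieces together, a caller drives the client through init() exactly as its docstring example shows. A hedged end-to-end sketch (connection values are placeholders; pydantic coerces the plain strings into SecretStr):

    from vectordb_bench.backend.clients.pgvector.config import PgVectorConfig, PgVectorIndexConfig
    from vectordb_bench.backend.clients.pgvector.pgvector import PgVector

    db_config = PgVectorConfig(
        user_name="postgres",
        password="example-password",   # placeholder
        url="localhost:5432",          # placeholder host:port
        db_name="vectordb_bench",
    ).to_dict()
    client = PgVector(
        dim=768,
        db_config=db_config,
        db_case_config=PgVectorIndexConfig(),
        drop_old=True,
    )
    with client.init():
        client.insert_embeddings([[0.1] * 768], [1])
        ids = client.search_embedding([0.1] * 768, k=10)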
vectordb_bench/backend/clients/pinecone/pinecone.py
@@ -2,7 +2,7 @@
 
 import logging
 from contextlib import contextmanager
-from typing import Any, Type
+from typing import Type
 
 from ..api import VectorDB, DBConfig, DBCaseConfig, EmptyDBCaseConfig, IndexType
 from .config import PineconeConfig
@@ -20,6 +20,7 @@ class Pinecone(VectorDB):
         db_config: dict,
         db_case_config: DBCaseConfig,
         drop_old: bool = False,
+        **kwargs,
     ):
         """Initialize wrapper around the milvus vector database."""
         self.index_name = db_config["index_name"]
@@ -69,13 +70,14 @@ class Pinecone(VectorDB):
     def ready_to_load(self):
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         pass
 
     def insert_embeddings(
         self,
         embeddings: list[list[float]],
         metadata: list[int],
+        **kwargs,
     ) -> (int, Exception):
         assert len(embeddings) == len(metadata)
         insert_count = 0
@@ -99,7 +101,6 @@ class Pinecone(VectorDB):
         k: int = 100,
         filters: dict | None = None,
         timeout: int | None = None,
-        **kwargs: Any,
     ) -> list[tuple[int, float]]:
         if filters is None:
             pinecone_filters = {}
vectordb_bench/backend/clients/qdrant_cloud/config.py
@@ -1,6 +1,7 @@
-from pydantic import SecretStr
+from pydantic import BaseModel, SecretStr
 
-from ..api import DBConfig
+from ..api import DBConfig, DBCaseConfig, MetricType
+from qdrant_client.models import Distance
 
 
 class QdrantConfig(DBConfig):
@@ -13,3 +14,20 @@ class QdrantConfig(DBConfig):
             "api_key": self.api_key.get_secret_value(),
             "prefer_grpc": True,
         }
+
+class QdrantIndexConfig(BaseModel, DBCaseConfig):
+    metric_type: MetricType | None = None
+
+    def parse_metric(self) -> str:
+        if self.metric_type == MetricType.L2:
+            return Distance.EUCLID
+        elif self.metric_type == MetricType.IP:
+            return Distance.DOT
+        return Distance.COSINE
+
+    def index_param(self) -> dict:
+        params = {"distance": self.parse_metric()}
+        return params
+
+    def search_param(self) -> dict:
+        return {}
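As with the pgvector case config, QdrantIndexConfig folds the metric choice into the parameters used at collection-creation time, while search needs no extra parameters. For instance (assumes qdrant-client is installed):

    from vectordb_bench.backend.clients.api import MetricType
    from vectordb_bench.backend.clients.qdrant_cloud.config import QdrantIndexConfig

    cfg = QdrantIndexConfig(metric_type=MetricType.IP)
    print(cfg.index_param())   # {'distance': <Distance.DOT: 'Dot'>}
    print(cfg.search_param())  # {}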