PyPI - vectordb-bench - Versions diffs - 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl - Mend

vectordb-bench 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

vectordb_bench/backend/clients/vespa/vespa.py ADDED Viewed

@@ -0,0 +1,249 @@
+import datetime
+import logging
+import math
+from collections.abc import Generator
+from contextlib import contextmanager
+from vespa import application
+from ..api import VectorDB
+from . import util
+from .config import VespaHNSWConfig
+log = logging.getLogger(__name__)
+class Vespa(VectorDB):
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict[str, str],
+        db_case_config: VespaHNSWConfig | None = None,
+        collection_name: str = "VectorDBBenchCollection",
+        drop_old: bool = False,
+        **kwargs,
+    ) -> None:
+        """Deploy the benchmark application to Vespa and optionally clear old documents.
+        Args:
+            dim: dimension of the embedding vectors; used to size the tensor fields.
+            db_config: connection settings; the "url" and "port" keys are read.
+            db_case_config: index / quantization settings. A default
+                ``VespaHNSWConfig()`` is used when omitted.
+            collection_name: name of the Vespa schema that holds the documents.
+            drop_old: when true, ask Vespa to delete all documents of this schema
+                from the ``vectordbbench_content`` content cluster.
+            **kwargs: accepted for interface compatibility and ignored.
+        """
+        self.dim = dim
+        self.db_config = db_config
+        self.case_config = db_case_config or VespaHNSWConfig()
+        self.schema_name = collection_name
+        # The feed/query client only exists inside ``init()``. Define the attribute up front so
+        # the ``assert self.client is not None`` guards fail as assertions, not AttributeError.
+        self.client = None
+        client = self.deploy_http()
+        client.wait_for_application_up()
+        if drop_old:
+            try:
+                client.delete_all_docs("vectordbbench_content", self.schema_name)
+            except Exception:
+                # Best effort: a failed cleanup is logged with its traceback, not fatal.
+                log.exception("Vespa client drop_old schema: %s", self.schema_name)
+    @contextmanager
+    def init(self) -> Generator[None, None, None]:
+        """Create the Vespa client for the duration of a ``with`` block and drop it afterwards.
+        Why contextmanager:
+            In multiprocessing search tasks, vectordbbench might create
+            hundreds of thousands of connections to the DB server in total.
+            Too many connections may drain local FDs or server connection resources.
+            If the DB client doesn't have a `close()` method, just set the object to None.
+        Examples:
+            >>> with self.init():
+            >>>     self.insert_embeddings()
+        """
+        self.client = application.Vespa(self.db_config["url"], port=self.db_config["port"])
+        try:
+            yield
+        finally:
+            # Drop the client even when the body of the ``with`` block raises.
+            self.client = None
+    def need_normalize_cosine(self) -> bool:
+        """Report whether the dataset must be normalized before COSINE can be used.
+        Returns:
+            bool: always ``False`` for this client.
+        """
+        return False
+    def insert_embeddings(
+        self,
+        embeddings: list[list[float]],
+        metadata: list[int],
+        **kwargs,
+    ) -> tuple[int, Exception | None]:
+        """Feed the embeddings to Vespa, one document per (id, embedding) pair.
+        Args:
+            embeddings(list[list[float]]): list of embeddings to add to the vector database.
+            metadata(list[int]): ids paired one-to-one with ``embeddings``; each id is used
+                both as the document id and as the ``id`` field.
+            **kwargs(Any): vector database specific parameters (unused here).
+        Returns:
+            tuple[int, Exception | None]: number of documents Vespa accepted, and an error
+                describing the rejected documents (``None`` when all were accepted).
+        """
+        assert self.client is not None
+        data = ({"id": str(i), "fields": {"id": i, "embedding": e}} for i, e in zip(metadata, embeddings, strict=True))
+        failed_ids: list[str] = []
+        def record_failure(response, doc_id: str) -> None:
+            # feed_iterable reports per-document results only through this callback; without
+            # it, rejected documents would silently be counted as inserted.
+            if not response.is_successful():
+                failed_ids.append(doc_id)
+        self.client.feed_iterable(data, self.schema_name, callback=record_failure)
+        if failed_ids:
+            error = RuntimeError(f"Vespa rejected {len(failed_ids)} document(s), first ids: {failed_ids[:10]}")
+            return len(embeddings) - len(failed_ids), error
+        return len(embeddings), None
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+    ) -> list[int]:
+        """Get k most similar embeddings to query vector.
+        Args:
+            query(list[float]): query embedding to look up documents similar to.
+            k(int): Number of most similar embeddings to return. Defaults to 100.
+            filters(dict, optional): when it carries an "id" entry, only documents whose
+                id is greater than or equal to that value are searched.
+        Returns:
+            list[int]: ids of the matching documents, at most k; empty when nothing matched.
+        """
+        assert self.client is not None
+        quantization = self.case_config.quantization_type
+        # Only the part of ef that exceeds k is requested as additional exploration.
+        extra_ef = max(0, self.case_config.ef - k)
+        embedding_field = "embedding" if quantization == "none" else "embedding_binary"
+        yql = (
+            f"select id from {self.schema_name} where "  # noqa: S608
+            f"{{targetHits: {k}, hnsw.exploreAdditionalHits: {extra_ef}}}"
+            f"nearestNeighbor({embedding_field}, query_embedding)"
+        )
+        # Skip the clause when no id bound is given, instead of emitting "id >= None".
+        id_filter = filters.get("id") if filters else None
+        if id_filter is not None:
+            yql += f" and id >= {id_filter}"
+        query_embedding = query if quantization == "none" else util.binarize_tensor(query)
+        result = self.client.query(
+            {
+                "yql": yql,
+                "input.query(query_embedding)": query_embedding,
+                "ranking": quantization,
+                # Vespa returns 10 hits by default; request k explicitly.
+                "hits": k,
+            }
+        )
+        # The result root has no "children" entry when there are no hits.
+        hits = result.get_json()["root"].get("children", [])
+        return [hit["fields"]["id"] for hit in hits]
+    def optimize(self, data_size: int | None = None):
+        """Hook called between the insertion and search phases of performance cases.
+        It should block until the vectorDB is ready for heavy performance cases.
+        Time(insert the dataset) + Time(optimize) is recorded as the "load_duration"
+        metric, and the time allowed for this step is limited per case.
+        This client does no work here and ignores ``data_size``.
+        """
+        return None
+    @property
+    def application_package(self):
+        """Return the application package, building and caching it on first access."""
+        package = getattr(self, "_application_package", None)
+        if package is None:
+            package = self._create_application_package()
+            self._application_package = package
+        return package
+    def _create_application_package(self):
+        """Build the ``vectordbbench`` application package for the configured schema.
+        The schema always has an ``id`` field and a float ``embedding`` tensor field with an
+        HNSW index. With "binary" quantization it also gets an ``embedding_binary`` field
+        derived from ``embedding`` and indexed with the hamming distance metric. Two rank
+        profiles, "none" and "binary", declare the query tensor type for each mode.
+        """
+        from vespa.package import (
+            HNSW,
+            ApplicationPackage,
+            Document,
+            Field,
+            RankProfile,
+            Schema,
+            Validation,
+            ValidationID,
+        )
+        id_field = Field("id", "int", indexing=["summary", "attribute"])
+        float_field = Field(
+            "embedding",
+            f"tensor<float>(x[{self.dim}])",
+            indexing=["summary", "attribute", "index"],
+            ann=HNSW(**self.case_config.index_param()),
+        )
+        fields = [id_field, float_field]
+        if self.case_config.quantization_type == "binary":
+            # Same index parameters as the float field, but compared with hamming distance.
+            binary_params = {**self.case_config.index_param(), "distance_metric": "hamming"}
+            binary_indexing = [
+                "input embedding",
+                # convert 32 bit float to 1 bit
+                "binarize",
+                # pack 8 bits into one int8
+                "pack_bits",
+                "summary",
+                "attribute",
+                "index",
+            ]
+            binary_field = Field(
+                "embedding_binary",
+                f"tensor<int8>(x[{math.ceil(self.dim / 8)}])",
+                indexing=binary_indexing,
+                ann=HNSW(**binary_params),
+                is_document_field=False,
+            )
+            fields.append(binary_field)
+        # The validation overrides below are declared valid until tomorrow.
+        tomorrow = datetime.date.today() + datetime.timedelta(days=1)
+        float_profile = RankProfile(
+            name="none",
+            first_phase="",
+            inherits="default",
+            inputs=[("query(query_embedding)", f"tensor<float>(x[{self.dim}])")],
+        )
+        binary_profile = RankProfile(
+            name="binary",
+            first_phase="",
+            inherits="default",
+            inputs=[("query(query_embedding)", f"tensor<int8>(x[{math.ceil(self.dim / 8)}])")],
+        )
+        schema = Schema(
+            self.schema_name,
+            Document(fields),
+            rank_profiles=[float_profile, binary_profile],
+        )
+        overrides = [
+            Validation(ValidationID.tensorTypeChange, until=tomorrow),
+            Validation(ValidationID.fieldTypeChange, until=tomorrow),
+        ]
+        return ApplicationPackage("vectordbbench", [schema], validations=overrides)
+    def deploy_http(self) -> application.Vespa:
+        """Upload the application package to Vespa over HTTP.
+        The zipped package is POSTed to the ``prepareandactivate`` endpoint on port 19071
+        of ``db_config["url"]``, with a 10 second timeout.
+        Returns:
+            application.Vespa: client for the deployed application.
+        Raises:
+            RuntimeError: if the HTTP request fails; the response text is appended
+                to the message when a response is available.
+        """
+        import requests
+        endpoint = self.db_config["url"] + ":19071/application/v2/tenant/default/prepareandactivate"
+        zipped_package = self.application_package.to_zip()
+        try:
+            response = requests.post(
+                url=endpoint,
+                data=zipped_package,
+                headers={"Content-Type": "application/zip"},
+                timeout=10,
+            )
+            response.raise_for_status()
+            payload = response.json()
+            return application.Vespa(
+                url=self.db_config["url"],
+                port=self.db_config["port"],
+                deployment_message=payload.get("message"),
+                application_package=self.application_package,
+            )
+        except requests.exceptions.RequestException as exc:
+            parts = [f"Failed to deploy Vespa application: {exc!s}"]
+            failed_response = getattr(exc, "response", None)
+            if failed_response is not None:
+                parts.append(f" - Response: {failed_response.text}")
+            raise RuntimeError("".join(parts)) from exc

vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py CHANGED Viewed

@@ -99,7 +99,7 @@ class WeaviateCloud(VectorDB):
         embeddings: Iterable[list[float]],
         metadata: list[int],
         **kwargs,
-    ) -> (int, Exception):
+    ) -> tuple[int, Exception]:
         """Insert embeddings into Weaviate"""
         assert self.client.schema.exists(self.collection_name)
         insert_count = 0

vectordb_bench/cli/cli.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import logging
-import os
 import time
 from collections.abc import Callable
 from concurrent.futures import wait
 from datetime import datetime
+from pathlib import Path
 from pprint import pformat
 from typing import (
     Annotated,
@@ -38,18 +38,17 @@ except ImportError:
     from yaml import Loader
-def click_get_defaults_from_file(ctx, param, value):
+def click_get_defaults_from_file(ctx, param, value):  # noqa: ANN001, ARG001
     if value:
-        if os.path.exists(value):
-            input_file = value
-        else:
-            input_file = os.path.join(config.CONFIG_LOCAL_DIR, value)
+        path = Path(value)
+        input_file = path if path.exists() else Path(config.CONFIG_LOCAL_DIR, path)
         try:
-            with open(input_file) as f:
-                _config: dict[str, dict[str, Any]] = load(f.read(), Loader=Loader)
+            with input_file.open() as f:
+                _config: dict[str, dict[str, Any]] = load(f.read(), Loader=Loader)  # noqa: S506
                 ctx.default_map = _config.get(ctx.command.name, {})
         except Exception as e:
-            raise click.BadParameter(f"Failed to load config file: {e}")
+            msg = f"Failed to load config file: {e}"
+            raise click.BadParameter(msg) from e
     return value
@@ -68,12 +67,16 @@ def click_parameter_decorators_from_typed_dict(
     For clarity, the key names of the TypedDict will be used to determine the type hints for the input parameters.
-    The actual function parameters are controlled by the click.option definitions. You must manually ensure these are aligned in a sensible way!
+    The actual function parameters are controlled by the click.option definitions.
+    You must manually ensure these are aligned in a sensible way!
     Example:
     ```
     class CommonTypedDict(TypedDict):
-        z: Annotated[int, click.option("--z/--no-z", is_flag=True, type=bool, help="help z", default=True, show_default=True)]
+        z: Annotated[
+            int,
+            click.option("--z/--no-z", is_flag=True, type=bool, help="help z", default=True, show_default=True)
+        ]
         name: Annotated[str, click.argument("name", required=False, default="Jeff")]
     class FooTypedDict(CommonTypedDict):
@@ -91,14 +94,16 @@ def click_parameter_decorators_from_typed_dict(
     for _, t in get_type_hints(typed_dict, include_extras=True).items():
         assert get_origin(t) is Annotated
         if len(t.__metadata__) == 1 and t.__metadata__[0].__module__ == "click.decorators":
-            # happy path -- only accept Annotated[..., Union[click.option,click.argument,...]] with no additional metadata defined (len=1)
+            # happy path -- only accept Annotated[..., Union[click.option,click.argument,...]]
+            # with no additional metadata defined (len=1)
             decorators.append(t.__metadata__[0])
         else:
             raise RuntimeError(
-                "Click-TypedDict decorator parsing must only contain root type and a click decorator like click.option. See docstring",
+                "Click-TypedDict decorator parsing must only contain root type "
+                "and a click decorator like click.option. See docstring",
             )
-    def deco(f):
+    def deco(f):  # noqa: ANN001
         for dec in reversed(decorators):
             f = dec(f)
         return f
@@ -106,7 +111,7 @@ def click_parameter_decorators_from_typed_dict(
     return deco
-def click_arg_split(ctx: click.Context, param: click.core.Option, value):
+def click_arg_split(ctx: click.Context, param: click.core.Option, value):  # noqa: ANN001, ARG001
     """Will split a comma-separated list input into an actual list.
     Args:
@@ -145,8 +150,7 @@ def parse_task_stages(
     return stages
-# ruff: noqa
-def check_custom_case_parameters(ctx: any, param: any, value: any):
+def check_custom_case_parameters(ctx: any, param: any, value: any):  # noqa: ARG001
     if ctx.params.get("case_type") == "PerformanceCustomDataset" and value is None:
         raise click.BadParameter(
             """ Custom case parameters

vectordb_bench/cli/vectordbbench.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from ..backend.clients.alloydb.cli import AlloyDBScaNN
 from ..backend.clients.aws_opensearch.cli import AWSOpenSearch
+from ..backend.clients.clickhouse.cli import Clickhouse
 from ..backend.clients.mariadb.cli import MariaDBHNSW
 from ..backend.clients.memorydb.cli import MemoryDB
 from ..backend.clients.milvus.cli import MilvusAutoIndex
@@ -9,9 +10,10 @@ from ..backend.clients.pgvector.cli import PgVectorHNSW
 from ..backend.clients.pgvectorscale.cli import PgVectorScaleDiskAnn
 from ..backend.clients.redis.cli import Redis
 from ..backend.clients.test.cli import Test
+from ..backend.clients.tidb.cli import TiDB
+from ..backend.clients.vespa.cli import Vespa
 from ..backend.clients.weaviate_cloud.cli import Weaviate
 from ..backend.clients.zilliz_cloud.cli import ZillizAutoIndex
-from ..backend.clients.tidb.cli import TiDB
 from .cli import cli
 cli.add_command(PgVectorHNSW)
@@ -29,6 +31,8 @@ cli.add_command(PgDiskAnn)
 cli.add_command(AlloyDBScaNN)
 cli.add_command(MariaDBHNSW)
 cli.add_command(TiDB)
+cli.add_command(Clickhouse)
+cli.add_command(Vespa)
 if __name__ == "__main__":

vectordb_bench/frontend/config/dbCaseConfigs.py CHANGED Viewed

@@ -1087,8 +1087,7 @@ CaseConfigParamInput_M_MariaDB = CaseConfigInput(
         "max": 200,
         "value": 6,
     },
-    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
-    == IndexType.HNSW.value,
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) == IndexType.HNSW.value,
 )
 CaseConfigParamInput_EFSearch_MariaDB = CaseConfigInput(
@@ -1100,8 +1099,7 @@ CaseConfigParamInput_EFSearch_MariaDB = CaseConfigInput(
         "max": 10000,
         "value": 20,
     },
-    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
-    == IndexType.HNSW.value,
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) == IndexType.HNSW.value,
 )
 CaseConfigParamInput_CacheSize_MariaDB = CaseConfigInput(
@@ -1111,10 +1109,9 @@ CaseConfigParamInput_CacheSize_MariaDB = CaseConfigInput(
     inputConfig={
         "min": 1048576,
         "max": (1 << 53) - 1,
-        "value": 16 * 1024 ** 3,
+        "value": 16 * 1024**3,
     },
-    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None)
-    == IndexType.HNSW.value,
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) == IndexType.HNSW.value,
 )
 CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput(
@@ -1137,6 +1134,47 @@ CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput(
 )
+# Number input for M; displayed only while the selected index type is HNSW.
+CaseConfigParamInput_M_Vespa = CaseConfigInput(
+    label=CaseConfigParamType.M,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 4,
+        "max": 64,
+        "value": 16,
+    },
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) == IndexType.HNSW.value,
+)
+# Index type selector; HNSW is the only option offered.
+CaseConfigParamInput_IndexType_Vespa = CaseConfigInput(
+    label=CaseConfigParamType.IndexType,
+    inputType=InputType.Option,
+    inputConfig={
+        "options": [
+            IndexType.HNSW.value,
+        ],
+    },
+)
+# Quantization selector: "none" or "binary".
+CaseConfigParamInput_QuantizationType_Vespa = CaseConfigInput(
+    label=CaseConfigParamType.quantizationType,
+    inputType=InputType.Option,
+    inputConfig={
+        "options": ["none", "binary"],
+    },
+)
+# Number input for EFConstruction; displayed only while the selected index type is HNSW.
+CaseConfigParamInput_EFConstruction_Vespa = CaseConfigInput(
+    label=CaseConfigParamType.EFConstruction,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 8,
+        "max": 512,
+        "value": 200,
+    },
+    # Use .get() like the M input above, so a config without an index type hides the input
+    # instead of raising KeyError.
+    isDisplayed=lambda config: config.get(CaseConfigParamType.IndexType, None) == IndexType.HNSW.value,
+)
 MilvusLoadConfig = [
     CaseConfigParamInput_IndexType,
     CaseConfigParamInput_M,
@@ -1344,6 +1382,15 @@ MariaDBPerformanceConfig = [
     CaseConfigParamInput_EFSearch_MariaDB,
 ]
+# Parameter inputs offered for Vespa load cases.
+VespaLoadingConfig = [
+    CaseConfigParamInput_IndexType_Vespa,
+    CaseConfigParamInput_QuantizationType_Vespa,
+    CaseConfigParamInput_M_Vespa,
+    # NOTE(review): ef reuses the Milvus-named input rather than a Vespa-specific one —
+    # confirm its range and default suit Vespa.
+    CaseConfigParamInput_EF_Milvus,
+    CaseConfigParamInput_EFConstruction_Vespa,
+]
+# Performance cases offer the same inputs (this is the same list object, not a copy).
+VespaPerformanceConfig = VespaLoadingConfig
 CASE_CONFIG_MAP = {
     DB.Milvus: {
         CaseLabel.Load: MilvusLoadConfig,
@@ -1400,4 +1447,8 @@ CASE_CONFIG_MAP = {
         CaseLabel.Load: MariaDBLoadingConfig,
         CaseLabel.Performance: MariaDBPerformanceConfig,
     },
+    DB.Vespa: {
+        CaseLabel.Load: VespaLoadingConfig,
+        CaseLabel.Performance: VespaPerformanceConfig,
+    },
 }

vectordb_bench/frontend/config/styles.py CHANGED Viewed

@@ -48,6 +48,7 @@ DB_TO_ICON = {
     DB.Chroma: "https://assets.zilliz.com/chroma_ceb3f06ed7.png",
     DB.AWSOpenSearch: "https://assets.zilliz.com/opensearch_1eee37584e.jpeg",
     DB.TiDB: "https://img2.pingcap.com/forms/3/d/3d7fd5f9767323d6f037795704211ac44b4923d6.png",
+    DB.Vespa: "https://vespa.ai/vespa-content/uploads/2025/01/Vespa-symbol-green-rgb.png.webp",
 }
 # RedisCloud color: #0D6EFD
@@ -63,4 +64,5 @@ COLOR_MAP = {
     DB.Redis.value: "#0D6EFD",
     DB.AWSOpenSearch.value: "#0DCAF0",
     DB.TiDB.value: "#0D6EFD",
+    DB.Vespa.value: "#61d790",
 }

vectordb_bench/models.py CHANGED Viewed

@@ -263,7 +263,6 @@ class TestResult(BaseModel):
                     )
             return TestResult.validate(test_result)
-    # ruff: noqa
     def display(self, dbs: list[DB] | None = None):
         filter_list = dbs if dbs and isinstance(dbs, list) else None
         sorted_results = sorted(
@@ -294,7 +293,7 @@ class TestResult(BaseModel):
         max_qps = 10 if max_qps < 10 else max_qps
         max_recall = 13 if max_recall < 13 else max_recall
-        LENGTH = (
+        LENGTH = (  # noqa: N806
             max_db,
             max_db_labels,
             max_case,
@@ -307,13 +306,13 @@ class TestResult(BaseModel):
             5,
         )
-        DATA_FORMAT = (
+        DATA_FORMAT = (  # noqa: N806
             f"%-{max_db}s | %-{max_db_labels}s %-{max_case}s %-{len(self.task_label)}s"
             f" | %-{max_load_dur}s %-{max_qps}s %-15s %-{max_recall}s %-14s"
             f" | %-5s"
         )
-        TITLE = DATA_FORMAT % (
+        TITLE = DATA_FORMAT % (  # noqa: N806
             "DB",
             "db_label",
             "case",
@@ -325,8 +324,8 @@ class TestResult(BaseModel):
             "max_load_count",
             "label",
         )
-        SPLIT = DATA_FORMAT % tuple(map(lambda x: "-" * x, LENGTH))
-        SUMMARY_FORMAT = ("Task summary: run_id=%s, task_label=%s") % (
+        SPLIT = DATA_FORMAT % tuple(map(lambda x: "-" * x, LENGTH))  # noqa: C417, N806
+        SUMMARY_FORMAT = ("Task summary: run_id=%s, task_label=%s") % (  # noqa: N806
             self.run_id[:5],
             self.task_label,
         )

{vectordb_bench-0.0.23.dist-info → vectordb_bench-0.0.25.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: vectordb-bench
-Version: 0.0.23
+Version: 0.0.25
 Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
 Author-email: XuanYang-cn <xuan.yang@zilliz.com>
 Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -13,7 +13,7 @@ License-File: LICENSE
 Requires-Dist: click
 Requires-Dist: pytz
 Requires-Dist: streamlit-autorefresh
-Requires-Dist: streamlit!=1.34.0
+Requires-Dist: streamlit!=1.34.0,<1.44
 Requires-Dist: streamlit_extras
 Requires-Dist: tqdm
 Requires-Dist: s3fs
@@ -50,6 +50,8 @@ Requires-Dist: alibabacloud_ha3engine_vector; extra == "all"
 Requires-Dist: alibabacloud_searchengine20211025; extra == "all"
 Requires-Dist: mariadb; extra == "all"
 Requires-Dist: PyMySQL; extra == "all"
+Requires-Dist: clickhouse-connect; extra == "all"
+Requires-Dist: pyvespa; extra == "all"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
 Provides-Extra: pinecone
@@ -81,6 +83,11 @@ Provides-Extra: mariadb
 Requires-Dist: mariadb; extra == "mariadb"
 Provides-Extra: tidb
 Requires-Dist: PyMySQL; extra == "tidb"
+Provides-Extra: clickhouse
+Requires-Dist: clickhouse-connect; extra == "clickhouse"
+Provides-Extra: vespa
+Requires-Dist: pyvespa; extra == "vespa"
+Dynamic: license-file
 # VectorDBBench: A Benchmark Tool for VectorDB
@@ -140,6 +147,7 @@ All the database client supported
 | aliyun_opensearch        | `pip install vectordb-bench[aliyun_opensearch]` |
 | mongodb                  | `pip install vectordb-bench[mongodb]`       |
 | tidb                     | `pip install vectordb-bench[tidb]`          |
+| vespa                    | `pip install vectordb-bench[vespa]`         |
 ### Run

vectordb-bench 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

vectordb-bench 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl