vectordb-bench 1.0.4__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- vectordb_bench/__init__.py +1 -0
- vectordb_bench/backend/cases.py +45 -1
- vectordb_bench/backend/clients/__init__.py +47 -0
- vectordb_bench/backend/clients/api.py +2 -0
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +104 -40
- vectordb_bench/backend/clients/aws_opensearch/cli.py +52 -15
- vectordb_bench/backend/clients/aws_opensearch/config.py +27 -7
- vectordb_bench/backend/clients/hologres/cli.py +50 -0
- vectordb_bench/backend/clients/hologres/config.py +121 -0
- vectordb_bench/backend/clients/hologres/hologres.py +365 -0
- vectordb_bench/backend/clients/lancedb/lancedb.py +1 -0
- vectordb_bench/backend/clients/milvus/cli.py +29 -9
- vectordb_bench/backend/clients/milvus/config.py +2 -0
- vectordb_bench/backend/clients/milvus/milvus.py +1 -1
- vectordb_bench/backend/clients/oceanbase/cli.py +1 -0
- vectordb_bench/backend/clients/oceanbase/config.py +3 -1
- vectordb_bench/backend/clients/oceanbase/oceanbase.py +20 -4
- vectordb_bench/backend/clients/oss_opensearch/cli.py +155 -0
- vectordb_bench/backend/clients/oss_opensearch/config.py +157 -0
- vectordb_bench/backend/clients/oss_opensearch/oss_opensearch.py +582 -0
- vectordb_bench/backend/clients/oss_opensearch/run.py +166 -0
- vectordb_bench/backend/clients/pgdiskann/cli.py +45 -0
- vectordb_bench/backend/clients/pgdiskann/config.py +16 -0
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +94 -26
- vectordb_bench/backend/clients/s3_vectors/config.py +41 -0
- vectordb_bench/backend/clients/s3_vectors/s3_vectors.py +171 -0
- vectordb_bench/backend/clients/tidb/cli.py +0 -4
- vectordb_bench/backend/clients/tidb/config.py +22 -2
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -1
- vectordb_bench/backend/clients/zilliz_cloud/config.py +4 -1
- vectordb_bench/backend/dataset.py +70 -0
- vectordb_bench/backend/filter.py +17 -0
- vectordb_bench/backend/runner/mp_runner.py +4 -0
- vectordb_bench/backend/runner/rate_runner.py +23 -11
- vectordb_bench/backend/runner/read_write_runner.py +10 -9
- vectordb_bench/backend/runner/serial_runner.py +23 -7
- vectordb_bench/backend/task_runner.py +5 -4
- vectordb_bench/cli/cli.py +36 -0
- vectordb_bench/cli/vectordbbench.py +4 -0
- vectordb_bench/fig/custom_case_run_test.png +0 -0
- vectordb_bench/fig/custom_dataset.png +0 -0
- vectordb_bench/fig/homepage/bar-chart.png +0 -0
- vectordb_bench/fig/homepage/concurrent.png +0 -0
- vectordb_bench/fig/homepage/custom.png +0 -0
- vectordb_bench/fig/homepage/label_filter.png +0 -0
- vectordb_bench/fig/homepage/qp$.png +0 -0
- vectordb_bench/fig/homepage/run_test.png +0 -0
- vectordb_bench/fig/homepage/streaming.png +0 -0
- vectordb_bench/fig/homepage/table.png +0 -0
- vectordb_bench/fig/run_test_select_case.png +0 -0
- vectordb_bench/fig/run_test_select_db.png +0 -0
- vectordb_bench/fig/run_test_submit.png +0 -0
- vectordb_bench/frontend/components/check_results/filters.py +1 -4
- vectordb_bench/frontend/components/check_results/nav.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +5 -0
- vectordb_bench/frontend/components/int_filter/charts.py +60 -0
- vectordb_bench/frontend/components/streaming/data.py +7 -0
- vectordb_bench/frontend/components/welcome/welcomePrams.py +42 -4
- vectordb_bench/frontend/config/dbCaseConfigs.py +142 -16
- vectordb_bench/frontend/config/styles.py +4 -0
- vectordb_bench/frontend/pages/concurrent.py +1 -1
- vectordb_bench/frontend/pages/custom.py +1 -1
- vectordb_bench/frontend/pages/int_filter.py +56 -0
- vectordb_bench/frontend/pages/streaming.py +16 -3
- vectordb_bench/interface.py +5 -1
- vectordb_bench/metric.py +7 -0
- vectordb_bench/models.py +39 -4
- vectordb_bench/results/S3Vectors/result_20250722_standard_s3vectors.json +2509 -0
- vectordb_bench/results/getLeaderboardDataV2.py +23 -2
- vectordb_bench/results/leaderboard_v2.json +200 -0
- vectordb_bench/results/leaderboard_v2_streaming.json +128 -0
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/METADATA +40 -8
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/RECORD +77 -51
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/WHEEL +0 -0
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {vectordb_bench-1.0.4.dist-info → vectordb_bench-1.0.7.dist-info}/top_level.txt +0 -0

vectordb_bench/backend/clients/oss_opensearch/run.py (new file)
@@ -0,0 +1,166 @@
+import logging
+import random
+import time
+
+from opensearchpy import OpenSearch
+
+log = logging.getLogger(__name__)
+
+_HOST = "xxxxxx.us-west-2.es.amazonaws.com"
+_PORT = 443
+_AUTH = ("admin", "xxxxxx")  # For testing only. Don't store credentials in code.
+
+_INDEX_NAME = "my-dsl-index"
+_BATCH = 100
+_ROWS = 100
+_DIM = 128
+_TOPK = 10
+
+
+def create_client():
+    return OpenSearch(
+        hosts=[{"host": _HOST, "port": _PORT}],
+        http_compress=True,  # enables gzip compression for request bodies
+        http_auth=_AUTH,
+        use_ssl=True,
+        verify_certs=True,
+        ssl_assert_hostname=False,
+        ssl_show_warn=False,
+    )
+
+
+def create_index(client: OpenSearch, index_name: str):
+    settings = {
+        "index": {
+            "knn": True,
+            "number_of_shards": 1,
+            "refresh_interval": "5s",
+        },
+    }
+    mappings = {
+        "properties": {
+            "embedding": {
+                "type": "knn_vector",
+                "dimension": _DIM,
+                "method": {
+                    "engine": "faiss",
+                    "name": "hnsw",
+                    "space_type": "l2",
+                    "parameters": {
+                        "ef_construction": 256,
+                        "m": 16,
+                    },
+                },
+            },
+        },
+    }
+
+    response = client.indices.create(
+        index=index_name,
+        body={"settings": settings, "mappings": mappings},
+    )
+    log.info("\nCreating index:")
+    log.info(response)
+
+
+def delete_index(client: OpenSearch, index_name: str):
+    response = client.indices.delete(index=index_name)
+    log.info("\nDeleting index:")
+    log.info(response)
+
+
+def bulk_insert(client: OpenSearch, index_name: str):
+    # Perform bulk operations
+    ids = list(range(_ROWS))
+    vec = [[random.random() for _ in range(_DIM)] for _ in range(_ROWS)]
+
+    docs = []
+    for i in range(0, _ROWS, _BATCH):
+        docs.clear()
+        for j in range(_BATCH):
+            docs.append({"index": {"_index": index_name, "_id": ids[i + j]}})
+            docs.append({"embedding": vec[i + j]})
+        response = client.bulk(docs)
+        log.info(f"Adding documents: {len(response['items'])}, {response['errors']}")
+        response = client.indices.stats(index_name)
+        log.info(
+            f'Total document count in index: { response["_all"]["primaries"]["indexing"]["index_total"] }',
+        )
+
+
+def search(client: OpenSearch, index_name: str):
+    # Search for the document.
+    search_body = {
+        "size": _TOPK,
+        "query": {
+            "knn": {
+                "embedding": {
+                    "vector": [random.random() for _ in range(_DIM)],
+                    "k": _TOPK,
+                },
+            },
+        },
+    }
+    while True:
+        response = client.search(index=index_name, body=search_body)
+        log.info(f'\nSearch took: {response["took"]}')
+        log.info(f'\nSearch shards: {response["_shards"]}')
+        log.info(f'\nSearch hits total: {response["hits"]["total"]}')
+        result = response["hits"]["hits"]
+        if len(result) != 0:
+            log.info("\nSearch results:")
+            for hit in response["hits"]["hits"]:
+                log.info(hit["_id"], hit["_score"])
+            break
+        log.info("\nSearch not ready, sleep 1s")
+        time.sleep(1)
+
+
+SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
+WAITINT_FOR_REFRESH_SEC = 30
+
+
+def optimize_index(client: OpenSearch, index_name: str):
+    log.info(f"Starting force merge for index {index_name}")
+    force_merge_endpoint = f"/{index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false"
+    force_merge_task_id = client.transport.perform_request("POST", force_merge_endpoint)["task"]
+    while True:
+        time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
+        task_status = client.tasks.get(task_id=force_merge_task_id)
+        if task_status["completed"]:
+            break
+    log.info(f"Completed force merge for index {index_name}")
+
+
+def refresh_index(client: OpenSearch, index_name: str):
+    log.info(f"Starting refresh for index {index_name}")
+    while True:
+        try:
+            log.info("Starting the Refresh Index..")
+            client.indices.refresh(index=index_name)
+            break
+        except Exception as e:
+            log.info(
+                f"Refresh errored out. Sleeping for {WAITINT_FOR_REFRESH_SEC} sec and then Retrying : {e}",
+            )
+            time.sleep(WAITINT_FOR_REFRESH_SEC)
+            continue
+    log.info(f"Completed refresh for index {index_name}")
+
+
+def main():
+    client = create_client()
+    try:
+        create_index(client, _INDEX_NAME)
+        bulk_insert(client, _INDEX_NAME)
+        optimize_index(client, _INDEX_NAME)
+        refresh_index(client, _INDEX_NAME)
+        search(client, _INDEX_NAME)
+        delete_index(client, _INDEX_NAME)
+    except Exception as e:
+        log.info(e)
+        delete_index(client, _INDEX_NAME)
+
+
+if __name__ == "__main__":
+    main()
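
The script above drives a full index lifecycle: create, bulk insert, force merge, refresh, search, delete. Its bulk_insert() relies on OpenSearch's pairwise bulk format, where every document contributes an action entry followed by a source entry. A minimal sketch of that payload shape (index name and dimension are illustrative, not taken from the diff):

    import random

    # Each document becomes two entries: an action line, then a source line.
    actions = []
    for doc_id in range(2):
        vec = [random.random() for _ in range(128)]
        actions.append({"index": {"_index": "my-dsl-index", "_id": doc_id}})  # action
        actions.append({"embedding": vec})  # source
    # client.bulk(actions) ships both documents in one request;
    # response["errors"] is False when every item succeeded.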

vectordb_bench/backend/clients/pgdiskann/cli.py
@@ -5,6 +5,7 @@ import click
 from pydantic import SecretStr
 
 from vectordb_bench.backend.clients import DB
+from vectordb_bench.backend.clients.api import MetricType
 
 from ....cli.cli import (
     CommonTypedDict,
@@ -48,6 +49,15 @@ class PgDiskAnnTypedDict(CommonTypedDict):
             help="PgDiskAnn l_value_ib",
         ),
     ]
+    pq_param_num_chunks: Annotated[
+        int,
+        click.option(
+            "--pq-param-num-chunks",
+            type=int,
+            help="PgDiskAnn pq_param_num_chunks",
+            required=False,
+        ),
+    ]
     l_value_is: Annotated[
         float,
         click.option(
@@ -56,6 +66,37 @@ class PgDiskAnnTypedDict(CommonTypedDict):
             help="PgDiskAnn l_value_is",
         ),
     ]
+    reranking: Annotated[
+        bool | None,
+        click.option(
+            "--reranking/--skip-reranking",
+            type=bool,
+            help="Enable reranking for PQ search",
+            default=False,
+        ),
+    ]
+    reranking_metric: Annotated[
+        str | None,
+        click.option(
+            "--reranking-metric",
+            type=click.Choice(
+                [metric.value for metric in MetricType if metric.value not in ["HAMMING", "JACCARD", "DP"]],
+            ),
+            help="Distance metric for reranking",
+            default="COSINE",
+            show_default=True,
+            required=False,
+        ),
+    ]
+    quantized_fetch_limit: Annotated[
+        int | None,
+        click.option(
+            "--quantized-fetch-limit",
+            type=int,
+            help="Limit of inner query in case of reranking",
+            required=False,
+        ),
+    ]
     maintenance_work_mem: Annotated[
         str | None,
         click.option(
@@ -98,7 +139,11 @@ def PgDiskAnn(
         db_case_config=PgDiskANNImplConfig(
            max_neighbors=parameters["max_neighbors"],
            l_value_ib=parameters["l_value_ib"],
+           pq_param_num_chunks=parameters["pq_param_num_chunks"],
            l_value_is=parameters["l_value_is"],
+           reranking=parameters["reranking"],
+           reranking_metric=parameters["reranking_metric"],
+           quantized_fetch_limit=parameters["quantized_fetch_limit"],
            max_parallel_workers=parameters["max_parallel_workers"],
            maintenance_work_mem=parameters["maintenance_work_mem"],
         ),

vectordb_bench/backend/clients/pgdiskann/config.py
@@ -60,6 +60,13 @@ class PgDiskANNIndexConfig(BaseModel, DBCaseConfig):
             return "<#>"
         return "<=>"
 
+    def parse_reranking_metric_fun_op(self) -> LiteralString:
+        if self.reranking_metric == MetricType.L2:
+            return "<->"
+        if self.reranking_metric == MetricType.IP:
+            return "<#>"
+        return "<=>"
+
     def parse_metric_fun_str(self) -> str:
         if self.metric_type == MetricType.L2:
             return "l2_distance"
@@ -115,7 +122,11 @@ class PgDiskANNImplConfig(PgDiskANNIndexConfig):
     index: IndexType = IndexType.DISKANN
     max_neighbors: int | None
     l_value_ib: int | None
+    pq_param_num_chunks: int | None
     l_value_is: float | None
+    reranking: bool | None = None
+    reranking_metric: str | None = None
+    quantized_fetch_limit: int | None = None
     maintenance_work_mem: str | None = None
     max_parallel_workers: int | None = None
 
@@ -126,6 +137,8 @@ class PgDiskANNImplConfig(PgDiskANNIndexConfig):
             "options": {
                 "max_neighbors": self.max_neighbors,
                 "l_value_ib": self.l_value_ib,
+                "pq_param_num_chunks": self.pq_param_num_chunks,
+                "product_quantized": str(self.reranking),
             },
             "maintenance_work_mem": self.maintenance_work_mem,
             "max_parallel_workers": self.max_parallel_workers,
@@ -135,6 +148,9 @@ class PgDiskANNImplConfig(PgDiskANNIndexConfig):
         return {
             "metric": self.parse_metric(),
             "metric_fun_op": self.parse_metric_fun_op(),
+            "reranking": self.reranking,
+            "reranking_metric_fun_op": self.parse_reranking_metric_fun_op(),
+            "quantized_fetch_limit": self.quantized_fetch_limit,
         }
 
     def session_param(self) -> dict:
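
Taken together, the new pq_param_num_chunks, reranking, reranking_metric, and quantized_fetch_limit fields let a run enable product-quantization reranking from config alone; on the CLI side, the --reranking/--skip-reranking pair is an on/off switch defaulting to off. A minimal sketch of how the knobs surface through search_param(); the field names come from the hunks above, while metric_type is assumed to be inherited from PgDiskANNIndexConfig (it is referenced by parse_metric_fun_op):

    from vectordb_bench.backend.clients.api import MetricType
    from vectordb_bench.backend.clients.pgdiskann.config import PgDiskANNImplConfig

    cfg = PgDiskANNImplConfig(
        metric_type=MetricType.L2,  # base ANN ordering, maps to "<->"
        max_neighbors=32,
        l_value_ib=100,
        pq_param_num_chunks=64,     # new: number of PQ chunks
        l_value_is=100.0,
        reranking=True,             # new: re-rank quantized candidates
        reranking_metric="COSINE",  # new: falls through to "<=>" above
        quantized_fetch_limit=200,  # new: candidate pool for the inner query
    )
    print(cfg.search_param())
    # expected keys: metric, metric_fun_op, reranking,
    # reranking_metric_fun_op, quantized_fetch_limit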

vectordb_bench/backend/clients/pgdiskann/pgdiskann.py
@@ -90,38 +90,83 @@ class PgDiskANN(VectorDB):
     def init(self) -> Generator[None, None, None]:
         self.conn, self.cursor = self._create_connection(**self.db_config)
 
-        # index configuration may have commands defined that we should set during each client session
         session_options: dict[str, Any] = self.case_config.session_param()
 
         if len(session_options) > 0:
             for setting_name, setting_val in session_options.items():
-                command = sql.SQL("SET {setting_name}
-                    setting_name=sql.Identifier(setting_name),
-                    setting_val=sql.Identifier(str(setting_val)),
+                command = sql.SQL("SET {setting_name} = {setting_val};").format(
+                    setting_name=sql.Identifier(setting_name), setting_val=sql.Literal(setting_val)
                 )
                 log.debug(command.as_string(self.cursor))
                 self.cursor.execute(command)
             self.conn.commit()
 
-
-
-
-
-
-
-
-
-
+        search_params = self.case_config.search_param()
+
+        if search_params.get("reranking"):
+            # Reranking-enabled queries
+            self._filtered_search = sql.SQL(
+                """
+                SELECT i.id
+                FROM (
+                    SELECT id, embedding
+                    FROM public.{table_name}
+                    WHERE id >= %s
+                    ORDER BY embedding {metric_fun_op} %s::vector
+                    LIMIT {quantized_fetch_limit}::int
+                ) i
+                ORDER BY i.embedding {reranking_metric_fun_op} %s::vector
+                LIMIT %s::int
+                """
+            ).format(
+                table_name=sql.Identifier(self.table_name),
+                metric_fun_op=sql.SQL(search_params["metric_fun_op"]),
+                reranking_metric_fun_op=sql.SQL(search_params["reranking_metric_fun_op"]),
+                quantized_fetch_limit=sql.Literal(search_params["quantized_fetch_limit"]),
+            )
 
-
-
-
-
-
-
-
-
-
+            self._unfiltered_search = sql.SQL(
+                """
+                SELECT i.id
+                FROM (
+                    SELECT id, embedding
+                    FROM public.{table_name}
+                    ORDER BY embedding {metric_fun_op} %s::vector
+                    LIMIT {quantized_fetch_limit}::int
+                ) i
+                ORDER BY i.embedding {reranking_metric_fun_op} %s::vector
+                LIMIT %s::int
+                """
+            ).format(
+                table_name=sql.Identifier(self.table_name),
+                metric_fun_op=sql.SQL(search_params["metric_fun_op"]),
+                reranking_metric_fun_op=sql.SQL(search_params["reranking_metric_fun_op"]),
+                quantized_fetch_limit=sql.Literal(search_params["quantized_fetch_limit"]),
+            )
+
+        else:
+            self._filtered_search = sql.Composed(
+                [
+                    sql.SQL(
+                        "SELECT id FROM public.{table_name} WHERE id >= %s ORDER BY embedding ",
+                    ).format(table_name=sql.Identifier(self.table_name)),
+                    sql.SQL(search_params["metric_fun_op"]),
+                    sql.SQL(" %s::vector LIMIT %s::int"),
+                ]
+            )
+
+            self._unfiltered_search = sql.Composed(
+                [
+                    sql.SQL("SELECT id FROM public.{table_name} ORDER BY embedding ").format(
+                        table_name=sql.Identifier(self.table_name)
+                    ),
+                    sql.SQL(search_params["metric_fun_op"]),
+                    sql.SQL(" %s::vector LIMIT %s::int"),
+                ]
+            )
+
+        log.debug(f"Unfiltered search query={self._unfiltered_search.as_string(self.conn)}")
+        log.debug(f"Filtered search query={self._filtered_search.as_string(self.conn)}")
 
         try:
             yield

@@ -234,7 +279,7 @@ class PgDiskANN(VectorDB):
             options.append(
                 sql.SQL("{option_name} = {val}").format(
                     option_name=sql.Identifier(option_name),
-                    val=sql.
+                    val=sql.Literal(option_val),
                 ),
             )
 

@@ -314,16 +359,39 @@
         assert self.conn is not None, "Connection is not initialized"
         assert self.cursor is not None, "Cursor is not initialized"
 
+        search_params = self.case_config.search_param()
+        is_reranking = search_params.get("reranking", False)
+
         q = np.asarray(query)
         if filters:
             gt = filters.get("id")
+            if is_reranking:
+                result = self.cursor.execute(
+                    self._filtered_search,
+                    (gt, q, q, k),
+                    prepare=True,
+                    binary=True,
+                )
+            else:
+                result = self.cursor.execute(
+                    self._filtered_search,
+                    (gt, q, k),
+                    prepare=True,
+                    binary=True,
+                )
+        elif is_reranking:
             result = self.cursor.execute(
-                self.
-                (
+                self._unfiltered_search,
+                (q, q, k),
                 prepare=True,
                 binary=True,
             )
         else:
-            result = self.cursor.execute(
+            result = self.cursor.execute(
+                self._unfiltered_search,
+                (q, k),
+                prepare=True,
+                binary=True,
+            )
 
         return [int(i[0]) for i in result.fetchall()]
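
Note the tuple arity in the branches above: with reranking on, the query vector is bound twice, once for the quantized inner ORDER BY and once for the exact outer re-rank. A placeholder sketch of the four parameter shapes:

    gt, q, k = 0, [0.0] * 128, 10  # placeholder filter id, query vector, top-k

    filtered_rerank = (gt, q, q, k)    # id bound + inner vector + rerank vector + limit
    filtered_plain = (gt, q, k)
    unfiltered_rerank = (q, q, k)
    unfiltered_plain = (q, k)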

vectordb_bench/backend/clients/s3_vectors/config.py (new file)
@@ -0,0 +1,41 @@
+from pydantic import BaseModel, SecretStr
+
+from ..api import DBCaseConfig, DBConfig, MetricType
+
+
+class S3VectorsConfig(DBConfig):
+    region_name: str = "us-west-2"
+    access_key_id: SecretStr
+    secret_access_key: SecretStr
+    bucket_name: str
+    index_name: str = "vdbbench-index"
+
+    def to_dict(self) -> dict:
+        return {
+            "region_name": self.region_name,
+            "access_key_id": self.access_key_id.get_secret_value() if self.access_key_id else "",
+            "secret_access_key": self.secret_access_key.get_secret_value() if self.secret_access_key else "",
+            "bucket_name": self.bucket_name,
+            "index_name": self.index_name,
+        }
+
+
+class S3VectorsIndexConfig(DBCaseConfig, BaseModel):
+    """Base config for s3-vectors"""
+
+    metric_type: MetricType | None = None
+    data_type: str = "float32"
+
+    def parse_metric(self) -> str:
+        if self.metric_type == MetricType.COSINE:
+            return "cosine"
+        if self.metric_type == MetricType.L2:
+            return "euclidean"
+        msg = f"Unsupported metric type: {self.metric_type}"
+        raise ValueError(msg)
+
+    def index_param(self) -> dict:
+        return {}
+
+    def search_param(self) -> dict:
+        return {}
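
A minimal sketch of filling in the new config, assuming DBConfig adds no further required fields here; the credentials are placeholders:

    from pydantic import SecretStr

    from vectordb_bench.backend.clients.api import MetricType
    from vectordb_bench.backend.clients.s3_vectors.config import (
        S3VectorsConfig,
        S3VectorsIndexConfig,
    )

    db_config = S3VectorsConfig(
        access_key_id=SecretStr("EXAMPLE-KEY-ID"),
        secret_access_key=SecretStr("EXAMPLE-SECRET"),
        bucket_name="my-vector-bucket",
    ).to_dict()  # secrets are unwrapped only at this boundary

    case_config = S3VectorsIndexConfig(metric_type=MetricType.COSINE)
    assert case_config.parse_metric() == "cosine"  # L2 maps to "euclidean"; anything else raises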

vectordb_bench/backend/clients/s3_vectors/s3_vectors.py (new file)
@@ -0,0 +1,171 @@
+"""Wrapper around the Milvus vector database over VectorDB"""
+
+import logging
+from collections.abc import Iterable
+from contextlib import contextmanager
+
+import boto3
+
+from vectordb_bench.backend.filter import Filter, FilterOp
+
+from ..api import VectorDB
+from .config import S3VectorsIndexConfig
+
+log = logging.getLogger(__name__)
+
+
+class S3Vectors(VectorDB):
+    supported_filter_types: list[FilterOp] = [
+        FilterOp.NonFilter,
+        FilterOp.NumGE,
+        FilterOp.StrEqual,
+    ]
+
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: S3VectorsIndexConfig,
+        drop_old: bool = False,
+        with_scalar_labels: bool = False,
+        **kwargs,
+    ):
+        """Initialize wrapper around the s3-vectors client."""
+        self.db_config = db_config
+        self.case_config = db_case_config
+        self.with_scalar_labels = with_scalar_labels
+
+        self.batch_size = 500
+
+        self._scalar_id_field = "id"
+        self._scalar_label_field = "label"
+        self._vector_field = "vector"
+
+        self.region_name = self.db_config.get("region_name")
+        self.access_key_id = self.db_config.get("access_key_id")
+        self.secret_access_key = self.db_config.get("secret_access_key")
+        self.bucket_name = self.db_config.get("bucket_name")
+        self.index_name = self.db_config.get("index_name")
+
+        client = boto3.client(
+            service_name="s3vectors",
+            region_name=self.region_name,
+            aws_access_key_id=self.access_key_id,
+            aws_secret_access_key=self.secret_access_key,
+        )
+
+        if drop_old:
+            # delete old index if exists
+            response = client.list_indexes(vectorBucketName=self.bucket_name)
+            index_names = [index["indexName"] for index in response["indexes"]]
+            if self.index_name in index_names:
+                log.info(f"drop old index: {self.index_name}")
+                client.delete_index(vectorBucketName=self.bucket_name, indexName=self.index_name)
+
+            # create the index
+            client.create_index(
+                vectorBucketName=self.bucket_name,
+                indexName=self.index_name,
+                dataType=self.case_config.data_type,
+                dimension=dim,
+                distanceMetric=self.case_config.parse_metric(),
+            )
+
+        client.close()
+
+    @contextmanager
+    def init(self):
+        """
+        Examples:
+            >>> with self.init():
+            >>>     self.insert_embeddings()
+            >>>     self.search_embedding()
+        """
+        self.client = boto3.client(
+            service_name="s3vectors",
+            region_name=self.region_name,
+            aws_access_key_id=self.access_key_id,
+            aws_secret_access_key=self.secret_access_key,
+        )
+
+        yield
+        self.client.close()
+
+    def optimize(self, **kwargs):
+        return
+
+    def need_normalize_cosine(self) -> bool:
+        """Wheather this database need to normalize dataset to support COSINE"""
+        return False
+
+    def insert_embeddings(
+        self,
+        embeddings: Iterable[list[float]],
+        metadata: list[int],
+        labels_data: list[str] | None = None,
+        **kwargs,
+    ) -> tuple[int, Exception]:
+        """Insert embeddings into s3-vectors. should call self.init() first"""
+        # use the first insert_embeddings to init collection
+        assert self.client is not None
+        assert len(embeddings) == len(metadata)
+        insert_count = 0
+        try:
+            for batch_start_offset in range(0, len(embeddings), self.batch_size):
+                batch_end_offset = min(batch_start_offset + self.batch_size, len(embeddings))
+                insert_data = [
+                    {
+                        "key": str(metadata[i]),
+                        "data": {self.case_config.data_type: embeddings[i]},
+                        "metadata": (
+                            {self._scalar_label_field: labels_data[i], self._scalar_id_field: metadata[i]}
+                            if self.with_scalar_labels
+                            else {self._scalar_id_field: metadata[i]}
+                        ),
+                    }
+                    for i in range(batch_start_offset, batch_end_offset)
+                ]
+                self.client.put_vectors(
+                    vectorBucketName=self.bucket_name,
+                    indexName=self.index_name,
+                    vectors=insert_data,
+                )
+                insert_count += len(insert_data)
+        except Exception as e:
+            log.info(f"Failed to insert data: {e}")
+            return insert_count, e
+        return insert_count, None
+
+    def prepare_filter(self, filters: Filter):
+        if filters.type == FilterOp.NonFilter:
+            self.filter = None
+        elif filters.type == FilterOp.NumGE:
+            self.filter = {self._scalar_id_field: {"$gte": filters.int_value}}
+        elif filters.type == FilterOp.StrEqual:
+            self.filter = {self._scalar_label_field: filters.label_value}
+        else:
+            msg = f"Not support Filter for S3Vectors - {filters}"
+            raise ValueError(msg)
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        timeout: int | None = None,
+    ) -> list[int]:
+        """Perform a search on a query embedding and return results."""
+        assert self.client is not None
+
+        # Perform the search.
+        res = self.client.query_vectors(
+            vectorBucketName=self.bucket_name,
+            indexName=self.index_name,
+            queryVector={"float32": query},
+            topK=k,
+            filter=self.filter,
+            returnDistance=False,
+            returnMetadata=False,
+        )
+
+        # Organize results.
+        return [int(result["key"]) for result in res["vectors"]]
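
End to end, the new wrapper can be driven directly. Below is a sketch assuming valid AWS credentials and an existing S3 vector bucket (all names are placeholders). Note that search_embedding() reads self.filter unconditionally, so prepare_filter() must run first; this sketch assigns the attribute directly with the value prepare_filter() would set for FilterOp.NonFilter:

    import random

    from vectordb_bench.backend.clients.api import MetricType
    from vectordb_bench.backend.clients.s3_vectors.config import S3VectorsIndexConfig
    from vectordb_bench.backend.clients.s3_vectors.s3_vectors import S3Vectors

    db = S3Vectors(
        dim=128,
        db_config={
            "region_name": "us-west-2",
            "access_key_id": "EXAMPLE-KEY-ID",
            "secret_access_key": "EXAMPLE-SECRET",
            "bucket_name": "my-vector-bucket",
            "index_name": "vdbbench-index",
        },
        db_case_config=S3VectorsIndexConfig(metric_type=MetricType.COSINE),
        drop_old=True,
    )

    with db.init():
        vectors = [[random.random() for _ in range(128)] for _ in range(10)]
        count, err = db.insert_embeddings(vectors, metadata=list(range(10)))
        db.filter = None  # what prepare_filter() sets for FilterOp.NonFilter
        ids = db.search_embedding([random.random() for _ in range(128)], k=5)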

vectordb_bench/backend/clients/tidb/cli.py
@@ -17,7 +17,6 @@ class TiDBTypedDict(CommonTypedDict):
             help="Username",
             default="root",
             show_default=True,
-            required=True,
         ),
     ]
     password: Annotated[
@@ -37,7 +36,6 @@ class TiDBTypedDict(CommonTypedDict):
             type=str,
             default="127.0.0.1",
             show_default=True,
-            required=True,
             help="Db host",
         ),
     ]
@@ -48,7 +46,6 @@ class TiDBTypedDict(CommonTypedDict):
             type=int,
             default=4000,
             show_default=True,
-            required=True,
             help="Db Port",
         ),
     ]
@@ -59,7 +56,6 @@ class TiDBTypedDict(CommonTypedDict):
             type=str,
             default="test",
             show_default=True,
-            required=True,
            help="Db name",
         ),
     ]