vectordb-bench 0.0.20__tar.gz → 0.0.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.gitignore +3 -1
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/PKG-INFO +55 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/README.md +51 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/install/requirements_py3.11.txt +1 -1
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/install.py +2 -1
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/pyproject.toml +3 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/assembler.py +2 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/__init__.py +28 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +1 -7
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/alloydb/alloydb.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/api.py +8 -15
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +54 -8
- vectordb_bench-0.0.22/vectordb_bench/backend/clients/aws_opensearch/cli.py +125 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aws_opensearch/config.py +10 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/chroma/chroma.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/memorydb/cli.py +2 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/memorydb/memorydb.py +2 -5
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/milvus/milvus.py +1 -20
- vectordb_bench-0.0.22/vectordb_bench/backend/clients/mongodb/config.py +53 -0
- vectordb_bench-0.0.22/vectordb_bench/backend/clients/mongodb/mongodb.py +200 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +3 -11
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvector/pgvector.py +2 -7
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +2 -7
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pinecone/pinecone.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +3 -6
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/redis/redis.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/test/cli.py +1 -1
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/test/test.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +1 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/data_source.py +4 -12
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/mp_runner.py +16 -34
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/rate_runner.py +4 -4
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/read_write_runner.py +11 -15
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/serial_runner.py +20 -28
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/task_runner.py +6 -26
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/custom/displaypPrams.py +12 -1
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/submitTask.py +20 -3
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/config/dbCaseConfigs.py +32 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/interface.py +10 -19
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/log_util.py +15 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/models.py +4 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/PKG-INFO +55 -2
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/SOURCES.txt +2 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/requires.txt +4 -1
- vectordb_bench-0.0.20/vectordb_bench/backend/clients/aws_opensearch/cli.py +0 -41
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.devcontainer/Dockerfile +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.devcontainer/devcontainer.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.env.example +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.github/workflows/publish_package_on_release.yml +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/.github/workflows/pull_request.yml +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/Dockerfile +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/LICENSE +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/Makefile +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/OWNERS +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/fig/custom_case_run_test.png +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/fig/custom_dataset.png +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/setup.cfg +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/conftest.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/pytest.ini +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_bench_runner.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_chroma.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_data_source.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_dataset.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_elasticsearch_cloud.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_models.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_rate_runner.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_redis.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/test_utils.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/tests/ut_cases.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/__init__.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/__main__.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/__init__.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/cases.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aliyun_opensearch/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/alloydb/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/alloydb/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/aws_opensearch/run.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/chroma/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/elastic_cloud/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/memorydb/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/milvus/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/milvus/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgdiskann/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgdiskann/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvecto_rs/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvecto_rs/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvector/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvector/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvectorscale/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pgvectorscale/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/pinecone/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/qdrant_cloud/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/redis/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/redis/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/test/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/weaviate_cloud/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/weaviate_cloud/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/zilliz_cloud/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/zilliz_cloud/config.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/dataset.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/result_collector.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/__init__.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/runner/util.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/backend/utils.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/base.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/cli/__init__.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/cli/cli.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/cli/vectordbbench.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/config-files/sample_config.yml +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/custom/custom_case.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/charts.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/data.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/expanderStyle.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/filters.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/footer.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/headerIcon.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/nav.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/priceTable.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/check_results/stPageConfig.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/concurrent/charts.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/custom/displayCustomCase.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/custom/getCustomConfig.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/custom/initStyle.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/get_results/saveAsImage.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/autoRefresh.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/caseSelector.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/dbConfigSetting.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/dbSelector.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/generateTasks.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/hideSidebar.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/run_test/initStyle.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/components/tables/data.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/config/dbPrices.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/config/styles.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/pages/concurrent.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/pages/custom.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/pages/quries_per_dollar.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/pages/run_test.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/pages/tables.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/utils.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/frontend/vdb_benchmark.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/metric.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/ElasticCloud/result_20230727_standard_elasticcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/ElasticCloud/result_20230808_standard_elasticcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/Milvus/result_20230727_standard_milvus.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/Milvus/result_20230808_standard_milvus.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/Pinecone/result_20230727_standard_pinecone.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/Pinecone/result_20230808_standard_pinecone.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/QdrantCloud/result_20230727_standard_qdrantcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/QdrantCloud/result_20230808_standard_qdrantcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/WeaviateCloud/result_20230727_standard_weaviatecloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/WeaviateCloud/result_20230808_standard_weaviatecloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/dbPrices.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/getLeaderboardData.py +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench/results/leaderboard.json +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/dependency_links.txt +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/entry_points.txt +0 -0
- {vectordb_bench-0.0.20 → vectordb_bench-0.0.22}/vectordb_bench.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: vectordb-bench
|
3
|
-
Version: 0.0.20
|
3
|
+
Version: 0.0.22
|
4
4
|
Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
|
5
5
|
Author-email: XuanYang-cn <xuan.yang@zilliz.com>
|
6
6
|
Project-URL: repository, https://github.com/zilliztech/VectorDBBench
|
@@ -21,7 +21,7 @@ Requires-Dist: oss2
|
|
21
21
|
Requires-Dist: psutil
|
22
22
|
Requires-Dist: polars
|
23
23
|
Requires-Dist: plotly
|
24
|
-
Requires-Dist: environs
|
24
|
+
Requires-Dist: environs<14.1.0
|
25
25
|
Requires-Dist: pydantic<v2
|
26
26
|
Requires-Dist: scikit-learn
|
27
27
|
Requires-Dist: pymilvus
|
@@ -73,6 +73,8 @@ Requires-Dist: opensearch-py; extra == "opensearch"
|
|
73
73
|
Provides-Extra: aliyun-opensearch
|
74
74
|
Requires-Dist: alibabacloud_ha3engine_vector; extra == "aliyun-opensearch"
|
75
75
|
Requires-Dist: alibabacloud_searchengine20211025; extra == "aliyun-opensearch"
|
76
|
+
Provides-Extra: mongodb
|
77
|
+
Requires-Dist: pymongo; extra == "mongodb"
|
76
78
|
|
77
79
|
# VectorDBBench: A Benchmark Tool for VectorDB
|
78
80
|
|
@@ -89,6 +91,8 @@ Closely mimicking real-world production environments, we've set up diverse testi
|
|
89
91
|
|
90
92
|
Prepare to delve into the world of VectorDBBench, and let it guide you in uncovering your perfect vector database match.
|
91
93
|
|
94
|
+
VectorDBBench is sponsored by Zilliz, the leading open-source vectorDB company behind Milvus. Choose smarter with VectorDBBench — start your free test on [zilliz cloud](https://zilliz.com/) today!
|
95
|
+
|
92
96
|
**Leaderboard:** https://zilliz.com/benchmark
|
93
97
|
## Quick Start
|
94
98
|
### Prerequirement
|
@@ -128,6 +132,7 @@ All the database client supported
|
|
128
132
|
| chromadb | `pip install vectordb-bench[chromadb]` |
|
129
133
|
| awsopensearch | `pip install vectordb-bench[opensearch]` |
|
130
134
|
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
|
135
|
+
| mongodb | `pip install vectordb-bench[mongodb]` |
|
131
136
|
|
132
137
|
### Run
|
133
138
|
|
@@ -228,6 +233,47 @@ Options:
|
|
228
233
|
with-gt]
|
229
234
|
--help Show this message and exit.
|
230
235
|
```
|
236
|
+
|
237
|
+
### Run awsopensearch from command line
|
238
|
+
|
239
|
+
```shell
|
240
|
+
vectordbbench awsopensearch --db-label awsopensearch \
|
241
|
+
--m 16 --ef-construction 256 \
|
242
|
+
--host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \
|
243
|
+
--user vector --password '<password>' \
|
244
|
+
--case-type Performance1536D5M --num-insert-workers 10 \
|
245
|
+
--skip-load --num-concurrency 75
|
246
|
+
```
|
247
|
+
|
248
|
+
To list the options for awsopensearch, execute `vectordbbench awsopensearch --help`
|
249
|
+
|
250
|
+
```text
|
251
|
+
$ vectordbbench awsopensearch --help
|
252
|
+
Usage: vectordbbench awsopensearch [OPTIONS]
|
253
|
+
|
254
|
+
Options:
|
255
|
+
# Sharding and Replication
|
256
|
+
--number-of-shards INTEGER Number of primary shards for the index
|
257
|
+
--number-of-replicas INTEGER Number of replica copies for each primary
|
258
|
+
shard
|
259
|
+
# Indexing Performance
|
260
|
+
--index-thread-qty INTEGER Thread count for native engine indexing
|
261
|
+
--index-thread-qty-during-force-merge INTEGER
|
262
|
+
Thread count during force merge operations
|
263
|
+
--number-of-indexing-clients INTEGER
|
264
|
+
Number of concurrent indexing clients
|
265
|
+
# Index Management
|
266
|
+
--number-of-segments INTEGER Target number of segments after merging
|
267
|
+
--refresh-interval TEXT How often to make new data available for
|
268
|
+
search
|
269
|
+
--force-merge-enabled BOOLEAN Whether to perform force merge operation
|
270
|
+
--flush-threshold-size TEXT Size threshold for flushing the transaction
|
271
|
+
log
|
272
|
+
# Memory Management
|
273
|
+
--cb-threshold TEXT k-NN Memory circuit breaker threshold
|
274
|
+
|
275
|
+
--help Show this message and exit.```
|
276
|
+
|
231
277
|
#### Using a configuration file.
|
232
278
|
|
233
279
|
The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
|
@@ -394,6 +440,13 @@ We have strict requirements for the data set format, please follow them.
|
|
394
440
|
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
|
395
441
|
- Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
|
396
442
|
- Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
|
443
|
+
- We recommend limiting the number of test query vectors, like 1,000.
|
444
|
+
When conducting concurrent query tests, Vdbbench creates a large number of processes.
|
445
|
+
To minimize additional communication overhead during testing,
|
446
|
+
we prepare a complete set of test queries for each process, allowing them to run independently.
|
447
|
+
However, this means that as the number of concurrent processes increases,
|
448
|
+
the number of copied query vectors also increases significantly,
|
449
|
+
which can place substantial pressure on memory resources.
|
397
450
|
- Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
|
398
451
|
|
399
452
|
- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
|
@@ -13,6 +13,8 @@ Closely mimicking real-world production environments, we've set up diverse testi
|
|
13
13
|
|
14
14
|
Prepare to delve into the world of VectorDBBench, and let it guide you in uncovering your perfect vector database match.
|
15
15
|
|
16
|
+
VectorDBBench is sponsored by Zilliz, the leading open-source vectorDB company behind Milvus. Choose smarter with VectorDBBench — start your free test on [zilliz cloud](https://zilliz.com/) today!
|
17
|
+
|
16
18
|
**Leaderboard:** https://zilliz.com/benchmark
|
17
19
|
## Quick Start
|
18
20
|
### Prerequirement
|
@@ -52,6 +54,7 @@ All the database client supported
|
|
52
54
|
| chromadb | `pip install vectordb-bench[chromadb]` |
|
53
55
|
| awsopensearch | `pip install vectordb-bench[opensearch]` |
|
54
56
|
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
|
57
|
+
| mongodb | `pip install vectordb-bench[mongodb]` |
|
55
58
|
|
56
59
|
### Run
|
57
60
|
|
@@ -152,6 +155,47 @@ Options:
|
|
152
155
|
with-gt]
|
153
156
|
--help Show this message and exit.
|
154
157
|
```
|
158
|
+
|
159
|
+
### Run awsopensearch from command line
|
160
|
+
|
161
|
+
```shell
|
162
|
+
vectordbbench awsopensearch --db-label awsopensearch \
|
163
|
+
--m 16 --ef-construction 256 \
|
164
|
+
--host search-vector-db-prod-h4f6m4of6x7yp2rz7gdmots7w4.us-west-2.es.amazonaws.com --port 443 \
|
165
|
+
--user vector --password '<password>' \
|
166
|
+
--case-type Performance1536D5M --num-insert-workers 10 \
|
167
|
+
--skip-load --num-concurrency 75
|
168
|
+
```
|
169
|
+
|
170
|
+
To list the options for awsopensearch, execute `vectordbbench awsopensearch --help`
|
171
|
+
|
172
|
+
```text
|
173
|
+
$ vectordbbench awsopensearch --help
|
174
|
+
Usage: vectordbbench awsopensearch [OPTIONS]
|
175
|
+
|
176
|
+
Options:
|
177
|
+
# Sharding and Replication
|
178
|
+
--number-of-shards INTEGER Number of primary shards for the index
|
179
|
+
--number-of-replicas INTEGER Number of replica copies for each primary
|
180
|
+
shard
|
181
|
+
# Indexing Performance
|
182
|
+
--index-thread-qty INTEGER Thread count for native engine indexing
|
183
|
+
--index-thread-qty-during-force-merge INTEGER
|
184
|
+
Thread count during force merge operations
|
185
|
+
--number-of-indexing-clients INTEGER
|
186
|
+
Number of concurrent indexing clients
|
187
|
+
# Index Management
|
188
|
+
--number-of-segments INTEGER Target number of segments after merging
|
189
|
+
--refresh-interval TEXT How often to make new data available for
|
190
|
+
search
|
191
|
+
--force-merge-enabled BOOLEAN Whether to perform force merge operation
|
192
|
+
--flush-threshold-size TEXT Size threshold for flushing the transaction
|
193
|
+
log
|
194
|
+
# Memory Management
|
195
|
+
--cb-threshold TEXT k-NN Memory circuit breaker threshold
|
196
|
+
|
197
|
+
--help Show this message and exit.```
|
198
|
+
|
155
199
|
#### Using a configuration file.
|
156
200
|
|
157
201
|
The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
|
@@ -318,6 +362,13 @@ We have strict requirements for the data set format, please follow them.
|
|
318
362
|
- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
|
319
363
|
- Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
|
320
364
|
- Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
|
365
|
+
- We recommend limiting the number of test query vectors, like 1,000.
|
366
|
+
When conducting concurrent query tests, Vdbbench creates a large number of processes.
|
367
|
+
To minimize additional communication overhead during testing,
|
368
|
+
we prepare a complete set of test queries for each process, allowing them to run independently.
|
369
|
+
However, this means that as the number of concurrent processes increases,
|
370
|
+
the number of copied query vectors also increases significantly,
|
371
|
+
which can place substantial pressure on memory resources.
|
321
372
|
- Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
|
322
373
|
|
323
374
|
- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
|
@@ -35,7 +35,7 @@ dependencies = [
|
|
35
35
|
"psutil",
|
36
36
|
"polars",
|
37
37
|
"plotly",
|
38
|
-
"environs",
|
38
|
+
"environs<14.1.0",
|
39
39
|
"pydantic<v2",
|
40
40
|
"scikit-learn",
|
41
41
|
"pymilvus", # with pandas, numpy, ujson
|
@@ -85,6 +85,7 @@ memorydb = [ "memorydb" ]
|
|
85
85
|
chromadb = [ "chromadb" ]
|
86
86
|
opensearch = [ "opensearch-py" ]
|
87
87
|
aliyun_opensearch = [ "alibabacloud_ha3engine_vector", "alibabacloud_searchengine20211025"]
|
88
|
+
mongodb = [ "pymongo" ]
|
88
89
|
|
89
90
|
[project.urls]
|
90
91
|
"repository" = "https://github.com/zilliztech/VectorDBBench"
|
@@ -133,6 +134,7 @@ lint.ignore = [
|
|
133
134
|
"RUF017",
|
134
135
|
"C416",
|
135
136
|
"PLW0603",
|
137
|
+
"COM812",
|
136
138
|
]
|
137
139
|
|
138
140
|
# Allow autofix for all enabled rules (when `--fix`) is provided.
|
@@ -206,4 +208,3 @@ builtins-ignorelist = [
|
|
206
208
|
# "dict", # TODO
|
207
209
|
# "filter",
|
208
210
|
]
|
209
|
-
|
@@ -53,8 +53,8 @@ class Assembler:
|
|
53
53
|
_ = k.init_cls
|
54
54
|
|
55
55
|
# sort by dataset size
|
56
|
-
for
|
57
|
-
|
56
|
+
for _, runner in db2runner.items():
|
57
|
+
runner.sort(key=lambda x: x.ca.dataset.data.size)
|
58
58
|
|
59
59
|
all_runners = []
|
60
60
|
all_runners.extend(load_runners)
|
@@ -40,9 +40,10 @@ class DB(Enum):
|
|
40
40
|
AliyunElasticsearch = "AliyunElasticsearch"
|
41
41
|
Test = "test"
|
42
42
|
AliyunOpenSearch = "AliyunOpenSearch"
|
43
|
+
MongoDB = "MongoDB"
|
43
44
|
|
44
45
|
@property
|
45
|
-
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912
|
46
|
+
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
|
46
47
|
"""Import while in use"""
|
47
48
|
if self == DB.Milvus:
|
48
49
|
from .milvus.milvus import Milvus
|
@@ -129,11 +130,21 @@ class DB(Enum):
|
|
129
130
|
|
130
131
|
return AliyunOpenSearch
|
131
132
|
|
133
|
+
if self == DB.MongoDB:
|
134
|
+
from .mongodb.mongodb import MongoDB
|
135
|
+
|
136
|
+
return MongoDB
|
137
|
+
|
138
|
+
if self == DB.Test:
|
139
|
+
from .test.test import Test
|
140
|
+
|
141
|
+
return Test
|
142
|
+
|
132
143
|
msg = f"Unknown DB: {self.name}"
|
133
144
|
raise ValueError(msg)
|
134
145
|
|
135
146
|
@property
|
136
|
-
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912
|
147
|
+
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
|
137
148
|
"""Import while in use"""
|
138
149
|
if self == DB.Milvus:
|
139
150
|
from .milvus.config import MilvusConfig
|
@@ -220,6 +231,16 @@ class DB(Enum):
|
|
220
231
|
|
221
232
|
return AliyunOpenSearchConfig
|
222
233
|
|
234
|
+
if self == DB.MongoDB:
|
235
|
+
from .mongodb.config import MongoDBConfig
|
236
|
+
|
237
|
+
return MongoDBConfig
|
238
|
+
|
239
|
+
if self == DB.Test:
|
240
|
+
from .test.config import TestConfig
|
241
|
+
|
242
|
+
return TestConfig
|
243
|
+
|
223
244
|
msg = f"Unknown DB: {self.name}"
|
224
245
|
raise ValueError(msg)
|
225
246
|
|
@@ -292,6 +313,11 @@ class DB(Enum):
|
|
292
313
|
|
293
314
|
return AliyunOpenSearchIndexConfig
|
294
315
|
|
316
|
+
if self == DB.MongoDB:
|
317
|
+
from .mongodb.config import MongoDBIndexConfig
|
318
|
+
|
319
|
+
return MongoDBIndexConfig
|
320
|
+
|
295
321
|
# DB.Pinecone, DB.Chroma, DB.Redis
|
296
322
|
return EmptyDBCaseConfig
|
297
323
|
|
@@ -325,10 +325,7 @@ class AliyunOpenSearch(VectorDB):
|
|
325
325
|
|
326
326
|
return False
|
327
327
|
|
328
|
-
def optimize(self):
|
329
|
-
pass
|
330
|
-
|
331
|
-
def optimize_with_size(self, data_size: int):
|
328
|
+
def optimize(self, data_size: int):
|
332
329
|
log.info(f"optimize count: {data_size}")
|
333
330
|
retry_times = 0
|
334
331
|
while True:
|
@@ -340,6 +337,3 @@ class AliyunOpenSearch(VectorDB):
|
|
340
337
|
if total_count == data_size:
|
341
338
|
log.info("optimize table finish.")
|
342
339
|
return
|
343
|
-
|
344
|
-
def ready_to_load(self):
|
345
|
-
"""ready_to_load will be called before load in load cases."""
|
@@ -137,6 +137,13 @@ class VectorDB(ABC):
|
|
137
137
|
@contextmanager
|
138
138
|
def init(self) -> None:
|
139
139
|
"""create and destory connections to database.
|
140
|
+
Why contextmanager:
|
141
|
+
|
142
|
+
In multiprocessing search tasks, vectordbbench might init
|
143
|
+
totally hundreds of thousands of connections with DB server.
|
144
|
+
|
145
|
+
Too many connections may drain local FDs or server connection resources.
|
146
|
+
If the DB client doesn't have `close()` method, just set the object to None.
|
140
147
|
|
141
148
|
Examples:
|
142
149
|
>>> with self.init():
|
@@ -187,9 +194,8 @@ class VectorDB(ABC):
|
|
187
194
|
"""
|
188
195
|
raise NotImplementedError
|
189
196
|
|
190
|
-
# TODO: remove
|
191
197
|
@abstractmethod
|
192
|
-
def optimize(self):
|
198
|
+
def optimize(self, data_size: int | None = None):
|
193
199
|
"""optimize will be called between insertion and search in performance cases.
|
194
200
|
|
195
201
|
Should be blocked until the vectorDB is ready to be tested on
|
@@ -199,16 +205,3 @@ class VectorDB(ABC):
|
|
199
205
|
Optimize's execution time is limited, the limited time is based on cases.
|
200
206
|
"""
|
201
207
|
raise NotImplementedError
|
202
|
-
|
203
|
-
def optimize_with_size(self, data_size: int):
|
204
|
-
self.optimize()
|
205
|
-
|
206
|
-
# TODO: remove
|
207
|
-
@abstractmethod
|
208
|
-
def ready_to_load(self):
|
209
|
-
"""ready_to_load will be called before load in load cases.
|
210
|
-
|
211
|
-
Should be blocked until the vectorDB is ready to be tested on
|
212
|
-
heavy load cases.
|
213
|
-
"""
|
214
|
-
raise NotImplementedError
|
@@ -12,6 +12,7 @@ log = logging.getLogger(__name__)
|
|
12
12
|
|
13
13
|
WAITING_FOR_REFRESH_SEC = 30
|
14
14
|
WAITING_FOR_FORCE_MERGE_SEC = 30
|
15
|
+
SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC = 30
|
15
16
|
|
16
17
|
|
17
18
|
class AWSOpenSearch(VectorDB):
|
@@ -52,10 +53,27 @@ class AWSOpenSearch(VectorDB):
|
|
52
53
|
return AWSOpenSearchIndexConfig
|
53
54
|
|
54
55
|
def _create_index(self, client: OpenSearch):
|
56
|
+
cluster_settings_body = {
|
57
|
+
"persistent": {
|
58
|
+
"knn.algo_param.index_thread_qty": self.case_config.index_thread_qty,
|
59
|
+
"knn.memory.circuit_breaker.limit": self.case_config.cb_threshold,
|
60
|
+
}
|
61
|
+
}
|
62
|
+
client.cluster.put_settings(cluster_settings_body)
|
55
63
|
settings = {
|
56
64
|
"index": {
|
57
65
|
"knn": True,
|
66
|
+
"number_of_shards": self.case_config.number_of_shards,
|
67
|
+
"number_of_replicas": 0,
|
68
|
+
"translog.flush_threshold_size": self.case_config.flush_threshold_size,
|
69
|
+
# Setting trans log threshold to 5GB
|
70
|
+
**(
|
71
|
+
{"knn.algo_param.ef_search": self.case_config.ef_search}
|
72
|
+
if self.case_config.engine == AWSOS_Engine.nmslib
|
73
|
+
else {}
|
74
|
+
),
|
58
75
|
},
|
76
|
+
"refresh_interval": self.case_config.refresh_interval,
|
59
77
|
}
|
60
78
|
mappings = {
|
61
79
|
"properties": {
|
@@ -145,24 +163,49 @@ class AWSOpenSearch(VectorDB):
|
|
145
163
|
docvalue_fields=[self.id_col_name],
|
146
164
|
stored_fields="_none_",
|
147
165
|
)
|
148
|
-
log.
|
149
|
-
log.
|
150
|
-
log.
|
166
|
+
log.debug(f"Search took: {resp['took']}")
|
167
|
+
log.debug(f"Search shards: {resp['_shards']}")
|
168
|
+
log.debug(f"Search hits total: {resp['hits']['total']}")
|
151
169
|
return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
|
152
170
|
except Exception as e:
|
153
171
|
log.warning(f"Failed to search: {self.index_name} error: {e!s}")
|
154
172
|
raise e from None
|
155
173
|
|
156
|
-
def optimize(self):
|
174
|
+
def optimize(self, data_size: int | None = None):
|
157
175
|
"""optimize will be called between insertion and search in performance cases."""
|
158
176
|
# Call refresh first to ensure that all segments are created
|
159
177
|
self._refresh_index()
|
160
|
-
self.
|
178
|
+
if self.case_config.force_merge_enabled:
|
179
|
+
self._do_force_merge()
|
180
|
+
self._refresh_index()
|
181
|
+
self._update_replicas()
|
161
182
|
# Call refresh again to ensure that the index is ready after force merge.
|
162
183
|
self._refresh_index()
|
163
184
|
# ensure that all graphs are loaded in memory and ready for search
|
164
185
|
self._load_graphs_to_memory()
|
165
186
|
|
187
|
+
def _update_replicas(self):
|
188
|
+
index_settings = self.client.indices.get_settings(index=self.index_name)
|
189
|
+
current_number_of_replicas = int(index_settings[self.index_name]["settings"]["index"]["number_of_replicas"])
|
190
|
+
log.info(
|
191
|
+
f"Current Number of replicas are {current_number_of_replicas}"
|
192
|
+
f" and changing the replicas to {self.case_config.number_of_replicas}"
|
193
|
+
)
|
194
|
+
settings_body = {"index": {"number_of_replicas": self.case_config.number_of_replicas}}
|
195
|
+
self.client.indices.put_settings(index=self.index_name, body=settings_body)
|
196
|
+
self._wait_till_green()
|
197
|
+
|
198
|
+
def _wait_till_green(self):
|
199
|
+
log.info("Wait for index to become green..")
|
200
|
+
while True:
|
201
|
+
res = self.client.cat.indices(index=self.index_name, h="health", format="json")
|
202
|
+
health = res[0]["health"]
|
203
|
+
if health != "green":
|
204
|
+
break
|
205
|
+
log.info(f"The index {self.index_name} has health : {health} and is not green. Retrying")
|
206
|
+
time.sleep(SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC)
|
207
|
+
log.info(f"Index {self.index_name} is green..")
|
208
|
+
|
166
209
|
def _refresh_index(self):
|
167
210
|
log.debug(f"Starting refresh for index {self.index_name}")
|
168
211
|
while True:
|
@@ -179,6 +222,12 @@ class AWSOpenSearch(VectorDB):
|
|
179
222
|
log.debug(f"Completed refresh for index {self.index_name}")
|
180
223
|
|
181
224
|
def _do_force_merge(self):
|
225
|
+
log.info(f"Updating the Index thread qty to {self.case_config.index_thread_qty_during_force_merge}.")
|
226
|
+
|
227
|
+
cluster_settings_body = {
|
228
|
+
"persistent": {"knn.algo_param.index_thread_qty": self.case_config.index_thread_qty_during_force_merge}
|
229
|
+
}
|
230
|
+
self.client.cluster.put_settings(cluster_settings_body)
|
182
231
|
log.debug(f"Starting force merge for index {self.index_name}")
|
183
232
|
force_merge_endpoint = f"/{self.index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false"
|
184
233
|
force_merge_task_id = self.client.transport.perform_request("POST", force_merge_endpoint)["task"]
|
@@ -194,6 +243,3 @@ class AWSOpenSearch(VectorDB):
|
|
194
243
|
log.info("Calling warmup API to load graphs into memory")
|
195
244
|
warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
|
196
245
|
self.client.transport.perform_request("GET", warmup_endpoint)
|
197
|
-
|
198
|
-
def ready_to_load(self):
|
199
|
-
"""ready_to_load will be called before load in load cases."""
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from typing import Annotated, TypedDict, Unpack
|
2
|
+
|
3
|
+
import click
|
4
|
+
from pydantic import SecretStr
|
5
|
+
|
6
|
+
from ....cli.cli import (
|
7
|
+
CommonTypedDict,
|
8
|
+
HNSWFlavor2,
|
9
|
+
cli,
|
10
|
+
click_parameter_decorators_from_typed_dict,
|
11
|
+
run,
|
12
|
+
)
|
13
|
+
from .. import DB
|
14
|
+
|
15
|
+
|
16
|
+
class AWSOpenSearchTypedDict(TypedDict):
|
17
|
+
host: Annotated[str, click.option("--host", type=str, help="Db host", required=True)]
|
18
|
+
port: Annotated[int, click.option("--port", type=int, default=443, help="Db Port")]
|
19
|
+
user: Annotated[str, click.option("--user", type=str, default="admin", help="Db User")]
|
20
|
+
password: Annotated[str, click.option("--password", type=str, help="Db password")]
|
21
|
+
number_of_shards: Annotated[
|
22
|
+
int,
|
23
|
+
click.option("--number-of-shards", type=int, help="Number of primary shards for the index", default=1),
|
24
|
+
]
|
25
|
+
number_of_replicas: Annotated[
|
26
|
+
int,
|
27
|
+
click.option(
|
28
|
+
"--number-of-replicas", type=int, help="Number of replica copies for each primary shard", default=1
|
29
|
+
),
|
30
|
+
]
|
31
|
+
index_thread_qty: Annotated[
|
32
|
+
int,
|
33
|
+
click.option(
|
34
|
+
"--index-thread-qty",
|
35
|
+
type=int,
|
36
|
+
help="Thread count for native engine indexing",
|
37
|
+
default=4,
|
38
|
+
),
|
39
|
+
]
|
40
|
+
|
41
|
+
index_thread_qty_during_force_merge: Annotated[
|
42
|
+
int,
|
43
|
+
click.option(
|
44
|
+
"--index-thread-qty-during-force-merge",
|
45
|
+
type=int,
|
46
|
+
help="Thread count during force merge operations",
|
47
|
+
default=4,
|
48
|
+
),
|
49
|
+
]
|
50
|
+
|
51
|
+
number_of_indexing_clients: Annotated[
|
52
|
+
int,
|
53
|
+
click.option(
|
54
|
+
"--number-of-indexing-clients",
|
55
|
+
type=int,
|
56
|
+
help="Number of concurrent indexing clients",
|
57
|
+
default=1,
|
58
|
+
),
|
59
|
+
]
|
60
|
+
|
61
|
+
number_of_segments: Annotated[
|
62
|
+
int,
|
63
|
+
click.option("--number-of-segments", type=int, help="Target number of segments after merging", default=1),
|
64
|
+
]
|
65
|
+
|
66
|
+
refresh_interval: Annotated[
|
67
|
+
int,
|
68
|
+
click.option(
|
69
|
+
"--refresh-interval", type=str, help="How often to make new data available for search", default="60s"
|
70
|
+
),
|
71
|
+
]
|
72
|
+
|
73
|
+
force_merge_enabled: Annotated[
|
74
|
+
int,
|
75
|
+
click.option("--force-merge-enabled", type=bool, help="Whether to perform force merge operation", default=True),
|
76
|
+
]
|
77
|
+
|
78
|
+
flush_threshold_size: Annotated[
|
79
|
+
int,
|
80
|
+
click.option(
|
81
|
+
"--flush-threshold-size", type=str, help="Size threshold for flushing the transaction log", default="5120mb"
|
82
|
+
),
|
83
|
+
]
|
84
|
+
|
85
|
+
cb_threshold: Annotated[
|
86
|
+
int,
|
87
|
+
click.option(
|
88
|
+
"--cb-threshold",
|
89
|
+
type=str,
|
90
|
+
help="k-NN Memory circuit breaker threshold",
|
91
|
+
default="50%",
|
92
|
+
),
|
93
|
+
]
|
94
|
+
|
95
|
+
|
96
|
+
class AWSOpenSearchHNSWTypedDict(CommonTypedDict, AWSOpenSearchTypedDict, HNSWFlavor2): ...
|
97
|
+
|
98
|
+
|
99
|
+
@cli.command()
|
100
|
+
@click_parameter_decorators_from_typed_dict(AWSOpenSearchHNSWTypedDict)
|
101
|
+
def AWSOpenSearch(**parameters: Unpack[AWSOpenSearchHNSWTypedDict]):
|
102
|
+
from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
|
103
|
+
|
104
|
+
run(
|
105
|
+
db=DB.AWSOpenSearch,
|
106
|
+
db_config=AWSOpenSearchConfig(
|
107
|
+
host=parameters["host"],
|
108
|
+
port=parameters["port"],
|
109
|
+
user=parameters["user"],
|
110
|
+
password=SecretStr(parameters["password"]),
|
111
|
+
),
|
112
|
+
db_case_config=AWSOpenSearchIndexConfig(
|
113
|
+
number_of_shards=parameters["number_of_shards"],
|
114
|
+
number_of_replicas=parameters["number_of_replicas"],
|
115
|
+
index_thread_qty=parameters["index_thread_qty"],
|
116
|
+
number_of_segments=parameters["number_of_segments"],
|
117
|
+
refresh_interval=parameters["refresh_interval"],
|
118
|
+
force_merge_enabled=parameters["force_merge_enabled"],
|
119
|
+
flush_threshold_size=parameters["flush_threshold_size"],
|
120
|
+
number_of_indexing_clients=parameters["number_of_indexing_clients"],
|
121
|
+
index_thread_qty_during_force_merge=parameters["index_thread_qty_during_force_merge"],
|
122
|
+
cb_threshold=parameters["cb_threshold"],
|
123
|
+
),
|
124
|
+
**parameters,
|
125
|
+
)
|
@@ -39,6 +39,16 @@ class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
|
|
39
39
|
efConstruction: int = 256
|
40
40
|
efSearch: int = 256
|
41
41
|
M: int = 16
|
42
|
+
index_thread_qty: int | None = 4
|
43
|
+
number_of_shards: int | None = 1
|
44
|
+
number_of_replicas: int | None = 0
|
45
|
+
number_of_segments: int | None = 1
|
46
|
+
refresh_interval: str | None = "60s"
|
47
|
+
force_merge_enabled: bool | None = True
|
48
|
+
flush_threshold_size: str | None = "5120mb"
|
49
|
+
number_of_indexing_clients: int | None = 1
|
50
|
+
index_thread_qty_during_force_merge: int
|
51
|
+
cb_threshold: str | None = "50%"
|
42
52
|
|
43
53
|
def parse_metric(self) -> str:
|
44
54
|
if self.metric_type == MetricType.IP:
|
@@ -143,7 +143,7 @@ class ElasticCloud(VectorDB):
|
|
143
143
|
log.warning(f"Failed to search: {self.indice} error: {e!s}")
|
144
144
|
raise e from None
|
145
145
|
|
146
|
-
def optimize(self):
|
146
|
+
def optimize(self, data_size: int | None = None):
|
147
147
|
"""optimize will be called between insertion and search in performance cases."""
|
148
148
|
assert self.client is not None, "should self.init() first"
|
149
149
|
self.client.indices.refresh(index=self.indice)
|
@@ -158,6 +158,3 @@ class ElasticCloud(VectorDB):
|
|
158
158
|
task_status = self.client.tasks.get(task_id=force_merge_task_id)
|
159
159
|
if task_status["completed"]:
|
160
160
|
return
|
161
|
-
|
162
|
-
def ready_to_load(self):
|
163
|
-
"""ready_to_load will be called before load in load cases."""
|