PyPI - vectordb-bench - Versions diffs - 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl - Mend

vectordb-bench 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

vectordb_bench/backend/assembler.py CHANGED Viewed

@@ -53,8 +53,8 @@ class Assembler:
             _ = k.init_cls
         # sort by dataset size
-        for k, _ in db2runner:
-            db2runner[k].sort(key=lambda x: x.ca.dataset.data.size)
+        for _, runner in db2runner.items():
+            runner.sort(key=lambda x: x.ca.dataset.data.size)
         all_runners = []
         all_runners.extend(load_runners)

vectordb_bench/backend/clients/__init__.py CHANGED Viewed

@@ -40,9 +40,10 @@ class DB(Enum):
     AliyunElasticsearch = "AliyunElasticsearch"
     Test = "test"
     AliyunOpenSearch = "AliyunOpenSearch"
+    MongoDB = "MongoDB"
     @property
-    def init_cls(self) -> type[VectorDB]:  # noqa: PLR0911, PLR0912
+    def init_cls(self) -> type[VectorDB]:  # noqa: PLR0911, PLR0912, C901
         """Import while in use"""
         if self == DB.Milvus:
             from .milvus.milvus import Milvus
@@ -129,11 +130,21 @@ class DB(Enum):
             return AliyunOpenSearch
+        if self == DB.MongoDB:
+            from .mongodb.mongodb import MongoDB
+            return MongoDB
+        if self == DB.Test:
+            from .test.test import Test
+            return Test
         msg = f"Unknown DB: {self.name}"
         raise ValueError(msg)
     @property
-    def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912
+    def config_cls(self) -> type[DBConfig]:  # noqa: PLR0911, PLR0912, C901
         """Import while in use"""
         if self == DB.Milvus:
             from .milvus.config import MilvusConfig
@@ -220,6 +231,16 @@ class DB(Enum):
             return AliyunOpenSearchConfig
+        if self == DB.MongoDB:
+            from .mongodb.config import MongoDBConfig
+            return MongoDBConfig
+        if self == DB.Test:
+            from .test.config import TestConfig
+            return TestConfig
         msg = f"Unknown DB: {self.name}"
         raise ValueError(msg)
@@ -292,6 +313,11 @@ class DB(Enum):
             return AliyunOpenSearchIndexConfig
+        if self == DB.MongoDB:
+            from .mongodb.config import MongoDBIndexConfig
+            return MongoDBIndexConfig
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig

vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py CHANGED Viewed

@@ -325,10 +325,7 @@ class AliyunOpenSearch(VectorDB):
         return False
-    def optimize(self):
-        pass
-    def optimize_with_size(self, data_size: int):
+    def optimize(self, data_size: int):
         log.info(f"optimize count: {data_size}")
         retry_times = 0
         while True:
@@ -340,6 +337,3 @@ class AliyunOpenSearch(VectorDB):
             if total_count == data_size:
                 log.info("optimize table finish.")
                 return
-    def ready_to_load(self):
-        """ready_to_load will be called before load in load cases."""

vectordb_bench/backend/clients/alloydb/alloydb.py CHANGED Viewed

@@ -149,10 +149,7 @@ class AlloyDB(VectorDB):
         )
         self.conn.commit()
-    def ready_to_load(self):
-        pass
-    def optimize(self):
+    def optimize(self, data_size: int | None = None):
         self._post_insert()
     def _post_insert(self):

vectordb_bench/backend/clients/api.py CHANGED Viewed

@@ -137,6 +137,13 @@ class VectorDB(ABC):
     @contextmanager
     def init(self) -> None:
         """create and destory connections to database.
+        Why contextmanager:
+            In multiprocessing search tasks, vectordbbench might init
+            totally hundreds of thousands of connections with DB server.
+            Too many connections may drain local FDs or server connection resources.
+            If the DB client doesn't have `close()` method, just set the object to None.
         Examples:
             >>> with self.init():
@@ -187,9 +194,8 @@ class VectorDB(ABC):
         """
         raise NotImplementedError
-    # TODO: remove
     @abstractmethod
-    def optimize(self):
+    def optimize(self, data_size: int | None = None):
         """optimize will be called between insertion and search in performance cases.
         Should be blocked until the vectorDB is ready to be tested on
@@ -199,16 +205,3 @@ class VectorDB(ABC):
         Optimize's execution time is limited, the limited time is based on cases.
         """
         raise NotImplementedError
-    def optimize_with_size(self, data_size: int):
-        self.optimize()
-    # TODO: remove
-    @abstractmethod
-    def ready_to_load(self):
-        """ready_to_load will be called before load in load cases.
-        Should be blocked until the vectorDB is ready to be tested on
-        heavy load cases.
-        """
-        raise NotImplementedError

vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py CHANGED Viewed

@@ -12,6 +12,7 @@ log = logging.getLogger(__name__)
 WAITING_FOR_REFRESH_SEC = 30
 WAITING_FOR_FORCE_MERGE_SEC = 30
+SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC = 30
 class AWSOpenSearch(VectorDB):
@@ -52,10 +53,27 @@ class AWSOpenSearch(VectorDB):
         return AWSOpenSearchIndexConfig
     def _create_index(self, client: OpenSearch):
+        cluster_settings_body = {
+            "persistent": {
+                "knn.algo_param.index_thread_qty": self.case_config.index_thread_qty,
+                "knn.memory.circuit_breaker.limit": self.case_config.cb_threshold,
+            }
+        }
+        client.cluster.put_settings(cluster_settings_body)
         settings = {
             "index": {
                 "knn": True,
+                "number_of_shards": self.case_config.number_of_shards,
+                "number_of_replicas": 0,
+                "translog.flush_threshold_size": self.case_config.flush_threshold_size,
+                # Setting trans log threshold to 5GB
+                **(
+                    {"knn.algo_param.ef_search": self.case_config.ef_search}
+                    if self.case_config.engine == AWSOS_Engine.nmslib
+                    else {}
+                ),
             },
+            "refresh_interval": self.case_config.refresh_interval,
         }
         mappings = {
             "properties": {
@@ -145,24 +163,49 @@ class AWSOpenSearch(VectorDB):
                 docvalue_fields=[self.id_col_name],
                 stored_fields="_none_",
             )
-            log.info(f'Search took: {resp["took"]}')
-            log.info(f'Search shards: {resp["_shards"]}')
-            log.info(f'Search hits total: {resp["hits"]["total"]}')
+            log.debug(f"Search took: {resp['took']}")
+            log.debug(f"Search shards: {resp['_shards']}")
+            log.debug(f"Search hits total: {resp['hits']['total']}")
             return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
         except Exception as e:
             log.warning(f"Failed to search: {self.index_name} error: {e!s}")
             raise e from None
-    def optimize(self):
+    def optimize(self, data_size: int | None = None):
         """optimize will be called between insertion and search in performance cases."""
         # Call refresh first to ensure that all segments are created
         self._refresh_index()
-        self._do_force_merge()
+        if self.case_config.force_merge_enabled:
+            self._do_force_merge()
+            self._refresh_index()
+        self._update_replicas()
         # Call refresh again to ensure that the index is ready after force merge.
         self._refresh_index()
         # ensure that all graphs are loaded in memory and ready for search
         self._load_graphs_to_memory()
+    def _update_replicas(self):
+        """Set the index replica count to the configured value, then wait for green health."""
+        # Look up the replica count currently stored on the index; it is only used for the log line.
+        live_settings = self.client.indices.get_settings(index=self.index_name)
+        replicas_now = int(live_settings[self.index_name]["settings"]["index"]["number_of_replicas"])
+        target_replicas = self.case_config.number_of_replicas
+        log.info(f"Current Number of replicas are {replicas_now} and changing the replicas to {target_replicas}")
+        self.client.indices.put_settings(
+            index=self.index_name,
+            body={"index": {"number_of_replicas": target_replicas}},
+        )
+        # Block until the index health check passes before returning to the caller.
+        self._wait_till_green()
+    def _wait_till_green(self):
+        """Block until the cat-indices API reports ``green`` health for this index.
+        Polls once per iteration and sleeps SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC
+        between attempts.
+        """
+        log.info("Wait for index to become green..")
+        # NOTE(review): there is no retry limit; an index that never turns green keeps this loop waiting.
+        while True:
+            res = self.client.cat.indices(index=self.index_name, h="health", format="json")
+            health = res[0]["health"]
+            # Leave the loop only once the index really is green; the previous `!=` check
+            # returned on the first non-green answer and kept sleeping while the index was green.
+            if health == "green":
+                break
+            log.info(f"The index {self.index_name} has health : {health} and is not green. Retrying")
+            time.sleep(SECONDS_WAITING_FOR_REPLICAS_TO_BE_ENABLED_SEC)
+        log.info(f"Index {self.index_name} is green..")
     def _refresh_index(self):
         log.debug(f"Starting refresh for index {self.index_name}")
         while True:
@@ -179,6 +222,12 @@ class AWSOpenSearch(VectorDB):
         log.debug(f"Completed refresh for index {self.index_name}")
     def _do_force_merge(self):
+        log.info(f"Updating the Index thread qty to {self.case_config.index_thread_qty_during_force_merge}.")
+        cluster_settings_body = {
+            "persistent": {"knn.algo_param.index_thread_qty": self.case_config.index_thread_qty_during_force_merge}
+        }
+        self.client.cluster.put_settings(cluster_settings_body)
         log.debug(f"Starting force merge for index {self.index_name}")
         force_merge_endpoint = f"/{self.index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false"
         force_merge_task_id = self.client.transport.perform_request("POST", force_merge_endpoint)["task"]
@@ -194,6 +243,3 @@ class AWSOpenSearch(VectorDB):
             log.info("Calling warmup API to load graphs into memory")
             warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
             self.client.transport.perform_request("GET", warmup_endpoint)
-    def ready_to_load(self):
-        """ready_to_load will be called before load in load cases."""

vectordb_bench/backend/clients/aws_opensearch/cli.py CHANGED Viewed

@@ -18,6 +18,79 @@ class AWSOpenSearchTypedDict(TypedDict):
     port: Annotated[int, click.option("--port", type=int, default=443, help="Db Port")]
     user: Annotated[str, click.option("--user", type=str, default="admin", help="Db User")]
     password: Annotated[str, click.option("--password", type=str, help="Db password")]
+    number_of_shards: Annotated[
+        int,
+        click.option("--number-of-shards", type=int, help="Number of primary shards for the index", default=1),
+    ]
+    number_of_replicas: Annotated[
+        int,
+        click.option(
+            "--number-of-replicas", type=int, help="Number of replica copies for each primary shard", default=1
+        ),
+    ]
+    index_thread_qty: Annotated[
+        int,
+        click.option(
+            "--index-thread-qty",
+            type=int,
+            help="Thread count for native engine indexing",
+            default=4,
+        ),
+    ]
+    index_thread_qty_during_force_merge: Annotated[
+        int,
+        click.option(
+            "--index-thread-qty-during-force-merge",
+            type=int,
+            help="Thread count during force merge operations",
+            default=4,
+        ),
+    ]
+    number_of_indexing_clients: Annotated[
+        int,
+        click.option(
+            "--number-of-indexing-clients",
+            type=int,
+            help="Number of concurrent indexing clients",
+            default=1,
+        ),
+    ]
+    number_of_segments: Annotated[
+        int,
+        click.option("--number-of-segments", type=int, help="Target number of segments after merging", default=1),
+    ]
+    # The first Annotated argument is the value type of the key; for the four options
+    # below it now agrees with the `type=` and `default=` given to click.option.
+    refresh_interval: Annotated[
+        str,
+        click.option(
+            "--refresh-interval", type=str, help="How often to make new data available for search", default="60s"
+        ),
+    ]
+    force_merge_enabled: Annotated[
+        bool,
+        click.option("--force-merge-enabled", type=bool, help="Whether to perform force merge operation", default=True),
+    ]
+    flush_threshold_size: Annotated[
+        str,
+        click.option(
+            "--flush-threshold-size", type=str, help="Size threshold for flushing the transaction log", default="5120mb"
+        ),
+    ]
+    cb_threshold: Annotated[
+        str,
+        click.option(
+            "--cb-threshold",
+            type=str,
+            help="k-NN Memory circuit breaker threshold",
+            default="50%",
+        ),
+    ]
 class AWSOpenSearchHNSWTypedDict(CommonTypedDict, AWSOpenSearchTypedDict, HNSWFlavor2): ...
@@ -36,6 +109,17 @@ def AWSOpenSearch(**parameters: Unpack[AWSOpenSearchHNSWTypedDict]):
             user=parameters["user"],
             password=SecretStr(parameters["password"]),
         ),
-        db_case_config=AWSOpenSearchIndexConfig(),
+        db_case_config=AWSOpenSearchIndexConfig(
+            number_of_shards=parameters["number_of_shards"],
+            number_of_replicas=parameters["number_of_replicas"],
+            index_thread_qty=parameters["index_thread_qty"],
+            number_of_segments=parameters["number_of_segments"],
+            refresh_interval=parameters["refresh_interval"],
+            force_merge_enabled=parameters["force_merge_enabled"],
+            flush_threshold_size=parameters["flush_threshold_size"],
+            number_of_indexing_clients=parameters["number_of_indexing_clients"],
+            index_thread_qty_during_force_merge=parameters["index_thread_qty_during_force_merge"],
+            cb_threshold=parameters["cb_threshold"],
+        ),
         **parameters,
     )

vectordb_bench/backend/clients/aws_opensearch/config.py CHANGED Viewed

@@ -39,6 +39,16 @@ class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
     efConstruction: int = 256
     efSearch: int = 256
     M: int = 16
+    # Index, cluster and merge tuning knobs; each one has a default so existing callers keep working.
+    index_thread_qty: int | None = 4
+    number_of_shards: int | None = 1
+    number_of_replicas: int | None = 0
+    number_of_segments: int | None = 1
+    refresh_interval: str | None = "60s"
+    force_merge_enabled: bool | None = True
+    flush_threshold_size: str | None = "5120mb"
+    number_of_indexing_clients: int | None = 1
+    # Previously the only new field without a default, which made it a required argument.
+    # 4 is the default of the --index-thread-qty-during-force-merge CLI option.
+    index_thread_qty_during_force_merge: int = 4
+    cb_threshold: str | None = "50%"
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.IP:

vectordb_bench/backend/clients/chroma/chroma.py CHANGED Viewed

@@ -57,10 +57,7 @@ class ChromaClient(VectorDB):
     def ready_to_search(self) -> bool:
         pass
-    def ready_to_load(self) -> bool:
-        pass
-    def optimize(self) -> None:
+    def optimize(self, data_size: int | None = None):
         pass
     def insert_embeddings(

vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py CHANGED Viewed

@@ -143,7 +143,7 @@ class ElasticCloud(VectorDB):
             log.warning(f"Failed to search: {self.indice} error: {e!s}")
             raise e from None
-    def optimize(self):
+    def optimize(self, data_size: int | None = None):
         """optimize will be called between insertion and search in performance cases."""
         assert self.client is not None, "should self.init() first"
         self.client.indices.refresh(index=self.indice)
@@ -158,6 +158,3 @@ class ElasticCloud(VectorDB):
             task_status = self.client.tasks.get(task_id=force_merge_task_id)
             if task_status["completed"]:
                 return
-    def ready_to_load(self):
-        """ready_to_load will be called before load in load cases."""

vectordb_bench/backend/clients/memorydb/cli.py CHANGED Viewed

@@ -43,8 +43,8 @@ class MemoryDBTypedDict(TypedDict):
             show_default=True,
             default=False,
             help=(
-                "Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance.",
-                " In production, MemoryDB only supports cluster mode (CME)",
+                "Cluster Mode Disabled (CMD), use this flag when testing locally on a single node instance."
+                " In production, MemoryDB only supports cluster mode (CME)"
             ),
         ),
     ]

vectordb_bench/backend/clients/memorydb/memorydb.py CHANGED Viewed

@@ -157,17 +157,14 @@ class MemoryDB(VectorDB):
         self.conn = self.get_client()
         search_param = self.case_config.search_param()
         if search_param["ef_runtime"]:
-            self.ef_runtime_str = f'EF_RUNTIME {search_param["ef_runtime"]}'
+            self.ef_runtime_str = f"EF_RUNTIME {search_param['ef_runtime']}"
         else:
             self.ef_runtime_str = ""
         yield
         self.conn.close()
         self.conn = None
-    def ready_to_load(self) -> bool:
-        pass
-    def optimize(self) -> None:
+    def optimize(self, data_size: int | None = None):
         self._post_insert()
     def insert_embeddings(

vectordb_bench/backend/clients/milvus/milvus.py CHANGED Viewed

@@ -138,26 +138,7 @@ class Milvus(VectorDB):
             log.warning(f"{self.name} optimize error: {e}")
             raise e from None
-    def ready_to_load(self):
-        assert self.col, "Please call self.init() before"
-        self._pre_load(self.col)
-    def _pre_load(self, coll: Collection):
-        try:
-            if not coll.has_index(index_name=self._index_name):
-                log.info(f"{self.name} create index")
-                coll.create_index(
-                    self._vector_field,
-                    self.case_config.index_param(),
-                    index_name=self._index_name,
-                )
-            coll.load()
-            log.info(f"{self.name} load")
-        except Exception as e:
-            log.warning(f"{self.name} pre load error: {e}")
-            raise e from None
-    def optimize(self):
+    def optimize(self, data_size: int | None = None):
         assert self.col, "Please call self.init() before"
         self._optimize()

vectordb_bench/backend/clients/mongodb/config.py ADDED Viewed

@@ -0,0 +1,53 @@
+from enum import Enum
+from pydantic import BaseModel, SecretStr
+from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
+class QuantizationType(Enum):
+    """Quantization modes; the member value is written to the ``quantization`` key of the index definition."""
+    NONE = "none"
+    BINARY = "binary"
+    SCALAR = "scalar"
+class MongoDBConfig(DBConfig, BaseModel):
+    """Connection settings (URI and database name) for the MongoDB client."""
+    # Placeholder URI template: <user>, <password> and <cluster_name> must be filled in.
+    connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
+    database: str = "vdb_bench"
+    def to_dict(self) -> dict:
+        """Return the settings as a plain dict with the connection string unmasked."""
+        uri = self.connection_string
+        # The class-level default above is a plain str, not a SecretStr.
+        # NOTE(review): assumes DBConfig does not turn on default validation; in that case the
+        # default is never coerced, and calling get_secret_value() on it would raise AttributeError.
+        if isinstance(uri, SecretStr):
+            uri = uri.get_secret_value()
+        return {
+            "connection_string": uri,
+            "database": self.database,
+        }
+class MongoDBIndexConfig(BaseModel, DBCaseConfig):
+    """Index and search options for the MongoDB vector search cases."""
+    index: IndexType = IndexType.HNSW  # index type declared for MongoDB vector search
+    metric_type: MetricType = MetricType.COSINE
+    num_candidates_ratio: int = 10  # default ratio handed out by search_param()
+    quantization: QuantizationType = QuantizationType.NONE  # quantization mode, if any
+    def parse_metric(self) -> str:
+        """Translate ``metric_type`` into the similarity name used in the index definition."""
+        chosen_metric = self.metric_type
+        if chosen_metric == MetricType.L2:
+            similarity = "euclidean"
+        elif chosen_metric == MetricType.IP:
+            similarity = "dotProduct"
+        else:
+            # Every other metric falls back to cosine similarity.
+            similarity = "cosine"
+        return similarity
+    def index_param(self) -> dict:
+        """Build the vector search index definition for the ``vector`` field."""
+        vector_field = {
+            "type": "vector",
+            "similarity": self.parse_metric(),
+            # Left as None here; per the original note it is filled in by the MongoDB client class.
+            "numDimensions": None,
+            "path": "vector",  # name of the field that stores the embedding
+            "quantization": self.quantization.value,
+        }
+        return {"type": "vectorSearch", "fields": [vector_field]}
+    def search_param(self) -> dict:
+        """Return the search-time options (currently only the candidates ratio)."""
+        ratio = self.num_candidates_ratio
+        return {"num_candidates_ratio": ratio}

vectordb-bench 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl

vectordb-bench 0.0.20__py3-none-any.whl → 0.0.22__py3-none-any.whl