vectordb-bench 0.0.19__py3-none-any.whl → 0.0.21__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +55 -45
- vectordb_bench/backend/clients/__init__.py +85 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +1 -2
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +3 -4
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +112 -77
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +6 -7
- vectordb_bench/backend/clients/alloydb/alloydb.py +59 -84
- vectordb_bench/backend/clients/alloydb/cli.py +51 -34
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +13 -24
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +50 -54
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +39 -40
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +24 -26
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +67 -58
- vectordb_bench/backend/clients/milvus/cli.py +41 -83
- vectordb_bench/backend/clients/milvus/config.py +18 -8
- vectordb_bench/backend/clients/milvus/milvus.py +19 -39
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +56 -77
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +34 -43
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +98 -104
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +39 -49
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +15 -25
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +41 -35
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +95 -62
- vectordb_bench/backend/clients/test/cli.py +2 -3
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +5 -9
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +37 -26
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +18 -14
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +56 -23
- vectordb_bench/backend/runner/rate_runner.py +30 -19
- vectordb_bench/backend/runner/read_write_runner.py +46 -22
- vectordb_bench/backend/runner/serial_runner.py +81 -46
- vectordb_bench/backend/runner/util.py +4 -3
- vectordb_bench/backend/task_runner.py +92 -92
- vectordb_bench/backend/utils.py +17 -10
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +51 -84
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +45 -24
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/METADATA +22 -15
- vectordb_bench-0.0.21.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.19.dist-info/RECORD +0 -135
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.19.dist-info → vectordb_bench-0.0.21.dist-info}/top_level.txt +0 -0
--- vectordb_bench/backend/data_source.py (0.0.19)
+++ vectordb_bench/backend/data_source.py (0.0.21)
@@ -1,12 +1,12 @@
 import logging
 import pathlib
 import typing
+from abc import ABC, abstractmethod
 from enum import Enum
+
 from tqdm import tqdm
-import os
-from abc import ABC, abstractmethod
 
-from
+from vectordb_bench import config
 
 logging.getLogger("s3fs").setLevel(logging.CRITICAL)
 
@@ -14,6 +14,7 @@ log = logging.getLogger(__name__)
 
 DatasetReader = typing.TypeVar("DatasetReader")
 
+
 class DatasetSource(Enum):
     S3 = "S3"
     AliyunOSS = "AliyunOSS"
@@ -25,6 +26,8 @@ class DatasetSource(Enum):
         if self == DatasetSource.AliyunOSS:
             return AliyunOSSReader()
 
+        return None
+
 
 class DatasetReader(ABC):
     source: DatasetSource
@@ -39,7 +42,6 @@ class DatasetReader(ABC):
             files(list[str]): all filenames of the dataset
             local_ds_root(pathlib.Path): whether to write the remote data.
         """
-        pass
 
     @abstractmethod
     def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
@@ -52,13 +54,14 @@ class AliyunOSSReader(DatasetReader):
 
     def __init__(self):
        import oss2
+
        self.bucket = oss2.Bucket(oss2.AnonymousAuth(), self.remote_root, "benchmark", True)
 
    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
        info = self.bucket.get_object_meta(remote.as_posix())
 
        # check size equal
-       remote_size, local_size = info.content_length,
+       remote_size, local_size = info.content_length, local.stat().st_size
        if remote_size != local_size:
            log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
            return False
@@ -70,7 +73,13 @@ class AliyunOSSReader(DatasetReader):
        if not local_ds_root.exists():
            log.info(f"local dataset root path not exist, creating it: {local_ds_root}")
            local_ds_root.mkdir(parents=True)
-           downloads = [
+           downloads = [
+               (
+                   pathlib.PurePosixPath("benchmark", dataset, f),
+                   local_ds_root.joinpath(f),
+               )
+               for f in files
+           ]
 
        else:
            for file in files:
@@ -92,17 +101,14 @@ class AliyunOSSReader(DatasetReader):
        log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")
 
 
-
class AwsS3Reader(DatasetReader):
    source: DatasetSource = DatasetSource.S3
    remote_root: str = config.AWS_S3_URL
 
    def __init__(self):
        import s3fs
-
-
-           client_kwargs={'region_name': 'us-west-2'}
-       )
+
+       self.fs = s3fs.S3FileSystem(anon=True, client_kwargs={"region_name": "us-west-2"})
 
    def ls_all(self, dataset: str):
        dataset_root_dir = pathlib.Path(self.remote_root, dataset)
@@ -112,7 +118,6 @@ class AwsS3Reader(DatasetReader):
            log.info(n)
        return names
 
-
    def read(self, dataset: str, files: list[str], local_ds_root: pathlib.Path):
        downloads = []
        if not local_ds_root.exists():
@@ -139,13 +144,12 @@ class AwsS3Reader(DatasetReader):
 
        log.info(f"Succeed to download all files, downloaded file count = {len(downloads)}")
 
-
    def validate_file(self, remote: pathlib.Path, local: pathlib.Path) -> bool:
        # info() uses ls() inside, maybe we only need to ls once
        info = self.fs.info(remote)
 
        # check size equal
-       remote_size, local_size = info.get("size"),
+       remote_size, local_size = info.get("size"), local.stat().st_size
        if remote_size != local_size:
            log.info(f"local file: {local} size[{local_size}] not match with remote size[{remote_size}]")
            return False
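
For orientation, a minimal sketch of how the reworked data-source API above is typically driven. Only `DatasetSource`, `reader()`, and `read(dataset=..., files=..., local_ds_root=...)` come from the hunks; the dataset and file names are made-up placeholders, and the assumption that `DatasetSource.S3` resolves to `AwsS3Reader` is inferred from context rather than shown in the diff.

```python
import pathlib

from vectordb_bench.backend.data_source import DatasetSource

source = DatasetSource.S3
reader = source.reader()  # AliyunOSS -> AliyunOSSReader(); unrecognized sources now return None
if reader is not None:
    reader.read(
        dataset="cohere_medium_1m",                  # placeholder dataset directory name
        files=["test.parquet", "train.parquet"],     # placeholder file list
        local_ds_root=pathlib.Path("/tmp/vectordb_bench/dataset/cohere/cohere_medium_1m"),
    )
```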
--- vectordb_bench/backend/dataset.py (0.0.19)
+++ vectordb_bench/backend/dataset.py (0.0.21)
@@ -4,25 +4,30 @@ Usage:
     >>> Dataset.Cohere.get(100_000)
 """
 
-from collections import namedtuple
 import logging
 import pathlib
+import typing
 from enum import Enum
+
 import pandas as pd
-from pydantic import validator, PrivateAttr
 import polars as pl
 from pyarrow.parquet import ParquetFile
+from pydantic import PrivateAttr, validator
+
+from vectordb_bench import config
+from vectordb_bench.base import BaseModel
 
-from ..base import BaseModel
-from .. import config
-from ..backend.clients import MetricType
 from . import utils
-from .
+from .clients import MetricType
+from .data_source import DatasetReader, DatasetSource
 
 log = logging.getLogger(__name__)
 
 
-SizeLabel
+class SizeLabel(typing.NamedTuple):
+    size: int
+    label: str
+    file_count: int
 
 
 class BaseDataset(BaseModel):
@@ -33,12 +38,13 @@ class BaseDataset(BaseModel):
     use_shuffled: bool
     with_gt: bool = False
     _size_label: dict[int, SizeLabel] = PrivateAttr()
-
+    is_custom: bool = False
 
     @validator("size")
-    def verify_size(cls, v):
+    def verify_size(cls, v: int):
         if v not in cls._size_label:
-
+            msg = f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}"
+            raise ValueError(msg)
         return v
 
     @property
@@ -53,13 +59,14 @@ class BaseDataset(BaseModel):
     def file_count(self) -> int:
         return self._size_label.get(self.size).file_count
 
+
 class CustomDataset(BaseDataset):
     dir: str
     file_num: int
-
+    is_custom: bool = True
 
     @validator("size")
-    def verify_size(cls, v):
+    def verify_size(cls, v: int):
         return v
 
     @property
@@ -102,7 +109,7 @@ class Cohere(BaseDataset):
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool = True,
+    with_gt: bool = (True,)
     _size_label: dict = {
         100_000: SizeLabel(100_000, "SMALL", 1),
         1_000_000: SizeLabel(1_000_000, "MEDIUM", 1),
@@ -124,7 +131,11 @@ class SIFT(BaseDataset):
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
     _size_label: dict = {
-        500_000: SizeLabel(
+        500_000: SizeLabel(
+            500_000,
+            "SMALL",
+            1,
+        ),
         5_000_000: SizeLabel(5_000_000, "MEDIUM", 1),
         # 50_000_000: SizeLabel(50_000_000, "LARGE", 50),
     }
@@ -135,7 +146,7 @@ class OpenAI(BaseDataset):
     dim: int = 1536
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
-    with_gt: bool = True,
+    with_gt: bool = (True,)
     _size_label: dict = {
         50_000: SizeLabel(50_000, "SMALL", 1),
         500_000: SizeLabel(500_000, "MEDIUM", 1),
@@ -153,13 +164,14 @@ class DatasetManager(BaseModel):
     >>> for data in cohere:
     >>>     print(data.columns)
     """
-
+
+    data: BaseDataset
     test_data: pd.DataFrame | None = None
     gt_data: pd.DataFrame | None = None
-    train_files
+    train_files: list[str] = []
     reader: DatasetReader | None = None
 
-    def __eq__(self, obj):
+    def __eq__(self, obj: any):
         if isinstance(obj, DatasetManager):
             return self.data.name == obj.data.name and self.data.label == obj.data.label
         return False
@@ -169,22 +181,27 @@ class DatasetManager(BaseModel):
 
     @property
     def data_dir(self) -> pathlib.Path:
-        """
+        """data local directory: config.DATASET_LOCAL_DIR/{dataset_name}/{dataset_dirname}
 
         Examples:
             >>> sift_s = Dataset.SIFT.manager(500_000)
             >>> sift_s.relative_path
             '/tmp/vectordb_bench/dataset/sift/sift_small_500k/'
        """
-        return pathlib.Path(
+        return pathlib.Path(
+            config.DATASET_LOCAL_DIR,
+            self.data.name.lower(),
+            self.data.dir_name.lower(),
+        )
 
     def __iter__(self):
         return DataSetIterator(self)
 
     # TODO passing use_shuffle from outside
-    def prepare(
-
-
+    def prepare(
+        self,
+        source: DatasetSource = DatasetSource.S3,
+        filters: float | str | None = None,
     ) -> bool:
         """Download the dataset from DatasetSource
            url = f"{source}/{self.data.dir_name}"
@@ -208,7 +225,7 @@ class DatasetManager(BaseModel):
         gt_file, test_file = utils.compose_gt_file(filters), "test.parquet"
         all_files.extend([gt_file, test_file])
 
-        if not self.data.
+        if not self.data.is_custom:
             source.reader().read(
                 dataset=self.data.dir_name.lower(),
                 files=all_files,
@@ -220,7 +237,7 @@ class DatasetManager(BaseModel):
             self.gt_data = self._read_file(gt_file)
 
         prefix = "shuffle_train" if use_shuffled else "train"
-        self.train_files = sorted([f.name for f in self.data_dir.glob(f
+        self.train_files = sorted([f.name for f in self.data_dir.glob(f"{prefix}*.parquet")])
         log.debug(f"{self.data.name}: available train files {self.train_files}")
 
         return True
@@ -241,7 +258,7 @@ class DataSetIterator:
         self._ds = dataset
         self._idx = 0  # file number
         self._cur = None
-        self._sub_idx = [0 for i in range(len(self._ds.train_files))]
+        self._sub_idx = [0 for i in range(len(self._ds.train_files))]  # iter num for each file
 
     def __iter__(self):
         return self
@@ -250,7 +267,9 @@ class DataSetIterator:
         p = pathlib.Path(self._ds.data_dir, file_name)
         log.info(f"Get iterator for {p.name}")
         if not p.exists():
-
+            msg = f"No such file: {p}"
+            log.warning(msg)
+            raise IndexError(msg)
         return ParquetFile(p, memory_map=True, pre_buffer=True).iter_batches(config.NUM_PER_BATCH)
 
     def __next__(self) -> pd.DataFrame:
@@ -281,6 +300,7 @@ class Dataset(Enum):
     >>> Dataset.COHERE.manager(100_000)
     >>> Dataset.COHERE.get(100_000)
     """
+
     LAION = LAION
     GIST = GIST
     COHERE = Cohere
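
A hedged sketch of the dataset-preparation flow after the dataset.py changes, built only from the signatures and docstrings visible above (`Dataset.COHERE.manager(100_000)`, `prepare(source=..., filters=...)`, and iteration via `DataSetIterator`); the printout is illustrative.

```python
from vectordb_bench.backend.data_source import DatasetSource
from vectordb_bench.backend.dataset import Dataset

cohere = Dataset.COHERE.manager(100_000)      # size is validated against Cohere's _size_label keys
if cohere.prepare(source=DatasetSource.S3):   # downloads test/gt/train parquet files when needed
    for batch in cohere:                      # DataSetIterator yields DataFrame batches of NUM_PER_BATCH rows
        print(batch.columns)
```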
--- vectordb_bench/backend/result_collector.py (0.0.19)
+++ vectordb_bench/backend/result_collector.py (0.0.21)
@@ -1,7 +1,7 @@
+import logging
 import pathlib
-from ..models import TestResult
 
-import
+from vectordb_bench.models import TestResult
 
 log = logging.getLogger(__name__)
 
@@ -14,7 +14,6 @@ class ResultCollector:
         if not result_dir.exists() or len(list(result_dir.rglob(reg))) == 0:
             return []
 
-
         for json_file in result_dir.rglob(reg):
             file_result = TestResult.read_file(json_file, trans_unit=True)
 
--- vectordb_bench/backend/runner/__init__.py (0.0.19)
+++ vectordb_bench/backend/runner/__init__.py (0.0.21)
@@ -1,12 +1,10 @@
 from .mp_runner import (
     MultiProcessingSearchRunner,
 )
-
-from .serial_runner import SerialSearchRunner, SerialInsertRunner
-
+from .serial_runner import SerialInsertRunner, SerialSearchRunner
 
 __all__ = [
-
-
-
+    "MultiProcessingSearchRunner",
+    "SerialInsertRunner",
+    "SerialSearchRunner",
 ]
--- vectordb_bench/backend/runner/mp_runner.py (0.0.19)
+++ vectordb_bench/backend/runner/mp_runner.py (0.0.21)
@@ -1,27 +1,29 @@
-import time
-import traceback
 import concurrent
+import logging
 import multiprocessing as mp
 import random
-import
-
+import time
+import traceback
+from collections.abc import Iterable
+
 import numpy as np
-from ..clients import api
-from ... import config
 
+from ... import config
+from ..clients import api
 
 NUM_PER_BATCH = config.NUM_PER_BATCH
 log = logging.getLogger(__name__)
 
 
 class MultiProcessingSearchRunner:
-    """
+    """multiprocessing search runner
 
     Args:
         k(int): search topk, default to 100
         concurrency(Iterable): concurrencies, default [1, 5, 10, 15, 20, 25, 30, 35]
         duration(int): duration for each concurency, default to 30s
     """
+
     def __init__(
         self,
         db: api.VectorDB,
@@ -40,7 +42,12 @@ class MultiProcessingSearchRunner:
         self.test_data = test_data
         log.debug(f"test dataset columns: {len(test_data)}")
 
-    def search(
+    def search(
+        self,
+        test_data: list[list[float]],
+        q: mp.Queue,
+        cond: mp.Condition,
+    ) -> tuple[int, float]:
         # sync all process
         q.put(1)
         with cond:
@@ -71,13 +78,16 @@ class MultiProcessingSearchRunner:
                 idx = idx + 1 if idx < num - 1 else 0
 
                 if count % 500 == 0:
-                    log.debug(
+                    log.debug(
+                        f"({mp.current_process().name:16}) "
+                        f"search_count: {count}, latest_latency={time.perf_counter()-s}"
+                    )
 
         total_dur = round(time.perf_counter() - start_time, 4)
         log.info(
             f"{mp.current_process().name:16} search {self.duration}s: "
             f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}"
-
+        )
 
         return (count, total_dur, latencies)
 
@@ -87,8 +97,6 @@ class MultiProcessingSearchRunner:
         log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
         return mp.get_context(mp_start_method)
 
-
-
     def _run_all_concurrencies_mem_efficient(self):
         max_qps = 0
         conc_num_list = []
@@ -99,7 +107,10 @@ class MultiProcessingSearchRunner:
             for conc in self.concurrencies:
                 with mp.Manager() as m:
                     q, cond = m.Queue(), m.Condition()
-                    with concurrent.futures.ProcessPoolExecutor(
+                    with concurrent.futures.ProcessPoolExecutor(
+                        mp_context=self.get_mp_context(),
+                        max_workers=conc,
+                    ) as executor:
                         log.info(f"Start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
                         future_iter = [executor.submit(self.search, self.test_data, q, cond) for i in range(conc)]
                         # Sync all processes
@@ -129,7 +140,9 @@ class MultiProcessingSearchRunner:
                     max_qps = qps
                     log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
         except Exception as e:
-            log.warning(
+            log.warning(
+                f"Fail to search, concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}"
+            )
             traceback.print_exc()
 
             # No results available, raise exception
@@ -139,7 +152,13 @@ class MultiProcessingSearchRunner:
         finally:
             self.stop()
 
-        return
+        return (
+            max_qps,
+            conc_num_list,
+            conc_qps_list,
+            conc_latency_p99_list,
+            conc_latency_avg_list,
+        )
 
     def run(self) -> float:
         """
@@ -160,9 +179,14 @@ class MultiProcessingSearchRunner:
             for conc in self.concurrencies:
                 with mp.Manager() as m:
                     q, cond = m.Queue(), m.Condition()
-                    with concurrent.futures.ProcessPoolExecutor(
+                    with concurrent.futures.ProcessPoolExecutor(
+                        mp_context=self.get_mp_context(),
+                        max_workers=conc,
+                    ) as executor:
                         log.info(f"Start search_by_dur {duration}s in concurrency {conc}, filters: {self.filters}")
-                        future_iter = [
+                        future_iter = [
+                            executor.submit(self.search_by_dur, duration, self.test_data, q, cond) for i in range(conc)
+                        ]
                         # Sync all processes
                         while q.qsize() < conc:
                             sleep_t = conc if conc < 10 else 10
@@ -183,7 +207,9 @@ class MultiProcessingSearchRunner:
                     max_qps = qps
                     log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
         except Exception as e:
-            log.warning(
+            log.warning(
+                f"Fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}",
+            )
             traceback.print_exc()
 
             # No results available, raise exception
@@ -195,8 +221,13 @@ class MultiProcessingSearchRunner:
 
         return max_qps
 
-
-
+    def search_by_dur(
+        self,
+        dur: int,
+        test_data: list[list[float]],
+        q: mp.Queue,
+        cond: mp.Condition,
+    ) -> int:
         # sync all process
         q.put(1)
         with cond:
@@ -225,13 +256,15 @@ class MultiProcessingSearchRunner:
                 idx = idx + 1 if idx < num - 1 else 0
 
                 if count % 500 == 0:
-                    log.debug(
+                    log.debug(
+                        f"({mp.current_process().name:16}) search_count: {count}, "
+                        f"latest_latency={time.perf_counter()-s}"
+                    )
 
         total_dur = round(time.perf_counter() - start_time, 4)
         log.debug(
             f"{mp.current_process().name:16} search {self.duration}s: "
             f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}"
-
+        )
 
         return count
-
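
The mp_runner.py hunks reshape how MultiProcessingSearchRunner fans out worker processes: each worker registers on a shared `Queue` and blocks on a `Condition` until the parent releases all of them at once. Below is a self-contained sketch of that start-barrier pattern, not the vendored class itself: the worker body is a stand-in for the real search loop, and putting into the queue while holding the lock is a small tightening of the pattern rather than a copy of the benchmark's code.

```python
import concurrent.futures
import multiprocessing as mp
import time


def worker(q, cond) -> int:
    """Stand-in worker: check in, wait for the start signal, then spin for ~1s."""
    with cond:
        q.put(1)      # announce readiness while holding the lock so the notify cannot be missed
        cond.wait()   # block until the parent calls notify_all()
    start = time.perf_counter()
    count = 0
    while time.perf_counter() - start < 1.0:
        count += 1    # stand-in for issuing one search request
    return count


if __name__ == "__main__":
    conc = 4
    with mp.Manager() as m:
        q, cond = m.Queue(), m.Condition()
        with concurrent.futures.ProcessPoolExecutor(
            mp_context=mp.get_context("spawn"),
            max_workers=conc,
        ) as executor:
            futures = [executor.submit(worker, q, cond) for _ in range(conc)]
            while q.qsize() < conc:   # wait until every worker process has checked in
                time.sleep(0.1)
            with cond:
                cond.notify_all()     # release all workers simultaneously
            total = sum(f.result() for f in futures)
    print(f"iterations across {conc} processes: {total}")
```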
--- vectordb_bench/backend/runner/rate_runner.py (0.0.19)
+++ vectordb_bench/backend/runner/rate_runner.py (0.0.21)
@@ -1,36 +1,36 @@
+import concurrent
 import logging
+import multiprocessing as mp
 import time
-import concurrent
 from concurrent.futures import ThreadPoolExecutor
-import multiprocessing as mp
-
 
+from vectordb_bench import config
 from vectordb_bench.backend.clients import api
 from vectordb_bench.backend.dataset import DataSetIterator
 from vectordb_bench.backend.utils import time_it
-from vectordb_bench import config
 
 from .util import get_data
+
 log = logging.getLogger(__name__)
 
 
 class RatedMultiThreadingInsertRunner:
     def __init__(
         self,
-        rate: int,
+        rate: int,  # numRows per second
         db: api.VectorDB,
         dataset_iter: DataSetIterator,
         normalize: bool = False,
         timeout: float | None = None,
     ):
-        self.timeout = timeout if isinstance(timeout,
+        self.timeout = timeout if isinstance(timeout, int | float) else None
         self.dataset = dataset_iter
         self.db = db
         self.normalize = normalize
         self.insert_rate = rate
         self.batch_rate = rate // config.NUM_PER_BATCH
 
-    def send_insert_task(self, db, emb: list[list[float]], metadata: list[str]):
+    def send_insert_task(self, db: api.VectorDB, emb: list[list[float]], metadata: list[str]):
         db.insert_embeddings(emb, metadata)
 
     @time_it
@@ -43,7 +43,9 @@ class RatedMultiThreadingInsertRunner:
             rate = self.batch_rate
             for data in self.dataset:
                 emb, metadata = get_data(data, self.normalize)
-                executing_futures.append(
+                executing_futures.append(
+                    executor.submit(self.send_insert_task, self.db, emb, metadata),
+                )
                 rate -= 1
 
                 if rate == 0:
@@ -66,19 +68,26 @@ class RatedMultiThreadingInsertRunner:
                         done, not_done = concurrent.futures.wait(
                             executing_futures,
                             timeout=wait_interval,
-                            return_when=concurrent.futures.FIRST_EXCEPTION
+                            return_when=concurrent.futures.FIRST_EXCEPTION,
+                        )
 
                         if len(not_done) > 0:
-                            log.warning(
+                            log.warning(
+                                f"Failed to finish all tasks in 1s, [{len(not_done)}/{len(executing_futures)}] "
+                                f"tasks are not done, waited={wait_interval:.2f}, trying to wait in the next round"
+                            )
                             executing_futures = list(not_done)
                         else:
-                            log.debug(
+                            log.debug(
+                                f"Finished {len(executing_futures)} insert-{config.NUM_PER_BATCH} "
+                                f"task in 1s, wait_interval={wait_interval:.2f}"
+                            )
                             executing_futures = []
                     except Exception as e:
-
-
-
-
+                        log.warning(f"task error, terminating, err={e}")
+                        q.put(None, block=True)
+                        executor.shutdown(wait=True, cancel_futures=True)
+                        raise e from e
 
                 dur = time.perf_counter() - start_time
                 if dur < 1:
@@ -87,10 +96,12 @@ class RatedMultiThreadingInsertRunner:
             # wait for all tasks in executing_futures to complete
             if len(executing_futures) > 0:
                 try:
-                    done, _ = concurrent.futures.wait(
-
+                    done, _ = concurrent.futures.wait(
+                        executing_futures,
+                        return_when=concurrent.futures.FIRST_EXCEPTION,
+                    )
                 except Exception as e:
-                    log.
+                    log.warning(f"task error, terminating, err={e}")
                     q.put(None, block=True)
                     executor.shutdown(wait=True, cancel_futures=True)
-                    raise e
+                    raise e from e