vectordb-bench 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,168 @@
+ from abc import abstractmethod
+ from typing import Any, Mapping, Optional, Sequence, TypedDict
+ from pydantic import BaseModel, SecretStr
+ from typing_extensions import LiteralString
+ from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
+
+ POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
+
+
+ class AlloyDBConfigDict(TypedDict):
+     """These keys are passed directly as kwargs to the psycopg connection,
+     so the names must match the psycopg API exactly."""
+
+     user: str
+     password: str
+     host: str
+     port: int
+     dbname: str
+
+
+ class AlloyDBConfig(DBConfig):
+     user_name: SecretStr = SecretStr("postgres")
+     password: SecretStr
+     host: str = "localhost"
+     port: int = 5432
+     db_name: str
+
+     def to_dict(self) -> AlloyDBConfigDict:
+         user_str = self.user_name.get_secret_value()
+         pwd_str = self.password.get_secret_value()
+         return {
+             "host": self.host,
+             "port": self.port,
+             "dbname": self.db_name,
+             "user": user_str,
+             "password": pwd_str,
+         }
+
+
+ class AlloyDBIndexParam(TypedDict):
+     metric: str
+     index_type: str
+     index_creation_with_options: Sequence[dict[str, Any]]
+     maintenance_work_mem: Optional[str]
+     max_parallel_workers: Optional[int]
+
+
+ class AlloyDBSearchParam(TypedDict):
+     metric_fun_op: LiteralString
+
+
+ class AlloyDBSessionCommands(TypedDict):
+     session_options: Sequence[dict[str, Any]]
+
+
+ class AlloyDBIndexConfig(BaseModel, DBCaseConfig):
+     metric_type: MetricType | None = None
+     create_index_before_load: bool = False
+     create_index_after_load: bool = True
+
+     def parse_metric(self) -> str:
+         if self.metric_type == MetricType.L2:
+             return "l2"
+         elif self.metric_type == MetricType.DP:
+             return "dot_product"
+         return "cosine"
+
+     def parse_metric_fun_op(self) -> LiteralString:
+         if self.metric_type == MetricType.L2:
+             return "<->"
+         elif self.metric_type == MetricType.IP:
+             return "<#>"
+         return "<=>"
+
+     @abstractmethod
+     def index_param(self) -> AlloyDBIndexParam:
+         ...
+
+     @abstractmethod
+     def search_param(self) -> AlloyDBSearchParam:
+         ...
+
+     @abstractmethod
+     def session_param(self) -> AlloyDBSessionCommands:
+         ...
+
+     @staticmethod
+     def _optionally_build_with_options(with_options: Mapping[str, Any]) -> Sequence[dict[str, Any]]:
+         """Walk through the mapping, building a list of {option_name, val} pairs used to render the index-creation WITH clause"""
+         options = []
+         for option_name, value in with_options.items():
+             if value is not None:
+                 options.append(
+                     {
+                         "option_name": option_name,
+                         "val": str(value),
+                     }
+                 )
+         return options
+
+     @staticmethod
+     def _optionally_build_set_options(
+         set_mapping: Mapping[str, Any]
+     ) -> Sequence[dict[str, Any]]:
+         """Walk through the mapping, building a list of SET key = 'value'; commands"""
+         session_options = []
+         for setting_name, value in set_mapping.items():
+             if value:
+                 session_options.append(
+                     {"parameter": {
+                         "setting_name": setting_name,
+                         "val": str(value),
+                     },
+                     }
+                 )
+         return session_options
+
+
+ class AlloyDBScaNNConfig(AlloyDBIndexConfig):
+     index: IndexType = IndexType.SCANN
+     num_leaves: int | None
+     quantizer: str | None
+     enable_pca: str | None
+     max_num_levels: int | None
+     num_leaves_to_search: int | None
+     max_top_neighbors_buffer_size: int | None
+     pre_reordering_num_neighbors: int | None
+     num_search_threads: int | None
+     max_num_prefetch_datasets: int | None
+     maintenance_work_mem: Optional[str] = None
+     max_parallel_workers: Optional[int] = None
+
+     def index_param(self) -> AlloyDBIndexParam:
+         index_parameters = {
+             "num_leaves": self.num_leaves, "max_num_levels": self.max_num_levels, "quantizer": self.quantizer,
+         }
+         return {
+             "metric": self.parse_metric(),
+             "index_type": self.index.value,
+             "index_creation_with_options": self._optionally_build_with_options(
+                 index_parameters
+             ),
+             "maintenance_work_mem": self.maintenance_work_mem,
+             "max_parallel_workers": self.max_parallel_workers,
+             "enable_pca": self.enable_pca,
+         }
+
+     def search_param(self) -> AlloyDBSearchParam:
+         return {
+             "metric_fun_op": self.parse_metric_fun_op(),
+         }
+
+     def session_param(self) -> AlloyDBSessionCommands:
+         session_parameters = {
+             "scann.num_leaves_to_search": self.num_leaves_to_search,
+             "scann.max_top_neighbors_buffer_size": self.max_top_neighbors_buffer_size,
+             "scann.pre_reordering_num_neighbors": self.pre_reordering_num_neighbors,
+             "scann.num_search_threads": self.num_search_threads,
+             "scann.max_num_prefetch_datasets": self.max_num_prefetch_datasets,
+         }
+         return {
+             "session_options": self._optionally_build_set_options(session_parameters)
+         }
+
+
+ _alloydb_case_config = {
+     IndexType.SCANN: AlloyDBScaNNConfig,
+ }
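The docstring above promises that `AlloyDBConfigDict` keys match the psycopg connection API exactly. A minimal sketch of what that buys (illustrative values; assumes the classes above are importable and psycopg 3 is installed):

```python
from pydantic import SecretStr
import psycopg  # psycopg 3

cfg = AlloyDBConfig(password=SecretStr("example-password"), db_name="vectordb")
# Because to_dict() uses psycopg's own parameter names (user, password,
# host, port, dbname), the result can be splatted straight into connect():
with psycopg.connect(**cfg.to_dict()) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT version()")
        print(cur.fetchone())
```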
@@ -10,6 +10,7 @@ class MetricType(str, Enum):
      L2 = "L2"
      COSINE = "COSINE"
      IP = "IP"
+     DP = "DP"
      HAMMING = "HAMMING"
      JACCARD = "JACCARD"

@@ -27,6 +28,7 @@ class IndexType(str, Enum):
      GPU_IVF_FLAT = "GPU_IVF_FLAT"
      GPU_IVF_PQ = "GPU_IVF_PQ"
      GPU_CAGRA = "GPU_CAGRA"
+     SCANN = "scann"


  class DBConfig(ABC, BaseModel):
@@ -66,7 +66,8 @@ class Milvus(VectorDB):
                  self.case_config.index_param(),
                  index_name=self._index_name,
              )
-             # self._pre_load(coll)
+             if kwargs.get("pre_load") is True:
+                 self._pre_load(col)

          connections.disconnect("default")

@@ -57,11 +57,11 @@ class CustomDataset(BaseDataset):
      dir: str
      file_num: int
      isCustom: bool = True
-
+
      @validator("size")
      def verify_size(cls, v):
          return v
-
+
      @property
      def label(self) -> str:
          return "Custom"
@@ -73,7 +73,8 @@ class CustomDataset(BaseDataset):
      @property
      def file_count(self) -> int:
          return self.file_num
-
+
+
  class LAION(BaseDataset):
      name: str = "LAION"
      dim: int = 768
@@ -242,13 +243,15 @@ class DataSetIterator:
          self._cur = None
          self._sub_idx = [0 for i in range(len(self._ds.train_files))]  # iter num for each file

+     def __iter__(self):
+         return self
+
      def _get_iter(self, file_name: str):
          p = pathlib.Path(self._ds.data_dir, file_name)
          log.info(f"Get iterator for {p.name}")
          if not p.exists():
              raise IndexError(f"No such file {p}")
-             log.warning(f"No such file: {p}")
-         return ParquetFile(p).iter_batches(config.NUM_PER_BATCH)
+         return ParquetFile(p, memory_map=True, pre_buffer=True).iter_batches(config.NUM_PER_BATCH)

      def __next__(self) -> pd.DataFrame:
          """return the data in the next file of the training list"""
@@ -64,7 +64,7 @@ class MultiProcessingSearchRunner:
                  log.warning(f"VectorDB search_embedding error: {e}")
                  traceback.print_exc(chain=True)
                  raise e from None
-
+
              latencies.append(time.perf_counter() - s)
              count += 1
              # loop through the test data
@@ -87,11 +87,14 @@ class MultiProcessingSearchRunner:
          log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
          return mp.get_context(mp_start_method)

-     def _run_all_concurrencies_mem_efficient(self) -> float:
+
+
+     def _run_all_concurrencies_mem_efficient(self):
          max_qps = 0
          conc_num_list = []
          conc_qps_list = []
          conc_latency_p99_list = []
+         conc_latency_avg_list = []
          try:
              for conc in self.concurrencies:
                  with mp.Manager() as m:
@@ -111,13 +114,15 @@ class MultiProcessingSearchRunner:
                          start = time.perf_counter()
                          all_count = sum([r.result()[0] for r in future_iter])
                          latencies = sum([r.result()[2] for r in future_iter], start=[])
-                         latency_p99 = np.percentile(latencies, 0.99)
+                         latency_p99 = np.percentile(latencies, 99)
+                         latency_avg = np.mean(latencies)
                          cost = time.perf_counter() - start

                          qps = round(all_count / cost, 4)
                          conc_num_list.append(conc)
                          conc_qps_list.append(qps)
                          conc_latency_p99_list.append(latency_p99)
+                         conc_latency_avg_list.append(latency_avg)
                          log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")

                          if qps > max_qps:
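The percentile change is a real bug fix, not a refactor: `np.percentile` takes `q` on a 0-100 scale, so the old call computed the 0.99th percentile (close to the fastest queries) where the 99th percentile tail latency was intended. A quick illustration with synthetic latencies:

```python
import numpy as np

latencies = np.linspace(0.001, 1.0, 10_000)  # synthetic latencies, 1 ms .. 1 s
print(np.percentile(latencies, 0.99))  # ~0.011 -- near the minimum (old code)
print(np.percentile(latencies, 99))    # ~0.990 -- the intended p99 (new code)
```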
@@ -134,7 +139,7 @@ class MultiProcessingSearchRunner:
          finally:
              self.stop()

-         return max_qps, conc_num_list, conc_qps_list, conc_latency_p99_list
+         return max_qps, conc_num_list, conc_qps_list, conc_latency_p99_list, conc_latency_avg_list

      def run(self) -> float:
          """
@@ -145,3 +150,88 @@ class MultiProcessingSearchRunner:

      def stop(self) -> None:
          pass
+
+     def run_by_dur(self, duration: int) -> float:
+         return self._run_by_dur(duration)
+
+     def _run_by_dur(self, duration: int) -> float:
+         max_qps = 0
+         try:
+             for conc in self.concurrencies:
+                 with mp.Manager() as m:
+                     q, cond = m.Queue(), m.Condition()
+                     with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
+                         log.info(f"Start search_by_dur {duration}s in concurrency {conc}, filters: {self.filters}")
+                         future_iter = [executor.submit(self.search_by_dur, duration, self.test_data, q, cond) for i in range(conc)]
+                         # Sync all processes
+                         while q.qsize() < conc:
+                             sleep_t = conc if conc < 10 else 10
+                             time.sleep(sleep_t)
+
+                         with cond:
+                             cond.notify_all()
+                             log.info(f"Synced all processes, starting concurrency search, concurrency={conc}")
+
+                         start = time.perf_counter()
+                         all_count = sum([r.result() for r in future_iter])
+                         cost = time.perf_counter() - start
+
+                         qps = round(all_count / cost, 4)
+                         log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
+
+                         if qps > max_qps:
+                             max_qps = qps
+                             log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
+         except Exception as e:
+             log.warning(f"Failed to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
+             traceback.print_exc()
+
+             # No results available, raise exception
+             if max_qps == 0.0:
+                 raise e from None
+
+         finally:
+             self.stop()
+
+         return max_qps
+
+
+     def search_by_dur(self, dur: int, test_data: list[list[float]], q: mp.Queue, cond: mp.Condition) -> int:
+         # sync all processes
+         q.put(1)
+         with cond:
+             cond.wait()
+
+         with self.db.init():
+             num, idx = len(test_data), random.randint(0, len(test_data) - 1)
+
+             start_time = time.perf_counter()
+             count = 0
+             while time.perf_counter() < start_time + dur:
+                 s = time.perf_counter()
+                 try:
+                     self.db.search_embedding(
+                         test_data[idx],
+                         self.k,
+                         self.filters,
+                     )
+                 except Exception as e:
+                     log.warning(f"VectorDB search_embedding error: {e}")
+                     traceback.print_exc(chain=True)
+                     raise e from None
+
+                 count += 1
+                 # loop through the test data
+                 idx = idx + 1 if idx < num - 1 else 0
+
+                 if count % 500 == 0:
+                     log.debug(f"({mp.current_process().name:16}) search_count: {count}, latest_latency={time.perf_counter()-s}")
+
+         total_dur = round(time.perf_counter() - start_time, 4)
+         log.debug(
+             f"{mp.current_process().name:16} search {self.duration}s: "
+             f"actual_dur={total_dur}s, count={count}, qps in this process: {round(count / total_dur, 4):3}"
+         )
+
+         return count
+
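`_run_by_dur` and `search_by_dur` coordinate worker start-up with a Manager queue plus a shared Condition: every worker checks in on the queue, blocks on the condition, and the parent releases them all at once so the timed window starts together. A standalone sketch of the same idiom (hypothetical `worker` function):

```python
import multiprocessing as mp
import time
from concurrent.futures import ProcessPoolExecutor


def worker(q, cond) -> float:
    q.put(1)         # check in so the parent knows this process is ready
    with cond:
        cond.wait()  # block until the parent calls notify_all()
    return time.perf_counter()  # all workers resume within a tight window


if __name__ == "__main__":
    conc = 4
    with mp.Manager() as m:
        q, cond = m.Queue(), m.Condition()
        with ProcessPoolExecutor(max_workers=conc) as executor:
            futures = [executor.submit(worker, q, cond) for _ in range(conc)]
            while q.qsize() < conc:  # wait for every worker to check in
                time.sleep(0.1)
            with cond:
                cond.notify_all()    # release the barrier
            starts = [f.result() for f in futures]
            print(f"start spread: {max(starts) - min(starts):.4f}s")
```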
@@ -0,0 +1,79 @@
+ import logging
+ import time
+ from concurrent.futures import ThreadPoolExecutor
+ import multiprocessing as mp
+
+
+ from vectordb_bench.backend.clients import api
+ from vectordb_bench.backend.dataset import DataSetIterator
+ from vectordb_bench.backend.utils import time_it
+ from vectordb_bench import config
+
+ from .util import get_data, is_futures_completed, get_future_exceptions
+ log = logging.getLogger(__name__)
+
+
+ class RatedMultiThreadingInsertRunner:
+     def __init__(
+         self,
+         rate: int,  # numRows per second
+         db: api.VectorDB,
+         dataset_iter: DataSetIterator,
+         normalize: bool = False,
+         timeout: float | None = None,
+     ):
+         self.timeout = timeout if isinstance(timeout, (int, float)) else None
+         self.dataset = dataset_iter
+         self.db = db
+         self.normalize = normalize
+         self.insert_rate = rate
+         self.batch_rate = rate // config.NUM_PER_BATCH
+
+     def send_insert_task(self, db, emb: list[list[float]], metadata: list[str]):
+         db.insert_embeddings(emb, metadata)
+
+     @time_it
+     def run_with_rate(self, q: mp.Queue):
+         with ThreadPoolExecutor(max_workers=mp.cpu_count()) as executor:
+             executing_futures = []
+
+             @time_it
+             def submit_by_rate() -> bool:
+                 rate = self.batch_rate
+                 for data in self.dataset:
+                     emb, metadata = get_data(data, self.normalize)
+                     executing_futures.append(executor.submit(self.send_insert_task, self.db, emb, metadata))
+                     rate -= 1
+
+                     if rate == 0:
+                         return False
+                 return rate == self.batch_rate
+
+             with self.db.init():
+                 while True:
+                     start_time = time.perf_counter()
+                     finished, elapsed_time = submit_by_rate()
+                     if finished is True:
+                         q.put(None, block=True)
+                         log.info(f"End of dataset, left unfinished={len(executing_futures)}")
+                         return
+
+                     q.put(True, block=False)
+                     wait_interval = 1 - elapsed_time if elapsed_time < 1 else 0.001
+
+                     e, completed = is_futures_completed(executing_futures, wait_interval)
+                     if completed is True:
+                         ex = get_future_exceptions(executing_futures)
+                         if ex is not None:
+                             log.warning(f"task error, terminating, err={ex}")
+                             q.put(None)
+                             executor.shutdown(wait=True, cancel_futures=True)
+                             raise ex
+                         else:
+                             log.debug(f"Finished {len(executing_futures)} insert-{config.NUM_PER_BATCH} task in 1s, wait_interval={wait_interval:.2f}")
+                             executing_futures = []
+                     else:
+                         log.warning(f"Failed to finish tasks in 1s, {e}, waited={wait_interval:.2f}, try to check the next round")
+                     dur = time.perf_counter() - start_time
+                     if dur < 1:
+                         time.sleep(1 - dur)
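The pacing logic in `run_with_rate` amounts to: submit `batch_rate` batches, then sleep whatever remains of the one-second window. A simplified sketch of that loop (hypothetical `submit_one` callable standing in for the executor submission):

```python
import time
from typing import Callable


def paced_submit(submit_one: Callable[[], None], total: int, batch_rate: int) -> None:
    sent = 0
    while sent < total:
        start = time.perf_counter()
        for _ in range(min(batch_rate, total - sent)):
            submit_one()  # one insert batch
            sent += 1
        elapsed = time.perf_counter() - start
        if elapsed < 1:
            time.sleep(1 - elapsed)  # pad the window out to a full second
```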
@@ -0,0 +1,112 @@
+ import logging
+ from typing import Iterable
+ import multiprocessing as mp
+ import concurrent
+ import numpy as np
+ import math
+
+ from .mp_runner import MultiProcessingSearchRunner
+ from .serial_runner import SerialSearchRunner
+ from .rate_runner import RatedMultiThreadingInsertRunner
+ from vectordb_bench.backend.clients import api
+ from vectordb_bench.backend.dataset import DatasetManager
+
+ log = logging.getLogger(__name__)
+
+
+ class ReadWriteRunner(MultiProcessingSearchRunner, RatedMultiThreadingInsertRunner):
+     def __init__(
+         self,
+         db: api.VectorDB,
+         dataset: DatasetManager,
+         insert_rate: int = 1000,
+         normalize: bool = False,
+         k: int = 100,
+         filters: dict | None = None,
+         concurrencies: Iterable[int] = (1, 15, 50),
+         search_stage: Iterable[float] = (0.5, 0.6, 0.7, 0.8, 0.9, 1.0),  # search at any insert portion; 0.0 means search from the start
+         read_dur_after_write: int = 300,  # seconds, search duration after insertion is done
+         timeout: float | None = None,
+     ):
+         self.insert_rate = insert_rate
+         self.data_volume = dataset.data.size
+
+         for stage in search_stage:
+             assert 0.0 <= stage <= 1.0, "each search stage should be in [0.0, 1.0]"
+         self.search_stage = sorted(search_stage)
+         self.read_dur_after_write = read_dur_after_write
+
+         log.info(f"Init runner, concurrencies={concurrencies}, search_stage={search_stage}, stage_search_dur={read_dur_after_write}")
+
+         test_emb = np.stack(dataset.test_data["emb"])
+         if normalize:
+             test_emb = test_emb / np.linalg.norm(test_emb, axis=1)[:, np.newaxis]
+         test_emb = test_emb.tolist()
+
+         MultiProcessingSearchRunner.__init__(
+             self,
+             db=db,
+             test_data=test_emb,
+             k=k,
+             filters=filters,
+             concurrencies=concurrencies,
+         )
+         RatedMultiThreadingInsertRunner.__init__(
+             self,
+             rate=insert_rate,
+             db=db,
+             dataset_iter=iter(dataset),
+             normalize=normalize,
+         )
+         self.serial_search_runner = SerialSearchRunner(
+             db=db,
+             test_data=test_emb,
+             ground_truth=dataset.gt_data,
+             k=k,
+         )
+
+     def run_read_write(self):
+         futures = []
+         with mp.Manager() as m:
+             q = m.Queue()
+             with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context("spawn"), max_workers=2) as executor:
+                 futures.append(executor.submit(self.run_with_rate, q))
+                 futures.append(executor.submit(self.run_search_by_sig, q))
+
+                 for future in concurrent.futures.as_completed(futures):
+                     res = future.result()
+                     log.info(f"Result = {res}")
+
+         log.info("Concurrent read write all done")
+
+
+     def run_search_by_sig(self, q):
+         res = []
+         total_batch = math.ceil(self.data_volume / self.insert_rate)
+         batch = 0
+         recall = 'x'
+
+         for idx, stage in enumerate(self.search_stage):
+             target_batch = int(total_batch * stage)
+             while q.get(block=True):
+                 batch += 1
+                 if batch >= target_batch:
+                     perc = int(stage * 100)
+                     log.info(f"Insert {perc}% done, total batch={total_batch}")
+                     log.info(f"[{batch}/{total_batch}] Serial search - {perc}% start")
+                     recall, ndcg, p99 = self.serial_search_runner.run()
+
+                     if idx < len(self.search_stage) - 1:
+                         stage_search_dur = (self.data_volume * (self.search_stage[idx + 1] - stage) // self.insert_rate) // len(self.concurrencies)
+                         if stage_search_dur < 30:
+                             log.warning(f"Search duration too short, please reduce concurrency count or insert rate, or increase dataset volume: dur={stage_search_dur}, concurrencies={len(self.concurrencies)}, insert_rate={self.insert_rate}")
+                         log.info(f"[{batch}/{total_batch}] Conc search - {perc}% start, dur for each conc={stage_search_dur}s")
+                     else:
+                         last_search_dur = self.data_volume * (1.0 - stage) // self.insert_rate
+                         stage_search_dur = last_search_dur + self.read_dur_after_write
+                         log.info(f"[{batch}/{total_batch}] Last conc search - {perc}% start, [read_until_write|read_after_write|total]=[{last_search_dur}s|{self.read_dur_after_write}s|{stage_search_dur}s]")
+
+                     max_qps = self.run_by_dur(stage_search_dur)
+                     res.append((perc, max_qps, recall))
+                     break
+         return res
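The stage bookkeeping in `run_search_by_sig` is easiest to see with concrete numbers (illustrative values, not from the diff). `run_with_rate` signals the queue roughly once per second, so batch counts approximate elapsed seconds of insertion:

```python
import math

data_volume, insert_rate = 1_000_000, 1_000      # 1M rows at 1,000 rows/s
total_batch = math.ceil(data_volume / insert_rate)  # 1000 signals (~seconds)
target_batch = int(total_batch * 0.5)            # stage 0.5 fires at batch 500

# Between stages 0.5 and 0.6 with concurrencies (1, 15, 50), the ~100s
# window is split across the three concurrency levels:
stage_search_dur = (data_volume * (0.6 - 0.5) // insert_rate) // 3
print(stage_search_dur)  # ~33 seconds per concurrency level
```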
@@ -0,0 +1,32 @@
+ import logging
+ import concurrent.futures
+ from typing import Iterable
+
+ from pandas import DataFrame
+ import numpy as np
+
+ log = logging.getLogger(__name__)
+
+ def get_data(data_df: DataFrame, normalize: bool) -> tuple[list[list[float]], list[str]]:
+     all_metadata = data_df['id'].tolist()
+     emb_np = np.stack(data_df['emb'])
+     if normalize:
+         log.debug("normalize the 100k train data")
+         all_embeddings = (emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis]).tolist()
+     else:
+         all_embeddings = emb_np.tolist()
+     return all_embeddings, all_metadata
+
+ def is_futures_completed(futures: Iterable[concurrent.futures.Future], interval) -> tuple[Exception | None, bool]:
+     try:
+         list(concurrent.futures.as_completed(futures, timeout=interval))
+     except TimeoutError as e:
+         return e, False
+     return None, True
+
+
+ def get_future_exceptions(futures: Iterable[concurrent.futures.Future]) -> BaseException | None:
+     for f in futures:
+         if f.exception() is not None:
+             return f.exception()
+     return None
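A brief usage sketch of the two helpers above (hypothetical `task` function; assumes the helpers are importable from this module): `is_futures_completed` waits up to `interval` seconds for all futures to settle, and `get_future_exceptions` surfaces the first failure, if any.

```python
import time
from concurrent.futures import ThreadPoolExecutor


def task(delay: float) -> float:
    time.sleep(delay)
    return delay


with ThreadPoolExecutor() as executor:
    futures = [executor.submit(task, 0.1) for _ in range(4)]
    err, done = is_futures_completed(futures, interval=1.0)
    if done and get_future_exceptions(futures) is None:
        print([f.result() for f in futures])  # all succeeded within 1s
```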
@@ -150,7 +150,7 @@ class CaseRunner(BaseModel):
              )

              self._init_search_runner()
-
+
              m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = self._conc_search()
              m.recall, m.serial_latency_p99 = self._serial_search()
              '''
@@ -176,6 +176,9 @@ class CaseRunner(BaseModel):
                  or TaskStage.SEARCH_CONCURRENT in self.config.stages
              ):
                  self._init_search_runner()
+             if TaskStage.SEARCH_CONCURRENT in self.config.stages:
+                 search_results = self._conc_search()
+                 m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list, m.conc_latency_avg_list = search_results
              if TaskStage.SEARCH_SERIAL in self.config.stages:
                  search_results = self._serial_search()
                  '''
@@ -183,10 +186,7 @@ class CaseRunner(BaseModel):
                  m.serial_latencies = search_results.serial_latencies
                  '''
                  m.recall, m.ndcg, m.serial_latency_p99 = search_results
-             if TaskStage.SEARCH_CONCURRENT in self.config.stages:
-                 search_results = self._conc_search()
-                 m.qps, m.conc_num_list, m.conc_qps_list, m.conc_latency_p99_list = search_results
-
+
          except Exception as e:
              log.warning(f"Failed to run performance case, reason = {e}")
              traceback.print_exc()
@@ -9,6 +9,7 @@ from ..backend.clients.weaviate_cloud.cli import Weaviate
  from ..backend.clients.zilliz_cloud.cli import ZillizAutoIndex
  from ..backend.clients.milvus.cli import MilvusAutoIndex
  from ..backend.clients.aws_opensearch.cli import AWSOpenSearch
+ from ..backend.clients.alloydb.cli import AlloyDBScaNN

  from .cli import cli

@@ -24,6 +25,7 @@ cli.add_command(MilvusAutoIndex)
  cli.add_command(AWSOpenSearch)
  cli.add_command(PgVectorScaleDiskAnn)
  cli.add_command(PgDiskAnn)
+ cli.add_command(AlloyDBScaNN)


  if __name__ == "__main__":