PyPI - vectordb-bench - Versions diffs - 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl - Mend

vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

vectordb_bench/backend/dataset.py CHANGED Viewed

@@ -1,23 +1,20 @@
 """
 Usage:
-    >>> from xxx import dataset as ds
-    >>> gist_s = ds.get(ds.Name.GIST, ds.Label.SMALL)
-    >>> gist_s.dict()
-    dataset: {'data': {'name': 'GIST', 'dim': 128, 'metric_type': 'L2', 'label': 'SMALL', 'size': 50000000}, 'data_dir': 'xxx'}
+    >>> from xxx.dataset import Dataset
+    >>> Dataset.Cohere.get(100_000)
 """
 import os
 import logging
 import pathlib
-import math
 from hashlib import md5
-from enum import Enum, auto
-from typing import Any
+from enum import Enum
 import s3fs
 import pandas as pd
 from tqdm import tqdm
-from pydantic.dataclasses import dataclass
+from pydantic import validator, PrivateAttr
+import polars as pl
+from pyarrow.parquet import ParquetFile
 from ..base import BaseModel
 from .. import config
@@ -26,118 +23,83 @@ from . import utils
 log = logging.getLogger(__name__)
-@dataclass
-class LAION:
-    name: str = "LAION"
-    dim: int = 768
-    metric_type: MetricType = MetricType.COSINE
-    use_shuffled: bool = False
+class BaseDataset(BaseModel):
+    name: str
+    size: int
+    dim: int
+    metric_type: MetricType
+    use_shuffled: bool
+    _size_label: dict = PrivateAttr()
+    @validator("size")
+    def verify_size(cls, v):
+        if v not in cls._size_label:
+            raise ValueError(f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}")
+        return v
+    @property
+    def label(self) -> str:
+        return self._size_label.get(self.size)
     @property
     def dir_name(self) -> str:
         return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
-@dataclass
-class GIST:
+class LAION(BaseDataset):
+    name: str = "LAION"
+    dim: int = 768
+    metric_type: MetricType = MetricType.L2
+    use_shuffled: bool = False
+    _size_label: dict = {100_000_000: "LARGE"}
+class GIST(BaseDataset):
     name: str = "GIST"
     dim: int = 960
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+    }
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
-@dataclass
-class Cohere:
+class Cohere(BaseDataset):
     name: str = "Cohere"
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+        10_000_000: "LARGE",
+    }
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
-@dataclass
-class Glove:
+class Glove(BaseDataset):
     name: str = "Glove"
     dim: int = 200
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = False
+    _size_label: dict = {1_000_000: "MEDIUM"}
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
-@dataclass
-class SIFT:
+class SIFT(BaseDataset):
     name: str = "SIFT"
     dim: int = 128
-    metric_type: MetricType = MetricType.COSINE
+    metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
+        500_000: "SMALL",
+        5_000_000: "MEDIUM",
+        50_000_000: "LARGE",
+    }
-@dataclass
-class LAION_L(LAION):
-    label: str = "LARGE"
-    size: int  = 100_000_000
-@dataclass
-class GIST_S(GIST):
-    label: str = "SMALL"
-    size: int  = 100_000
-@dataclass
-class GIST_M(GIST):
-    label: str = "MEDIUM"
-    size: int  = 1_000_000
-@dataclass
-class Cohere_S(Cohere):
-    label: str = "SMALL"
-    size: int  = 100_000
-@dataclass
-class Cohere_M(Cohere):
-    label: str = "MEDIUM"
-    size: int = 1_000_000
-@dataclass
-class Cohere_L(Cohere):
-    label : str = "LARGE"
-    size  : int = 10_000_000
-@dataclass
-class Glove_S(Glove):
-    label: str = "SMALL"
-    size : int = 100_000
-@dataclass
-class Glove_M(Glove):
-    label: str = "MEDIUM"
-    size : int = 1_000_000
-@dataclass
-class SIFT_S(SIFT):
-    label: str = "SMALL"
-    size : int = 500_000
-@dataclass
-class SIFT_M(SIFT):
-    label: str = "MEDIUM"
-    size : int = 5_000_000
-@dataclass
-class SIFT_L(SIFT):
-    label: str = "LARGE"
-    size : int = 50_000_000
-class DataSet(BaseModel):
+class DatasetManager(BaseModel):
     """Download dataset if not int the local directory. Provide data for cases.
     DataSet is iterable, each iteration will return the next batch of data in pandas.DataFrame
@@ -147,12 +109,12 @@ class DataSet(BaseModel):
         >>> for data in cohere_s:
         >>>    print(data.columns)
     """
-    data:   GIST | Cohere | Glove | SIFT | Any
+    data:   BaseDataset
     test_data: pd.DataFrame | None = None
     train_files : list[str] = []
     def __eq__(self, obj):
-        if isinstance(obj, DataSet):
+        if isinstance(obj, DatasetManager):
             return self.data.name == obj.data.name and \
                 self.data.label == obj.data.label
         return False
@@ -304,90 +266,66 @@ class DataSet(BaseModel):
     def _read_file(self, file_name: str) -> pd.DataFrame:
         """read one file from disk into memory"""
-        import pyarrow.parquet as pq
+        log.info(f"Read the entire file into memory: {file_name}")
         p = pathlib.Path(self.data_dir, file_name)
-        log.info(f"reading file into memory: {p}")
         if not p.exists():
             log.warning(f"No such file: {p}")
             return pd.DataFrame()
-        data = pq.read_table(p)
-        df = data.to_pandas()
-        return df
+        return pl.read_parquet(p)
 class DataSetIterator:
-    def __init__(self, dataset: DataSet):
+    def __init__(self, dataset: DatasetManager):
         self._ds = dataset
         self._idx = 0  # file number
-        self._curr: pd.DataFrame | None = None
+        self._cur = None
         self._sub_idx = [0 for i in range(len(self._ds.train_files))] # iter num for each file
+    def _get_iter(self, file_name: str):
+        p = pathlib.Path(self._ds.data_dir, file_name)
+        log.info(f"Get iterator for {p.name}")
+        if not p.exists():
+            raise IndexError(f"No such file {p}")
+            log.warning(f"No such file: {p}")
+        return ParquetFile(p).iter_batches(config.NUM_PER_BATCH)
     def __next__(self) -> pd.DataFrame:
         """return the data in the next file of the training list"""
         if self._idx < len(self._ds.train_files):
-            _sub = self._sub_idx[self._idx]
-            if _sub == 0 and self._idx == 0: # init
+            if self._cur is None:
                 file_name = self._ds.train_files[self._idx]
-                self._curr = self._ds._read_file(file_name)
-                self._iter_num = math.ceil(self._curr.shape[0]/100_000)
+                self._cur = self._get_iter(file_name)
-            if _sub == self._iter_num:
+            try:
+                return next(self._cur).to_pandas()
+            except StopIteration:
                 if self._idx == len(self._ds.train_files) - 1:
-                    self._curr = None
-                    raise StopIteration
-                else:
-                    self._idx += 1
-                    _sub = self._sub_idx[self._idx]
-                    self._curr = None
-                    file_name = self._ds.train_files[self._idx]
-                    self._curr = self._ds._read_file(file_name)
-            sub_df = self._curr[_sub*100_000: (_sub+1)*100_000]
-            self._sub_idx[self._idx] += 1
-            log.info(f"Get the [{_sub+1}/{self._iter_num}] batch of {self._idx+1}/{len(self._ds.train_files)} train file")
-            return sub_df
-        self._curr = None
+                    raise StopIteration from None
+                self._idx += 1
+                file_name = self._ds.train_files[self._idx]
+                self._cur = self._get_iter(file_name)
+                return next(self._cur).to_pandas()
         raise StopIteration
-class Name(Enum):
-    GIST = auto()
-    Cohere = auto()
-    Glove = auto()
-    SIFT = auto()
-    LAION = auto()
-class Label(Enum):
-    SMALL = auto()
-    MEDIUM = auto()
-    LARGE = auto()
-_global_ds_mapping = {
-    Name.GIST: {
-        Label.SMALL: DataSet(data=GIST_S()),
-        Label.MEDIUM: DataSet(data=GIST_M()),
-    },
-    Name.Cohere: {
-        Label.SMALL: DataSet(data=Cohere_S()),
-        Label.MEDIUM: DataSet(data=Cohere_M()),
-        Label.LARGE: DataSet(data=Cohere_L()),
-    },
-    Name.Glove:{
-        Label.SMALL: DataSet(data=Glove_S()),
-        Label.MEDIUM: DataSet(data=Glove_M()),
-    },
-    Name.SIFT: {
-        Label.SMALL: DataSet(data=SIFT_S()),
-        Label.MEDIUM: DataSet(data=SIFT_M()),
-        Label.LARGE: DataSet(data=SIFT_L()),
-    },
-    Name.LAION: {
-        Label.LARGE: DataSet(data=LAION_L()),
-    },
-}
-def get(ds: Name, label: Label):
-    return _global_ds_mapping.get(ds, {}).get(label)
+class Dataset(Enum):
+    """
+    Value is Dataset classes, DO NOT use it
+    Example:
+        >>> all_dataset = [ds.name for ds in Dataset]
+        >>> Dataset.COHERE.manager(100_000)
+        >>> Dataset.COHERE.get(100_000)
+    """
+    LAION = LAION
+    GIST = GIST
+    COHERE = Cohere
+    GLOVE = Glove
+    SIFT = SIFT
+    def get(self, size: int) -> BaseDataset:
+        return self.value(size=size)
+    def manager(self, size: int) -> DatasetManager:
+        return DatasetManager(data=self.get(size))

vectordb_bench/backend/result_collector.py CHANGED Viewed

@@ -6,10 +6,10 @@ class ResultCollector:
     @classmethod
     def collect(cls, result_dir: pathlib.Path) -> list[TestResult]:
         results = []
-        if not result_dir.exists() or len(list(result_dir.glob("*.json"))) == 0:
+        if not result_dir.exists() or len(list(result_dir.glob("result_*.json"))) == 0:
             return []
-        for json_file in result_dir.glob("*.json"):
+        for json_file in result_dir.glob("result_*.json"):
             results.append(TestResult.read_file(json_file, trans_unit=True))
         return results

vectordb_bench/backend/runner/mp_runner.py CHANGED Viewed

@@ -40,7 +40,12 @@ class MultiProcessingSearchRunner:
         self.test_data = utils.SharedNumpyArray(test_data)
         log.debug(f"test dataset columns: {len(test_data)}")
-    def search(self, test_np: utils.SharedNumpyArray) -> tuple[int, float]:
+    def search(self, test_np: utils.SharedNumpyArray, q: mp.Queue, cond: mp.Condition) -> tuple[int, float]:
+        # sync all process
+        q.put(1)
+        with cond:
+            cond.wait()
         with self.db.init():
             test_data = test_np.read().tolist()
             num, idx = len(test_data), 0
@@ -77,7 +82,7 @@ class MultiProcessingSearchRunner:
     @staticmethod
     def get_mp_context():
-        mp_start_method = "forkserver" if "forkserver" in mp.get_all_start_methods() else "spawn"
+        mp_start_method = "spawn"
         log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
         return mp.get_context(mp_start_method)
@@ -85,21 +90,32 @@ class MultiProcessingSearchRunner:
         max_qps = 0
         try:
             for conc in self.concurrencies:
-                with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
-                    start = time.perf_counter()
-                    log.info(f"start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
-                    future_iter = executor.map(self.search, [self.test_data for i in range(conc)])
-                    all_count = sum([r[0] for r in future_iter])
-                    cost = time.perf_counter() - start
-                    qps = round(all_count / cost, 4)
-                    log.info(f"end search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
+                with mp.Manager() as m:
+                    q, cond = m.Queue(), m.Condition()
+                    with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
+                        log.info(f"Start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
+                        future_iter = [executor.submit(self.search, self.test_data, q, cond) for i in range(conc)]
+                        # Sync all processes
+                        while q.qsize() < conc:
+                            sleep_t = conc if conc < 10 else 10
+                            time.sleep(sleep_t)
+                        with cond:
+                            cond.notify_all()
+                            log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
+                        start = time.perf_counter()
+                        all_count = sum([r.result()[0] for r in future_iter])
+                        cost = time.perf_counter() - start
+                        qps = round(all_count / cost, 4)
+                        log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
                 if qps > max_qps:
                     max_qps = qps
-                    log.info(f"update largest qps with concurrency {conc}: current max_qps={max_qps}")
+                    log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
         except Exception as e:
-            log.warning(f"fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
+            log.warning(f"Fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
             traceback.print_exc()
             # No results available, raise exception

vectordb_bench/backend/runner/serial_runner.py CHANGED Viewed

@@ -4,53 +4,99 @@ import traceback
 import concurrent
 import multiprocessing as mp
 import math
+import psutil
 import numpy as np
 import pandas as pd
 from ..clients import api
 from ...metric import calc_recall
-from ...models import LoadTimeoutError
+from ...models import LoadTimeoutError, PerformanceTimeoutError
 from .. import utils
 from ... import config
+from vectordb_bench.backend.dataset import DatasetManager
 NUM_PER_BATCH = config.NUM_PER_BATCH
-LOAD_TIMEOUT = 24 * 60 * 60
+LOAD_MAX_TRY_COUNT = 10
+WAITTING_TIME = 60
 log = logging.getLogger(__name__)
 class SerialInsertRunner:
-    def __init__(self, db: api.VectorDB, train_emb: list[list[float]], train_id: list[int]):
-        log.debug(f"Dataset shape: {len(train_emb)}")
+    def __init__(self, db: api.VectorDB, dataset: DatasetManager, normalize: bool, timeout: float | None = None):
+        self.timeout = timeout if isinstance(timeout, (int, float)) else None
+        self.dataset = dataset
         self.db = db
-        self.shared_emb = train_emb
-        self.train_id = train_id
+        self.normalize = normalize
-        self.seq_batches = math.ceil(len(train_emb)/NUM_PER_BATCH)
-    def insert_data(self, left_id: int = 0) -> int:
+    def task(self) -> int:
+        count = 0
         with self.db.init():
-            all_embeddings = self.shared_emb
+            log.info(f"({mp.current_process().name:16}) Start inserting embeddings in batch {config.NUM_PER_BATCH}")
+            start = time.perf_counter()
+            for data_df in self.dataset:
+                all_metadata = data_df['id'].tolist()
+                emb_np = np.stack(data_df['emb'])
+                if self.normalize:
+                    log.debug("normalize the 100k train data")
+                    all_embeddings = emb_np / np.linalg.norm(emb_np, axis=1)[:, np.newaxis].tolist()
+                else:
+                    all_embeddings = emb_np.tolist()
+                del(emb_np)
+                log.debug(f"batch dataset size: {len(all_embeddings)}, {len(all_metadata)}")
+                last_batch = self.dataset.data.size - count == len(all_metadata)
+                insert_count, error = self.db.insert_embeddings(
+                    embeddings=all_embeddings,
+                    metadata=all_metadata,
+                    last_batch=last_batch,
+                )
+                if error is not None:
+                    raise error
+                assert insert_count == len(all_metadata)
+                count += insert_count
+                if count % 100_000 == 0:
+                    log.info(f"({mp.current_process().name:16}) Loaded {count} embeddings into VectorDB")
+            log.info(f"({mp.current_process().name:16}) Finish loading all dataset into VectorDB, dur={time.perf_counter()-start}")
+            return count
+    def endless_insert_data(self, all_embeddings, all_metadata, left_id: int = 0) -> int:
+        with self.db.init():
             # unique id for endlessness insertion
-            all_metadata = [i+left_id for i in self.train_id]
+            all_metadata = [i+left_id for i in all_metadata]
-            num_conc_batches = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
+            NUM_BATCHES = math.ceil(len(all_embeddings)/NUM_PER_BATCH)
             log.info(f"({mp.current_process().name:16}) Start inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
             count = 0
-            for batch_id in range(self.seq_batches):
-                metadata = all_metadata[batch_id*NUM_PER_BATCH: (batch_id+1)*NUM_PER_BATCH]
-                embeddings = all_embeddings[batch_id*NUM_PER_BATCH: (batch_id+1)*NUM_PER_BATCH]
-                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_conc_batches}], Start inserting {len(metadata)} embeddings")
-                insert_count = self.db.insert_embeddings(
-                    embeddings=embeddings,
-                    metadata=metadata,
-                )
-                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{num_conc_batches}], Finish inserting {len(metadata)} embeddings")
-                assert insert_count == len(metadata)
-                count += insert_count
+            for batch_id in range(NUM_BATCHES):
+                retry_count = 0
+                already_insert_count = 0
+                metadata = all_metadata[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+                embeddings = all_embeddings[batch_id*NUM_PER_BATCH : (batch_id+1)*NUM_PER_BATCH]
+                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Start inserting {len(metadata)} embeddings")
+                while retry_count < LOAD_MAX_TRY_COUNT:
+                    insert_count, error = self.db.insert_embeddings(
+                        embeddings=embeddings[already_insert_count :],
+                        metadata=metadata[already_insert_count :],
+                    )
+                    already_insert_count += insert_count
+                    if error is not None:
+                        retry_count += 1
+                        time.sleep(WAITTING_TIME)
+                        log.info(f"Failed to insert data, try {retry_count} time")
+                        if retry_count >= LOAD_MAX_TRY_COUNT:
+                            raise error
+                    else:
+                        break
+                log.debug(f"({mp.current_process().name:16}) batch [{batch_id:3}/{NUM_BATCHES}], Finish inserting {len(metadata)} embeddings")
+                assert already_insert_count == len(metadata)
+                count += already_insert_count
             log.info(f"({mp.current_process().name:16}) Finish inserting {len(all_embeddings)} embeddings in batch {NUM_PER_BATCH}")
         return count
@@ -58,30 +104,46 @@ class SerialInsertRunner:
     def _insert_all_batches(self) -> int:
         """Performance case only"""
         with concurrent.futures.ProcessPoolExecutor(mp_context=mp.get_context('spawn'), max_workers=1) as executor:
-            future = executor.submit(self.insert_data)
-            count = future.result()
-            return count
+            future = executor.submit(self.task)
+            try:
+                count = future.result(timeout=self.timeout)
+            except TimeoutError as e:
+                msg = f"VectorDB load dataset timeout in {self.timeout}"
+                log.warning(msg)
+                for pid, _ in executor._processes.items():
+                    psutil.Process(pid).kill()
+                raise PerformanceTimeoutError(msg) from e
+            except Exception as e:
+                log.warning(f"VectorDB load dataset error: {e}")
+                raise e from e
+            else:
+                return count
     def run_endlessness(self) -> int:
         """run forever util DB raises exception or crash"""
+        # datasets for load tests are quite small, can fit into memory
+        # only 1 file
+        data_df = [data_df for data_df in self.dataset][0]
+        all_embeddings, all_metadata = np.stack(data_df["emb"]).tolist(), data_df['id'].tolist()
         start_time = time.perf_counter()
         max_load_count, times = 0, 0
         try:
             with self.db.init():
                 self.db.ready_to_load()
-            while time.perf_counter() - start_time < config.CASE_TIMEOUT_IN_SECOND:
-                count = self.insert_data(left_id=max_load_count)
+            while time.perf_counter() - start_time < self.timeout:
+                count = self.endless_insert_data(all_embeddings, all_metadata, left_id=max_load_count)
                 max_load_count += count
                 times += 1
                 log.info(f"Loaded {times} entire dataset, current max load counts={utils.numerize(max_load_count)}, {max_load_count}")
-            raise LoadTimeoutError("capacity case load timeout and stop")
-        except LoadTimeoutError as e:
-            log.info("load timetout, stop the load case")
-            raise e from None
         except Exception as e:
             log.info(f"Capacity case load reach limit, insertion counts={utils.numerize(max_load_count)}, {max_load_count}, err={e}")
             traceback.print_exc()
             return max_load_count
+        else:
+            msg = f"capacity case load timeout in {self.timeout}s"
+            log.info(msg)
+            raise LoadTimeoutError(msg)
     def run(self) -> int:
         count, dur = self._insert_all_batches()
@@ -113,7 +175,7 @@ class SerialSearchRunner:
             test_data, ground_truth = args
             log.debug(f"test dataset size: {len(test_data)}")
-            log.info(f"ground truth size: {ground_truth.columns}, shape: {ground_truth.shape}")
+            log.debug(f"ground truth size: {ground_truth.columns}, shape: {ground_truth.shape}")
             latencies, recalls = [], []
             for idx, emb in enumerate(test_data):

vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

vectordb-bench 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl