vectordb-bench 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -3
- vectordb_bench/backend/cases.py +34 -13
- vectordb_bench/backend/clients/__init__.py +6 -1
- vectordb_bench/backend/clients/api.py +12 -8
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +4 -2
- vectordb_bench/backend/clients/milvus/milvus.py +17 -10
- vectordb_bench/backend/clients/pgvector/config.py +49 -0
- vectordb_bench/backend/clients/pgvector/pgvector.py +171 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +4 -3
- vectordb_bench/backend/clients/qdrant_cloud/config.py +20 -2
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +11 -11
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +5 -5
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +3 -1
- vectordb_bench/backend/dataset.py +99 -149
- vectordb_bench/backend/result_collector.py +2 -2
- vectordb_bench/backend/runner/mp_runner.py +29 -13
- vectordb_bench/backend/runner/serial_runner.py +69 -51
- vectordb_bench/backend/task_runner.py +43 -48
- vectordb_bench/frontend/components/get_results/saveAsImage.py +4 -2
- vectordb_bench/frontend/const/dbCaseConfigs.py +35 -4
- vectordb_bench/frontend/const/dbPrices.py +5 -33
- vectordb_bench/frontend/const/styles.py +9 -3
- vectordb_bench/metric.py +0 -1
- vectordb_bench/models.py +12 -8
- vectordb_bench/results/dbPrices.json +32 -0
- vectordb_bench/results/getLeaderboardData.py +52 -0
- vectordb_bench/results/leaderboard.json +1 -0
- vectordb_bench/results/{result_20230609_standard.json → result_20230705_standard.json} +670 -214
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/METADATA +98 -13
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/RECORD +34 -29
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/WHEEL +0 -0
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.2.dist-info → vectordb_bench-0.0.3.dist-info}/top_level.txt +0 -0
vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py

```diff
@@ -3,13 +3,12 @@
 import logging
 import time
 from contextlib import contextmanager
-from typing import
+from typing import Type
 
-from ..api import VectorDB, DBConfig, DBCaseConfig,
-from .config import QdrantConfig
+from ..api import VectorDB, DBConfig, DBCaseConfig, IndexType
+from .config import QdrantConfig, QdrantIndexConfig
 from qdrant_client.http.models import (
     CollectionStatus,
-    Distance,
     VectorParams,
     PayloadSchemaType,
     Batch,
@@ -32,6 +31,7 @@ class QdrantCloud(VectorDB):
         db_case_config: DBCaseConfig,
         collection_name: str = "QdrantCloudCollection",
         drop_old: bool = False,
+        **kwargs,
    ):
        """Initialize wrapper around the QdrantCloud vector database."""
        self.db_config = db_config
@@ -55,7 +55,7 @@ class QdrantCloud(VectorDB):
 
    @classmethod
    def case_config_cls(cls, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
-        return
+        return QdrantIndexConfig
 
    @contextmanager
    def init(self) -> None:
@@ -74,7 +74,7 @@ class QdrantCloud(VectorDB):
        pass
 
 
-    def
+    def optimize(self):
        assert self.qdrant_client, "Please call self.init() before"
        # wait for vectors to be fully indexed
        SECONDS_WAITING_FOR_INDEXING_API_CALL = 5
```
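`optimize()` polls until Qdrant finishes building the index. A minimal standalone sketch of that wait loop against the public `qdrant_client` API (`wait_until_indexed` is a hypothetical helper name):

```python
import time

from qdrant_client import QdrantClient
from qdrant_client.http.models import CollectionStatus

SECONDS_WAITING_FOR_INDEXING_API_CALL = 5

def wait_until_indexed(client: QdrantClient, collection_name: str) -> None:
    """Poll the collection until Qdrant reports its status as green (fully indexed)."""
    while True:
        info = client.get_collection(collection_name=collection_name)
        if info.status == CollectionStatus.GREEN:
            return
        time.sleep(SECONDS_WAITING_FOR_INDEXING_API_CALL)
```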
```diff
@@ -97,7 +97,7 @@ class QdrantCloud(VectorDB):
        try:
            qdrant_client.create_collection(
                collection_name=self.collection_name,
-                vectors_config=VectorParams(size=dim, distance=
+                vectors_config=VectorParams(size=dim, distance=self.case_config.index_param()["distance"])
            )
 
            qdrant_client.create_payload_index(
@@ -116,7 +116,7 @@ class QdrantCloud(VectorDB):
        self,
        embeddings: list[list[float]],
        metadata: list[int],
-        **kwargs
+        **kwargs,
    ) -> (int, Exception):
        """Insert embeddings into Milvus. should call self.init() first"""
        assert self.qdrant_client is not None
@@ -127,10 +127,11 @@ class QdrantCloud(VectorDB):
                wait=True,
                points=Batch(ids=metadata, payloads=[{self._primary_field: v} for v in metadata], vectors=embeddings)
            )
-            return (len(metadata), None)
        except Exception as e:
            log.info(f"Failed to insert data, {e}")
-            return
+            return 0, e
+        else:
+            return len(metadata), None
 
    def search_embedding(
        self,
```
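The success return moves from the `try:` body into an `else:` clause, so the `except` handler can only ever see failures of the upsert itself. The pattern in isolation, with hypothetical names:

```python
def insert_batch(upsert, ids: list[int]) -> tuple[int, Exception | None]:
    try:
        upsert(ids)
    except Exception as e:
        return 0, e            # failure: report zero inserted and the error
    else:
        return len(ids), None  # runs only when the try body raised nothing
```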
```diff
@@ -138,7 +139,6 @@ class QdrantCloud(VectorDB):
        k: int = 100,
        filters: dict | None = None,
        timeout: int | None = None,
-        **kwargs: Any,
    ) -> list[int]:
        """Perform a search on a query embedding and return results with score.
            Should call self.init() first.
```

vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py

```diff
@@ -1,7 +1,7 @@
 """Wrapper around the Weaviate vector database over VectorDB"""
 
 import logging
-from typing import
+from typing import Iterable, Type
 from contextlib import contextmanager
 
 from weaviate.exceptions import WeaviateBaseError
@@ -21,6 +21,7 @@ class WeaviateCloud(VectorDB):
        db_case_config: DBCaseConfig,
        collection_name: str = "VectorDBBenchCollection",
        drop_old: bool = False,
+        **kwargs,
    ):
        """Initialize wrapper around the weaviate vector database."""
        self.db_config = db_config
@@ -70,7 +71,7 @@ class WeaviateCloud(VectorDB):
        """Should call insert first, do nothing"""
        pass
 
-    def
+    def optimize(self):
        assert self.client.schema.exists(self.collection_name)
        self.client.schema.update_config(self.collection_name, {"vectorIndexConfig": self.case_config.search_param() } )
 
```
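`optimize()` now pushes the case's search parameters into the live schema through `update_config`. A hedged sketch against the weaviate-client v3 API; the endpoint and the `ef` value are illustrative:

```python
import weaviate

client = weaviate.Client("https://example-cluster.weaviate.network")  # hypothetical endpoint
client.schema.update_config(
    "VectorDBBenchCollection",
    {"vectorIndexConfig": {"ef": 256}},  # HNSW search-time parameter from the case config
)
```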
```diff
@@ -98,13 +99,13 @@ class WeaviateCloud(VectorDB):
        self,
        embeddings: Iterable[list[float]],
        metadata: list[int],
-        **kwargs
+        **kwargs,
    ) -> (int, Exception):
        """Insert embeddings into Weaviate"""
        assert self.client.schema.exists(self.collection_name)
        insert_count = 0
        try:
-            with self.client.batch as batch:
+            with self.client.batch as batch:
                batch.batch_size = len(metadata)
                batch.dynamic = True
                res = []
@@ -126,7 +127,6 @@ class WeaviateCloud(VectorDB):
        k: int = 100,
        filters: dict | None = None,
        timeout: int | None = None,
-        **kwargs: Any,
    ) -> list[int]:
        """Perform a search on a query embedding and return results with distance.
            Should call self.init() first.
```

vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py

```diff
@@ -14,7 +14,8 @@ class ZillizCloud(Milvus):
        db_case_config: DBCaseConfig,
        collection_name: str = "ZillizCloudVectorDBBench",
        drop_old: bool = False,
-        name: str = "ZillizCloud"
+        name: str = "ZillizCloud",
+        **kwargs,
    ):
        super().__init__(
            dim=dim,
@@ -23,6 +24,7 @@ class ZillizCloud(Milvus):
            collection_name=collection_name,
            drop_old=drop_old,
            name=name,
+            **kwargs,
        )
 
    @classmethod
```
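Each client constructor in this release gains `**kwargs` (QdrantCloud, WeaviateCloud, and ZillizCloud above), so a single factory can hand the same argument bag to any backend and let unused keys be absorbed. A sketch of the idea with hypothetical clients:

```python
class ClientA:
    def __init__(self, dim: int, drop_old: bool = False, **kwargs):  # absorbs extras
        self.dim = dim

class ClientB:
    def __init__(self, dim: int, name: str = "B", **kwargs):
        self.dim, self.name = dim, name

def build(cls, **common):
    # One argument bag works for every backend; keys a constructor doesn't
    # declare are swallowed by **kwargs instead of raising TypeError.
    return cls(**common)

a = build(ClientA, dim=768, name="ignored", drop_old=True)
b = build(ClientB, dim=768, drop_old=True)
```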
vectordb_bench/backend/dataset.py

```diff
@@ -1,23 +1,20 @@
 """
 Usage:
->>> from xxx import
->>>
->>> gist_s.dict()
-dataset: {'data': {'name': 'GIST', 'dim': 128, 'metric_type': 'L2', 'label': 'SMALL', 'size': 50000000}, 'data_dir': 'xxx'}
+>>> from xxx.dataset import Dataset
+>>> Dataset.Cohere.get(100_000)
 """
 
 import os
 import logging
 import pathlib
-import math
 from hashlib import md5
-from enum import Enum
-from typing import Any
-
+from enum import Enum
 import s3fs
 import pandas as pd
 from tqdm import tqdm
-from pydantic
+from pydantic import validator, PrivateAttr
+import polars as pl
+from pyarrow.parquet import ParquetFile
 
 from ..base import BaseModel
 from .. import config
```
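The new imports set up two read paths in `dataset.py`: polars for whole-file loads and pyarrow for streaming fixed-size batches. An illustrative comparison (the file path is hypothetical):

```python
import polars as pl
from pyarrow.parquet import ParquetFile

path = "cohere_small_100k/train.parquet"  # hypothetical local file

df = pl.read_parquet(path)                # whole file at once -> polars.DataFrame

for batch in ParquetFile(path).iter_batches(batch_size=100_000):
    pdf = batch.to_pandas()               # stream fixed-size chunks -> pandas.DataFrame
```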
```diff
@@ -26,108 +23,83 @@ from . import utils
 
 log = logging.getLogger(__name__)
 
-
-class LAION:
+
+class BaseDataset(BaseModel):
+    name: str
+    size: int
+    dim: int
+    metric_type: MetricType
+    use_shuffled: bool
+    _size_label: dict = PrivateAttr()
+
+    @validator("size")
+    def verify_size(cls, v):
+        if v not in cls._size_label:
+            raise ValueError(f"Size {v} not supported for the dataset, expected: {cls._size_label.keys()}")
+        return v
+
+    @property
+    def label(self) -> str:
+        return self._size_label.get(self.size)
+
+    @property
+    def dir_name(self) -> str:
+        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
+
+
+class LAION(BaseDataset):
     name: str = "LAION"
     dim: int = 768
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {100_000_000: "LARGE"}
 
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
 
-
-class GIST:
+class GIST(BaseDataset):
     name: str = "GIST"
     dim: int = 960
     metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+    }
 
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
 
-
-class Cohere:
+class Cohere(BaseDataset):
     name: str = "Cohere"
     dim: int = 768
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = config.USE_SHUFFLED_DATA
+    _size_label: dict = {
+        100_000: "SMALL",
+        1_000_000: "MEDIUM",
+        10_000_000: "LARGE",
+    }
 
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
 
-
-class Glove:
+class Glove(BaseDataset):
     name: str = "Glove"
     dim: int = 200
     metric_type: MetricType = MetricType.COSINE
     use_shuffled: bool = False
+    _size_label: dict = {1_000_000: "MEDIUM"}
 
-    @property
-    def dir_name(self) -> str:
-        return f"{self.name}_{self.label}_{utils.numerize(self.size)}".lower()
 
-
-class SIFT:
+class SIFT(BaseDataset):
     name: str = "SIFT"
     dim: int = 128
-    metric_type: MetricType = MetricType.
+    metric_type: MetricType = MetricType.L2
     use_shuffled: bool = False
+    _size_label: dict = {
 
-
-
-
+        500_000: "SMALL",
+        5_000_000: "MEDIUM",
+        50_000_000: "LARGE",
+    }
 
-
-class LAION_L(LAION):
-    label: str = "LARGE"
-    size: int = 100_000_000
-
-@dataclass
-class GIST_S(GIST):
-    label: str = "SMALL"
-    size: int = 100_000
-
-@dataclass
-class GIST_M(GIST):
-    label: str = "MEDIUM"
-    size: int = 1_000_000
-
-@dataclass
-class Cohere_M(Cohere):
-    label: str = "MEDIUM"
-    size: int = 1_000_000
-
-@dataclass
-class Cohere_L(Cohere):
-    label : str = "LARGE"
-    size : int = 10_000_000
-
-@dataclass
-class Glove_M(Glove):
-    label: str = "MEDIUM"
-    size : int = 1_000_000
-
-@dataclass
-class SIFT_S(SIFT):
-    label: str = "SMALL"
-    size : int = 500_000
-
-@dataclass
-class SIFT_M(SIFT):
-    label: str = "MEDIUM"
-    size : int = 5_000_000
-
-@dataclass
-class SIFT_L(SIFT):
-    label: str = "LARGE"
-    size : int = 50_000_000
-
-
-class DataSet(BaseModel):
+
+class DatasetManager(BaseModel):
     """Download dataset if not int the local directory. Provide data for cases.
 
     DataSet is iterable, each iteration will return the next batch of data in pandas.DataFrame
```
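`BaseDataset` centralizes the per-dataset size tables and rejects unsupported sizes with a pydantic validator. A simplified standalone sketch of that pattern, using a plain `ClassVar` table in place of `PrivateAttr`:

```python
from typing import ClassVar

from pydantic import BaseModel, validator  # pydantic v1 API

class BaseDataset(BaseModel):
    size: int
    SIZE_LABEL: ClassVar[dict[int, str]] = {}

    @validator("size")
    def verify_size(cls, v):
        # cls is the concrete subclass, so its own table is consulted
        if v not in cls.SIZE_LABEL:
            raise ValueError(f"size {v} not supported, expected one of {list(cls.SIZE_LABEL)}")
        return v

    @property
    def label(self) -> str:
        return self.SIZE_LABEL[self.size]

class Cohere(BaseDataset):
    SIZE_LABEL: ClassVar[dict[int, str]] = {100_000: "SMALL", 1_000_000: "MEDIUM", 10_000_000: "LARGE"}

print(Cohere(size=100_000).label)  # -> SMALL; Cohere(size=123) raises ValidationError
```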
```diff
@@ -137,12 +109,12 @@ class DataSet(BaseModel):
     >>> for data in cohere_s:
     >>>     print(data.columns)
     """
-    data:
+    data: BaseDataset
     test_data: pd.DataFrame | None = None
     train_files : list[str] = []
 
     def __eq__(self, obj):
-        if isinstance(obj,
+        if isinstance(obj, DatasetManager):
             return self.data.name == obj.data.name and \
                 self.data.label == obj.data.label
         return False
@@ -294,51 +266,47 @@ class DataSet(BaseModel):
 
     def _read_file(self, file_name: str) -> pd.DataFrame:
         """read one file from disk into memory"""
-
-
+        log.info(f"Read the entire file into memory: {file_name}")
         p = pathlib.Path(self.data_dir, file_name)
-        log.info(f"reading file into memory: {p}")
         if not p.exists():
             log.warning(f"No such file: {p}")
             return pd.DataFrame()
-
-
-        return df
+
+        return pl.read_parquet(p)
 
 
 class DataSetIterator:
-    def __init__(self, dataset:
+    def __init__(self, dataset: DatasetManager):
         self._ds = dataset
         self._idx = 0  # file number
-        self.
+        self._cur = None
         self._sub_idx = [0 for i in range(len(self._ds.train_files))] # iter num for each file
 
+    def _get_iter(self, file_name: str):
+        p = pathlib.Path(self._ds.data_dir, file_name)
+        log.info(f"Get iterator for {p.name}")
+        if not p.exists():
+            raise IndexError(f"No such file {p}")
+            log.warning(f"No such file: {p}")
+        return ParquetFile(p).iter_batches(config.NUM_PER_BATCH)
+
     def __next__(self) -> pd.DataFrame:
         """return the data in the next file of the training list"""
         if self._idx < len(self._ds.train_files):
-
-            if _sub == 0 and self._idx == 0: # init
+            if self._cur is None:
                 file_name = self._ds.train_files[self._idx]
-                self.
-                self._iter_num = math.ceil(self._curr.shape[0]/100_000)
+                self._cur = self._get_iter(file_name)
 
-
+            try:
+                return next(self._cur).to_pandas()
+            except StopIteration:
                 if self._idx == len(self._ds.train_files) - 1:
-
-
-
-
-
-
-                    self._curr = None
-                    file_name = self._ds.train_files[self._idx]
-                    self._curr = self._ds._read_file(file_name)
-
-            sub_df = self._curr[_sub*100_000: (_sub+1)*100_000]
-            self._sub_idx[self._idx] += 1
-            log.info(f"Get the [{_sub+1}/{self._iter_num}] batch of {self._idx+1}/{len(self._ds.train_files)} train file")
-            return sub_df
-        self._curr = None
+                    raise StopIteration from None
+
+                self._idx += 1
+                file_name = self._ds.train_files[self._idx]
+                self._cur = self._get_iter(file_name)
+                return next(self._cur).to_pandas()
         raise StopIteration
 
 
```
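The rewritten iterator holds one pyarrow batch iterator at a time and hops to the next train file when the current one is exhausted, instead of slicing a fully loaded DataFrame. The same behavior compresses to a generator (directory and file names are hypothetical):

```python
import pathlib
from pyarrow.parquet import ParquetFile

def iter_train_batches(data_dir: str, train_files: list[str], batch_size: int = 100_000):
    """Yield fixed-size pandas batches across a list of parquet files."""
    for file_name in train_files:
        p = pathlib.Path(data_dir, file_name)
        for batch in ParquetFile(p).iter_batches(batch_size):
            yield batch.to_pandas()

# for df in iter_train_batches("/data/cohere", ["train-00.parquet", "train-01.parquet"]):
#     ...
```

The class form keeps explicit per-file counters (`_idx`, `_sub_idx`) for progress logging, which a bare generator would not expose.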
|
345
|
-
class
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
Label.MEDIUM: DataSet(data=Cohere_M()),
|
365
|
-
Label.LARGE: DataSet(data=Cohere_L()),
|
366
|
-
},
|
367
|
-
Name.Glove:{
|
368
|
-
Label.MEDIUM: DataSet(data=Glove_M()),
|
369
|
-
},
|
370
|
-
Name.SIFT: {
|
371
|
-
Label.SMALL: DataSet(data=SIFT_S()),
|
372
|
-
Label.MEDIUM: DataSet(data=SIFT_M()),
|
373
|
-
Label.LARGE: DataSet(data=SIFT_L()),
|
374
|
-
},
|
375
|
-
Name.LAION: {
|
376
|
-
Label.LARGE: DataSet(data=LAION_L()),
|
377
|
-
},
|
378
|
-
}
|
379
|
-
|
380
|
-
def get(ds: Name, label: Label):
|
381
|
-
return _global_ds_mapping.get(ds, {}).get(label)
|
313
|
+
class Dataset(Enum):
|
314
|
+
"""
|
315
|
+
Value is Dataset classes, DO NOT use it
|
316
|
+
Example:
|
317
|
+
>>> all_dataset = [ds.name for ds in Dataset]
|
318
|
+
>>> Dataset.COHERE.manager(100_000)
|
319
|
+
>>> Dataset.COHERE.get(100_000)
|
320
|
+
"""
|
321
|
+
LAION = LAION
|
322
|
+
GIST = GIST
|
323
|
+
COHERE = Cohere
|
324
|
+
GLOVE = Glove
|
325
|
+
SIFT = SIFT
|
326
|
+
|
327
|
+
def get(self, size: int) -> BaseDataset:
|
328
|
+
return self.value(size=size)
|
329
|
+
|
330
|
+
def manager(self, size: int) -> DatasetManager:
|
331
|
+
return DatasetManager(data=self.get(size))
|
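Putting the docstrings together, the intended entry point after this refactor looks like the following; this assumes the parquet files are already present in the local data directory:

```python
from vectordb_bench.backend.dataset import Dataset

cohere_small = Dataset.COHERE.manager(100_000)  # DatasetManager over Cohere "SMALL"
# Dataset.COHERE.get(999) would raise a ValidationError via verify_size

for df in cohere_small:  # each batch arrives as a pandas.DataFrame
    print(df.columns)
```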
vectordb_bench/backend/result_collector.py

```diff
@@ -6,10 +6,10 @@ class ResultCollector:
    @classmethod
    def collect(cls, result_dir: pathlib.Path) -> list[TestResult]:
        results = []
-        if not result_dir.exists() or len(list(result_dir.glob("*.json"))) == 0:
+        if not result_dir.exists() or len(list(result_dir.glob("result_*.json"))) == 0:
            return []
 
-        for json_file in result_dir.glob("*.json"):
+        for json_file in result_dir.glob("result_*.json"):
            results.append(TestResult.read_file(json_file, trans_unit=True))
 
        return results
```
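The narrower `result_*.json` glob matters because this release also ships `dbPrices.json` and `leaderboard.json` in the results directory (see the file list above), and those must not be parsed as test results:

```python
import pathlib

result_dir = pathlib.Path("vectordb_bench/results")
sorted(p.name for p in result_dir.glob("*.json"))
# ['dbPrices.json', 'leaderboard.json', 'result_20230705_standard.json']
sorted(p.name for p in result_dir.glob("result_*.json"))
# ['result_20230705_standard.json']
```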
vectordb_bench/backend/runner/mp_runner.py

```diff
@@ -40,7 +40,12 @@ class MultiProcessingSearchRunner:
        self.test_data = utils.SharedNumpyArray(test_data)
        log.debug(f"test dataset columns: {len(test_data)}")
 
-    def search(self, test_np: utils.SharedNumpyArray) -> tuple[int, float]:
+    def search(self, test_np: utils.SharedNumpyArray, q: mp.Queue, cond: mp.Condition) -> tuple[int, float]:
+        # sync all process
+        q.put(1)
+        with cond:
+            cond.wait()
+
        with self.db.init():
            test_data = test_np.read().tolist()
            num, idx = len(test_data), 0
```
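Workers now check in on a shared queue and then block on a condition variable, so every process starts searching at the same instant and the measured QPS window is not skewed by slow pool startup. A runnable sketch of the barrier; the timed body stands in for the real DB search:

```python
import time
import multiprocessing as mp
import concurrent.futures

def worker(q, cond) -> float:
    q.put(1)           # check in: this process is ready
    with cond:
        cond.wait()    # block until the parent broadcasts "go"
    start = time.perf_counter()
    # ... timed work (the real runner searches the vector DB here) ...
    return time.perf_counter() - start

if __name__ == "__main__":
    conc = 4
    with mp.Manager() as m:
        q, cond = m.Queue(), m.Condition()
        ctx = mp.get_context("spawn")  # matches the start method pinned in the next hunk
        with concurrent.futures.ProcessPoolExecutor(mp_context=ctx, max_workers=conc) as ex:
            futures = [ex.submit(worker, q, cond) for _ in range(conc)]
            while q.qsize() < conc:    # wait until every worker has checked in
                time.sleep(0.1)
            with cond:
                cond.notify_all()      # release all workers at once
            print([f.result() for f in futures])
```

Manager proxies are used (rather than raw `mp.Queue`/`mp.Condition`) because proxies can be pickled into `ProcessPoolExecutor` workers.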
```diff
@@ -77,7 +82,7 @@ class MultiProcessingSearchRunner:
 
    @staticmethod
    def get_mp_context():
-        mp_start_method = "
+        mp_start_method = "spawn"
        log.debug(f"MultiProcessingSearchRunner get multiprocessing start method: {mp_start_method}")
        return mp.get_context(mp_start_method)
 
@@ -85,21 +90,32 @@ class MultiProcessingSearchRunner:
        max_qps = 0
        try:
            for conc in self.concurrencies:
-                with
-
-
-
-
-
-
-
-
+                with mp.Manager() as m:
+                    q, cond = m.Queue(), m.Condition()
+                    with concurrent.futures.ProcessPoolExecutor(mp_context=self.get_mp_context(), max_workers=conc) as executor:
+                        log.info(f"Start search {self.duration}s in concurrency {conc}, filters: {self.filters}")
+                        future_iter = [executor.submit(self.search, self.test_data, q, cond) for i in range(conc)]
+                        # Sync all processes
+                        while q.qsize() < conc:
+                            sleep_t = conc if conc < 10 else 10
+                            time.sleep(sleep_t)
+
+                        with cond:
+                            cond.notify_all()
+                            log.info(f"Syncing all process and start concurrency search, concurrency={conc}")
+
+                        start = time.perf_counter()
+                        all_count = sum([r.result()[0] for r in future_iter])
+                        cost = time.perf_counter() - start
+
+                        qps = round(all_count / cost, 4)
+                        log.info(f"End search in concurrency {conc}: dur={cost}s, total_count={all_count}, qps={qps}")
 
                if qps > max_qps:
                    max_qps = qps
-                    log.info(f"
+                    log.info(f"Update largest qps with concurrency {conc}: current max_qps={max_qps}")
        except Exception as e:
-            log.warning(f"
+            log.warning(f"Fail to search all concurrencies: {self.concurrencies}, max_qps before failure={max_qps}, reason={e}")
            traceback.print_exc()
 
        # No results available, raise exception
```