vectordb-bench 0.0.13__py3-none-any.whl → 0.0.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +14 -13
- vectordb_bench/backend/clients/__init__.py +13 -0
- vectordb_bench/backend/clients/api.py +2 -0
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +47 -6
- vectordb_bench/backend/clients/aws_opensearch/config.py +12 -6
- vectordb_bench/backend/clients/aws_opensearch/run.py +34 -3
- vectordb_bench/backend/clients/pgdiskann/cli.py +99 -0
- vectordb_bench/backend/clients/pgdiskann/config.py +145 -0
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +350 -0
- vectordb_bench/backend/clients/pgvector/cli.py +62 -1
- vectordb_bench/backend/clients/pgvector/config.py +48 -10
- vectordb_bench/backend/clients/pgvector/pgvector.py +145 -26
- vectordb_bench/backend/clients/pgvectorscale/cli.py +108 -0
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +22 -4
- vectordb_bench/backend/clients/pinecone/config.py +0 -2
- vectordb_bench/backend/clients/pinecone/pinecone.py +34 -36
- vectordb_bench/backend/clients/redis/cli.py +8 -0
- vectordb_bench/backend/clients/redis/config.py +37 -6
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +1 -1
- vectordb_bench/backend/runner/mp_runner.py +2 -1
- vectordb_bench/cli/cli.py +137 -0
- vectordb_bench/cli/vectordbbench.py +4 -1
- vectordb_bench/frontend/components/check_results/charts.py +9 -6
- vectordb_bench/frontend/components/concurrent/charts.py +3 -6
- vectordb_bench/frontend/components/run_test/caseSelector.py +6 -0
- vectordb_bench/frontend/config/dbCaseConfigs.py +165 -1
- vectordb_bench/frontend/pages/quries_per_dollar.py +13 -5
- vectordb_bench/frontend/vdb_benchmark.py +11 -3
- vectordb_bench/models.py +13 -3
- vectordb_bench/results/Milvus/result_20230727_standard_milvus.json +53 -1
- vectordb_bench/results/Milvus/result_20230808_standard_milvus.json +48 -0
- vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +29 -1
- vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +24 -0
- vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +98 -49
- vectordb_bench/results/getLeaderboardData.py +17 -7
- vectordb_bench/results/leaderboard.json +1 -1
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/METADATA +65 -35
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/RECORD +42 -38
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/WHEEL +1 -1
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.13.dist-info → vectordb_bench-0.0.15.dist-info}/top_level.txt +0 -0
vectordb_bench/__init__.py
CHANGED
@@ -37,23 +37,24 @@ class config:
     K_DEFAULT = 100 # default return top k nearest neighbors during search
     CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")
 
-    CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600
-    LOAD_TIMEOUT_DEFAULT =
-    LOAD_TIMEOUT_768D_1M =
-    LOAD_TIMEOUT_768D_10M =
-    LOAD_TIMEOUT_768D_100M =
+    CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
+    LOAD_TIMEOUT_DEFAULT = 24 * 3600 # 24h
+    LOAD_TIMEOUT_768D_1M = 24 * 3600 # 24h
+    LOAD_TIMEOUT_768D_10M = 240 * 3600 # 10d
+    LOAD_TIMEOUT_768D_100M = 2400 * 3600 # 100d
 
-    LOAD_TIMEOUT_1536D_500K =
-    LOAD_TIMEOUT_1536D_5M =
+    LOAD_TIMEOUT_1536D_500K = 24 * 3600 # 24h
+    LOAD_TIMEOUT_1536D_5M = 240 * 3600 # 10d
 
-    OPTIMIZE_TIMEOUT_DEFAULT =
-    OPTIMIZE_TIMEOUT_768D_1M =
-    OPTIMIZE_TIMEOUT_768D_10M =
-    OPTIMIZE_TIMEOUT_768D_100M =
+    OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600 # 24h
+    OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600 # 24h
+    OPTIMIZE_TIMEOUT_768D_10M = 240 * 3600 # 10d
+    OPTIMIZE_TIMEOUT_768D_100M = 2400 * 3600 # 100d
 
 
-    OPTIMIZE_TIMEOUT_1536D_500K =
-    OPTIMIZE_TIMEOUT_1536D_5M =
+    OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600 # 24h
+    OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600 # 10d
+
     def display(self) -> str:
         tmp = [
             i for i in inspect.getmembers(self)
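The new timeout constants are plain seconds values on the config class. A minimal sketch of reading them (assuming the defaults are not overridden elsewhere in your environment):

from vectordb_bench import config

# New defaults in this release, grouped by dataset size.
assert config.LOAD_TIMEOUT_768D_10M == 240 * 3600        # 10 days
assert config.OPTIMIZE_TIMEOUT_768D_100M == 2400 * 3600  # 100 days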
vectordb_bench/backend/clients/__init__.py
CHANGED
@@ -31,6 +31,7 @@ class DB(Enum):
     PgVector = "PgVector"
     PgVectoRS = "PgVectoRS"
     PgVectorScale = "PgVectorScale"
+    PgDiskANN = "PgDiskANN"
     Redis = "Redis"
     MemoryDB = "MemoryDB"
     Chroma = "Chroma"
@@ -77,6 +78,10 @@ class DB(Enum):
             from .pgvectorscale.pgvectorscale import PgVectorScale
             return PgVectorScale
 
+        if self == DB.PgDiskANN:
+            from .pgdiskann.pgdiskann import PgDiskANN
+            return PgDiskANN
+
         if self == DB.Redis:
             from .redis.redis import Redis
             return Redis
@@ -132,6 +137,10 @@ class DB(Enum):
             from .pgvectorscale.config import PgVectorScaleConfig
             return PgVectorScaleConfig
 
+        if self == DB.PgDiskANN:
+            from .pgdiskann.config import PgDiskANNConfig
+            return PgDiskANNConfig
+
         if self == DB.Redis:
             from .redis.config import RedisConfig
             return RedisConfig
@@ -185,6 +194,10 @@ class DB(Enum):
             from .pgvectorscale.config import _pgvectorscale_case_config
             return _pgvectorscale_case_config.get(index_type)
 
+        if self == DB.PgDiskANN:
+            from .pgdiskann.config import _pgdiskann_case_config
+            return _pgdiskann_case_config.get(index_type)
+
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig
 
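The three hunks above wire the new enum member into the same lazy lookups the other Postgres flavors use. A rough usage sketch; the accessor names (init_cls, config_cls) are assumptions inferred from these hunks, not shown in this diff:

from vectordb_bench.backend.clients import DB

db = DB.PgDiskANN
client_cls = db.init_cls    # assumed accessor; lazily imports pgdiskann.pgdiskann.PgDiskANN
config_cls = db.config_cls  # assumed accessor; lazily imports pgdiskann.config.PgDiskANNConfig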
vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
CHANGED
@@ -3,7 +3,7 @@ from contextlib import contextmanager
 import time
 from typing import Iterable, Type
 from ..api import VectorDB, DBCaseConfig, DBConfig, IndexType
-from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
+from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig, AWSOS_Engine
 from opensearchpy import OpenSearch
 from opensearchpy.helpers import bulk
 
@@ -83,7 +83,7 @@ class AWSOpenSearch(VectorDB):
 
     @contextmanager
     def init(self) -> None:
-        """connect to
+        """connect to opensearch"""
         self.client = OpenSearch(**self.db_config)
 
         yield
@@ -97,7 +97,7 @@ class AWSOpenSearch(VectorDB):
         metadata: list[int],
         **kwargs,
     ) -> tuple[int, Exception]:
-        """Insert the embeddings to the
+        """Insert the embeddings to the opensearch."""
         assert self.client is not None, "should self.init() first"
 
         insert_data = []
@@ -136,13 +136,15 @@ class AWSOpenSearch(VectorDB):
         body = {
             "size": k,
             "query": {"knn": {self.vector_col_name: {"vector": query, "k": k}}},
+            **({"filter": {"range": {self.id_col_name: {"gt": filters["id"]}}}} if filters else {})
         }
         try:
-            resp = self.client.search(index=self.index_name, body=body)
+            resp = self.client.search(index=self.index_name, body=body,size=k,_source=False,docvalue_fields=[self.id_col_name],stored_fields="_none_",filter_path=[f"hits.hits.fields.{self.id_col_name}"],)
             log.info(f'Search took: {resp["took"]}')
             log.info(f'Search shards: {resp["_shards"]}')
            log.info(f'Search hits total: {resp["hits"]["total"]}')
-            result = [
+            result = [h["fields"][self.id_col_name][0] for h in resp["hits"]["hits"]]
+            #result = [int(d["_id"]) for d in resp["hits"]["hits"]]
             # log.info(f'success! length={len(res)}')
 
             return result
@@ -152,7 +154,46 @@ class AWSOpenSearch(VectorDB):
 
     def optimize(self):
         """optimize will be called between insertion and search in performance cases."""
-
+        # Call refresh first to ensure that all segments are created
+        self._refresh_index()
+        self._do_force_merge()
+        # Call refresh again to ensure that the index is ready after force merge.
+        self._refresh_index()
+        # ensure that all graphs are loaded in memory and ready for search
+        self._load_graphs_to_memory()
+
+    def _refresh_index(self):
+        log.debug(f"Starting refresh for index {self.index_name}")
+        SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC = 30
+        while True:
+            try:
+                log.info(f"Starting the Refresh Index..")
+                self.client.indices.refresh(index=self.index_name)
+                break
+            except Exception as e:
+                log.info(
+                    f"Refresh errored out. Sleeping for {SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC} sec and then Retrying : {e}")
+                time.sleep(SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC)
+                continue
+        log.debug(f"Completed refresh for index {self.index_name}")
+
+    def _do_force_merge(self):
+        log.debug(f"Starting force merge for index {self.index_name}")
+        force_merge_endpoint = f'/{self.index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false'
+        force_merge_task_id = self.client.transport.perform_request('POST', force_merge_endpoint)['task']
+        SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
+        while True:
+            time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
+            task_status = self.client.tasks.get(task_id=force_merge_task_id)
+            if task_status['completed']:
+                break
+        log.debug(f"Completed force merge for index {self.index_name}")
+
+    def _load_graphs_to_memory(self):
+        if self.case_config.engine != AWSOS_Engine.lucene:
+            log.info("Calling warmup API to load graphs into memory")
+            warmup_endpoint = f'/_plugins/_knn/warmup/{self.index_name}'
+            self.client.transport.perform_request('GET', warmup_endpoint)
 
     def ready_to_load(self):
         """ready_to_load will be called before load in load cases."""
vectordb_bench/backend/clients/aws_opensearch/config.py
CHANGED
@@ -1,9 +1,10 @@
+import logging
 from enum import Enum
 from pydantic import SecretStr, BaseModel
 
 from ..api import DBConfig, DBCaseConfig, MetricType, IndexType
 
-
+log = logging.getLogger(__name__)
 class AWSOpenSearchConfig(DBConfig, BaseModel):
     host: str = ""
     port: int = 443
@@ -31,14 +32,18 @@ class AWSOS_Engine(Enum):
 
 class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
     metric_type: MetricType = MetricType.L2
-    engine: AWSOS_Engine = AWSOS_Engine.
-    efConstruction: int =
-
+    engine: AWSOS_Engine = AWSOS_Engine.faiss
+    efConstruction: int = 256
+    efSearch: int = 256
+    M: int = 16
 
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.IP:
-            return "innerproduct"
+            return "innerproduct"
         elif self.metric_type == MetricType.COSINE:
+            if self.engine == AWSOS_Engine.faiss:
+                log.info(f"Using metric type as innerproduct because faiss doesn't support cosine as metric type for Opensearch")
+                return "innerproduct"
             return "cosinesimil"
         return "l2"
 
@@ -49,7 +54,8 @@ class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
             "engine": self.engine.value,
             "parameters": {
                 "ef_construction": self.efConstruction,
-                "m": self.M
+                "m": self.M,
+                "ef_search": self.efSearch
             }
         }
         return params
vectordb_bench/backend/clients/aws_opensearch/run.py
CHANGED
@@ -40,12 +40,12 @@ def create_index(client, index_name):
                 "type": "knn_vector",
                 "dimension": _DIM,
                 "method": {
-                    "engine": "
+                    "engine": "faiss",
                     "name": "hnsw",
                     "space_type": "l2",
                     "parameters": {
-                        "ef_construction":
-                        "m":
+                        "ef_construction": 256,
+                        "m": 16,
                     }
                 }
             }
@@ -108,12 +108,43 @@ def search(client, index_name):
         print('\nSearch not ready, sleep 1s')
         time.sleep(1)
 
+def optimize_index(client, index_name):
+    print(f"Starting force merge for index {index_name}")
+    force_merge_endpoint = f'/{index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false'
+    force_merge_task_id = client.transport.perform_request('POST', force_merge_endpoint)['task']
+    SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
+    while True:
+        time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
+        task_status = client.tasks.get(task_id=force_merge_task_id)
+        if task_status['completed']:
+            break
+    print(f"Completed force merge for index {index_name}")
+
+
+def refresh_index(client, index_name):
+    print(f"Starting refresh for index {index_name}")
+    SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC = 30
+    while True:
+        try:
+            print(f"Starting the Refresh Index..")
+            client.indices.refresh(index=index_name)
+            break
+        except Exception as e:
+            print(
+                f"Refresh errored out. Sleeping for {SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC} sec and then Retrying : {e}")
+            time.sleep(SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC)
+            continue
+    print(f"Completed refresh for index {index_name}")
+
+
 
 def main():
     client = create_client()
     try:
         create_index(client, _INDEX_NAME)
         bulk_insert(client, _INDEX_NAME)
+        optimize_index(client, _INDEX_NAME)
+        refresh_index(client, _INDEX_NAME)
         search(client, _INDEX_NAME)
         delete_index(client, _INDEX_NAME)
     except Exception as e:
vectordb_bench/backend/clients/pgdiskann/cli.py
ADDED
@@ -0,0 +1,99 @@
+import click
+import os
+from pydantic import SecretStr
+
+from ....cli.cli import (
+    CommonTypedDict,
+    cli,
+    click_parameter_decorators_from_typed_dict,
+    run,
+)
+from typing import Annotated, Optional, Unpack
+from vectordb_bench.backend.clients import DB
+
+
+class PgDiskAnnTypedDict(CommonTypedDict):
+    user_name: Annotated[
+        str, click.option("--user-name", type=str, help="Db username", required=True)
+    ]
+    password: Annotated[
+        str,
+        click.option("--password",
+                     type=str,
+                     help="Postgres database password",
+                     default=lambda: os.environ.get("POSTGRES_PASSWORD", ""),
+                     show_default="$POSTGRES_PASSWORD",
+                     ),
+    ]
+
+    host: Annotated[
+        str, click.option("--host", type=str, help="Db host", required=True)
+    ]
+    db_name: Annotated[
+        str, click.option("--db-name", type=str, help="Db name", required=True)
+    ]
+    max_neighbors: Annotated[
+        int,
+        click.option(
+            "--max-neighbors", type=int, help="PgDiskAnn max neighbors",
+        ),
+    ]
+    l_value_ib: Annotated[
+        int,
+        click.option(
+            "--l-value-ib", type=int, help="PgDiskAnn l_value_ib",
+        ),
+    ]
+    l_value_is: Annotated[
+        float,
+        click.option(
+            "--l-value-is", type=float, help="PgDiskAnn l_value_is",
+        ),
+    ]
+    maintenance_work_mem: Annotated[
+        Optional[str],
+        click.option(
+            "--maintenance-work-mem",
+            type=str,
+            help="Sets the maximum memory to be used for maintenance operations (index creation). "
+            "Can be entered as string with unit like '64GB' or as an integer number of KB."
+            "This will set the parameters: max_parallel_maintenance_workers,"
+            " max_parallel_workers & table(parallel_workers)",
+            required=False,
+        ),
+    ]
+    max_parallel_workers: Annotated[
+        Optional[int],
+        click.option(
+            "--max-parallel-workers",
+            type=int,
+            help="Sets the maximum number of parallel processes per maintenance operation (index creation)",
+            required=False,
+        ),
+    ]
+
+@cli.command()
+@click_parameter_decorators_from_typed_dict(PgDiskAnnTypedDict)
+def PgDiskAnn(
+    **parameters: Unpack[PgDiskAnnTypedDict],
+):
+    from .config import PgDiskANNConfig, PgDiskANNImplConfig
+
+    run(
+        db=DB.PgDiskANN,
+        db_config=PgDiskANNConfig(
+            db_label=parameters["db_label"],
+            user_name=SecretStr(parameters["user_name"]),
+            password=SecretStr(parameters["password"]),
+            host=parameters["host"],
+            db_name=parameters["db_name"],
+        ),
+        db_case_config=PgDiskANNImplConfig(
+            max_neighbors=parameters["max_neighbors"],
+            l_value_ib=parameters["l_value_ib"],
+            l_value_is=parameters["l_value_is"],
+            max_parallel_workers=parameters["max_parallel_workers"],
+            maintenance_work_mem=parameters["maintenance_work_mem"],
+        ),
+        **parameters,
+    )
vectordb_bench/backend/clients/pgdiskann/config.py
ADDED
@@ -0,0 +1,145 @@
+from abc import abstractmethod
+from typing import Any, Mapping, Optional, Sequence, TypedDict
+from pydantic import BaseModel, SecretStr
+from typing_extensions import LiteralString
+from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
+
+POSTGRE_URL_PLACEHOLDER = "postgresql://%s:%s@%s/%s"
+
+
+class PgDiskANNConfigDict(TypedDict):
+    """These keys will be directly used as kwargs in psycopg connection string,
+    so the names must match exactly psycopg API"""
+
+    user: str
+    password: str
+    host: str
+    port: int
+    dbname: str
+
+
+class PgDiskANNConfig(DBConfig):
+    user_name: SecretStr = SecretStr("postgres")
+    password: SecretStr
+    host: str = "localhost"
+    port: int = 5432
+    db_name: str
+
+    def to_dict(self) -> PgDiskANNConfigDict:
+        user_str = self.user_name.get_secret_value()
+        pwd_str = self.password.get_secret_value()
+        return {
+            "host": self.host,
+            "port": self.port,
+            "dbname": self.db_name,
+            "user": user_str,
+            "password": pwd_str,
+        }
+
+
+class PgDiskANNIndexConfig(BaseModel, DBCaseConfig):
+    metric_type: MetricType | None = None
+    create_index_before_load: bool = False
+    create_index_after_load: bool = True
+    maintenance_work_mem: Optional[str]
+    max_parallel_workers: Optional[int]
+
+    def parse_metric(self) -> str:
+        if self.metric_type == MetricType.L2:
+            return "vector_l2_ops"
+        elif self.metric_type == MetricType.IP:
+            return "vector_ip_ops"
+        return "vector_cosine_ops"
+
+    def parse_metric_fun_op(self) -> LiteralString:
+        if self.metric_type == MetricType.L2:
+            return "<->"
+        elif self.metric_type == MetricType.IP:
+            return "<#>"
+        return "<=>"
+
+    def parse_metric_fun_str(self) -> str:
+        if self.metric_type == MetricType.L2:
+            return "l2_distance"
+        elif self.metric_type == MetricType.IP:
+            return "max_inner_product"
+        return "cosine_distance"
+
+    @abstractmethod
+    def index_param(self) -> dict:
+        ...
+
+    @abstractmethod
+    def search_param(self) -> dict:
+        ...
+
+    @abstractmethod
+    def session_param(self) -> dict:
+        ...
+
+    @staticmethod
+    def _optionally_build_with_options(with_options: Mapping[str, Any]) -> Sequence[dict[str, Any]]:
+        """Walk through mappings, creating a List of {key1 = value} pairs. That will be used to build a where clause"""
+        options = []
+        for option_name, value in with_options.items():
+            if value is not None:
+                options.append(
+                    {
+                        "option_name": option_name,
+                        "val": str(value),
+                    }
+                )
+        return options
+
+    @staticmethod
+    def _optionally_build_set_options(
+        set_mapping: Mapping[str, Any]
+    ) -> Sequence[dict[str, Any]]:
+        """Walk through options, creating 'SET 'key1 = "value1";' list"""
+        session_options = []
+        for setting_name, value in set_mapping.items():
+            if value:
+                session_options.append(
+                    {"parameter": {
+                        "setting_name": setting_name,
+                        "val": str(value),
+                    },
+                    }
+                )
+        return session_options
+
+
+class PgDiskANNImplConfig(PgDiskANNIndexConfig):
+    index: IndexType = IndexType.DISKANN
+    max_neighbors: int | None
+    l_value_ib: int | None
+    l_value_is: float | None
+    maintenance_work_mem: Optional[str] = None
+    max_parallel_workers: Optional[int] = None
+
+    def index_param(self) -> dict:
+        return {
+            "metric": self.parse_metric(),
+            "index_type": self.index.value,
+            "options": {
+                "max_neighbors": self.max_neighbors,
+                "l_value_ib": self.l_value_ib,
+            },
+            "maintenance_work_mem": self.maintenance_work_mem,
+            "max_parallel_workers": self.max_parallel_workers,
+        }
+
+    def search_param(self) -> dict:
+        return {
+            "metric": self.parse_metric(),
+            "metric_fun_op": self.parse_metric_fun_op(),
+        }
+
+    def session_param(self) -> dict:
+        return {
+            "diskann.l_value_is": self.l_value_is,
+        }
+
+_pgdiskann_case_config = {
+    IndexType.DISKANN: PgDiskANNImplConfig,
+}