vectordb-bench 0.0.18__py3-none-any.whl → 0.0.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vectordb_bench/__init__.py +49 -24
- vectordb_bench/__main__.py +4 -3
- vectordb_bench/backend/assembler.py +12 -13
- vectordb_bench/backend/cases.py +56 -46
- vectordb_bench/backend/clients/__init__.py +101 -14
- vectordb_bench/backend/clients/aliyun_elasticsearch/aliyun_elasticsearch.py +26 -0
- vectordb_bench/backend/clients/aliyun_elasticsearch/config.py +18 -0
- vectordb_bench/backend/clients/aliyun_opensearch/aliyun_opensearch.py +345 -0
- vectordb_bench/backend/clients/aliyun_opensearch/config.py +47 -0
- vectordb_bench/backend/clients/alloydb/alloydb.py +58 -80
- vectordb_bench/backend/clients/alloydb/cli.py +52 -35
- vectordb_bench/backend/clients/alloydb/config.py +30 -30
- vectordb_bench/backend/clients/api.py +8 -9
- vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +46 -47
- vectordb_bench/backend/clients/aws_opensearch/cli.py +4 -7
- vectordb_bench/backend/clients/aws_opensearch/config.py +13 -9
- vectordb_bench/backend/clients/aws_opensearch/run.py +69 -59
- vectordb_bench/backend/clients/chroma/chroma.py +38 -36
- vectordb_bench/backend/clients/chroma/config.py +4 -2
- vectordb_bench/backend/clients/elastic_cloud/config.py +5 -5
- vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +23 -22
- vectordb_bench/backend/clients/memorydb/cli.py +8 -8
- vectordb_bench/backend/clients/memorydb/config.py +2 -2
- vectordb_bench/backend/clients/memorydb/memorydb.py +65 -53
- vectordb_bench/backend/clients/milvus/cli.py +62 -80
- vectordb_bench/backend/clients/milvus/config.py +31 -7
- vectordb_bench/backend/clients/milvus/milvus.py +23 -26
- vectordb_bench/backend/clients/pgdiskann/cli.py +29 -22
- vectordb_bench/backend/clients/pgdiskann/config.py +29 -26
- vectordb_bench/backend/clients/pgdiskann/pgdiskann.py +55 -73
- vectordb_bench/backend/clients/pgvecto_rs/cli.py +9 -11
- vectordb_bench/backend/clients/pgvecto_rs/config.py +8 -14
- vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +33 -34
- vectordb_bench/backend/clients/pgvector/cli.py +40 -31
- vectordb_bench/backend/clients/pgvector/config.py +63 -73
- vectordb_bench/backend/clients/pgvector/pgvector.py +97 -98
- vectordb_bench/backend/clients/pgvectorscale/cli.py +38 -24
- vectordb_bench/backend/clients/pgvectorscale/config.py +14 -15
- vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +38 -43
- vectordb_bench/backend/clients/pinecone/config.py +1 -0
- vectordb_bench/backend/clients/pinecone/pinecone.py +14 -21
- vectordb_bench/backend/clients/qdrant_cloud/config.py +11 -10
- vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +40 -31
- vectordb_bench/backend/clients/redis/cli.py +6 -12
- vectordb_bench/backend/clients/redis/config.py +7 -5
- vectordb_bench/backend/clients/redis/redis.py +94 -58
- vectordb_bench/backend/clients/test/cli.py +1 -2
- vectordb_bench/backend/clients/test/config.py +2 -2
- vectordb_bench/backend/clients/test/test.py +4 -5
- vectordb_bench/backend/clients/weaviate_cloud/cli.py +3 -4
- vectordb_bench/backend/clients/weaviate_cloud/config.py +2 -2
- vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +36 -22
- vectordb_bench/backend/clients/zilliz_cloud/cli.py +14 -11
- vectordb_bench/backend/clients/zilliz_cloud/config.py +2 -4
- vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +1 -1
- vectordb_bench/backend/data_source.py +30 -18
- vectordb_bench/backend/dataset.py +47 -27
- vectordb_bench/backend/result_collector.py +2 -3
- vectordb_bench/backend/runner/__init__.py +4 -6
- vectordb_bench/backend/runner/mp_runner.py +85 -34
- vectordb_bench/backend/runner/rate_runner.py +51 -23
- vectordb_bench/backend/runner/read_write_runner.py +140 -46
- vectordb_bench/backend/runner/serial_runner.py +99 -50
- vectordb_bench/backend/runner/util.py +4 -19
- vectordb_bench/backend/task_runner.py +95 -74
- vectordb_bench/backend/utils.py +17 -9
- vectordb_bench/base.py +0 -1
- vectordb_bench/cli/cli.py +65 -60
- vectordb_bench/cli/vectordbbench.py +6 -7
- vectordb_bench/frontend/components/check_results/charts.py +8 -19
- vectordb_bench/frontend/components/check_results/data.py +4 -16
- vectordb_bench/frontend/components/check_results/filters.py +8 -16
- vectordb_bench/frontend/components/check_results/nav.py +4 -4
- vectordb_bench/frontend/components/check_results/priceTable.py +1 -3
- vectordb_bench/frontend/components/check_results/stPageConfig.py +2 -1
- vectordb_bench/frontend/components/concurrent/charts.py +12 -12
- vectordb_bench/frontend/components/custom/displayCustomCase.py +17 -11
- vectordb_bench/frontend/components/custom/displaypPrams.py +4 -2
- vectordb_bench/frontend/components/custom/getCustomConfig.py +1 -2
- vectordb_bench/frontend/components/custom/initStyle.py +1 -1
- vectordb_bench/frontend/components/get_results/saveAsImage.py +2 -0
- vectordb_bench/frontend/components/run_test/caseSelector.py +3 -9
- vectordb_bench/frontend/components/run_test/dbConfigSetting.py +1 -4
- vectordb_bench/frontend/components/run_test/dbSelector.py +1 -1
- vectordb_bench/frontend/components/run_test/generateTasks.py +8 -8
- vectordb_bench/frontend/components/run_test/submitTask.py +14 -18
- vectordb_bench/frontend/components/tables/data.py +3 -6
- vectordb_bench/frontend/config/dbCaseConfigs.py +108 -83
- vectordb_bench/frontend/pages/concurrent.py +3 -5
- vectordb_bench/frontend/pages/custom.py +30 -9
- vectordb_bench/frontend/pages/quries_per_dollar.py +3 -3
- vectordb_bench/frontend/pages/run_test.py +3 -7
- vectordb_bench/frontend/utils.py +1 -1
- vectordb_bench/frontend/vdb_benchmark.py +4 -6
- vectordb_bench/interface.py +56 -26
- vectordb_bench/log_util.py +59 -64
- vectordb_bench/metric.py +10 -11
- vectordb_bench/models.py +26 -43
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/METADATA +34 -42
- vectordb_bench-0.0.20.dist-info/RECORD +135 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/WHEEL +1 -1
- vectordb_bench-0.0.18.dist-info/RECORD +0 -131
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/LICENSE +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/entry_points.txt +0 -0
- {vectordb_bench-0.0.18.dist-info → vectordb_bench-0.0.20.dist-info}/top_level.txt +0 -0
```diff
--- a/vectordb_bench/backend/clients/api.py
+++ b/vectordb_bench/backend/clients/api.py
@@ -1,9 +1,8 @@
 from abc import ABC, abstractmethod
-from enum import Enum
-from typing import Any, Type
 from contextlib import contextmanager
+from enum import Enum
 
-from pydantic import BaseModel,
+from pydantic import BaseModel, SecretStr, validator
 
 
 class MetricType(str, Enum):
```
```diff
@@ -65,13 +64,10 @@ class DBConfig(ABC, BaseModel):
         raise NotImplementedError
 
     @validator("*")
-    def not_empty_field(cls, v, field):
-        if (
-            field.name in cls.common_short_configs()
-            or field.name in cls.common_long_configs()
-        ):
+    def not_empty_field(cls, v: any, field: any):
+        if field.name in cls.common_short_configs() or field.name in cls.common_long_configs():
             return v
-        if not v and isinstance(v,
+        if not v and isinstance(v, str | SecretStr):
             raise ValueError("Empty string!")
         return v
 
```
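As an aside, the catch-all validator above follows pydantic v1's `@validator` API. A minimal self-contained sketch of the pattern — the `ExampleConfig` class and its fields are hypothetical, and the `str | SecretStr` union needs Python 3.10+:

```python
# Hypothetical config class mirroring the validator pattern above (pydantic v1 API).
from pydantic import BaseModel, SecretStr, validator


class ExampleConfig(BaseModel):
    host: str = ""
    password: SecretStr = SecretStr("dummy")

    @validator("*")
    def not_empty_field(cls, v, field):
        # Reject empty strings/secrets so a bad CLI or env value fails at config time.
        if not v and isinstance(v, str | SecretStr):
            raise ValueError("Empty string!")
        return v


ExampleConfig(host="localhost")   # passes
# ExampleConfig(host="")          # raises ValueError: Empty string!
```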
```diff
@@ -204,6 +200,9 @@ class VectorDB(ABC):
         """
         raise NotImplementedError
 
+    def optimize_with_size(self, data_size: int):
+        self.optimize()
+
     # TODO: remove
     @abstractmethod
     def ready_to_load(self):
```
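The new `optimize_with_size()` hook defaults to delegating to `optimize()`, so existing clients keep working while size-aware backends can override it. A standalone sketch of that contract — both classes here are hypothetical stand-ins, not the real `VectorDB` base:

```python
# Minimal sketch of the hook's contract; classes are illustrative only.
class BaseSketch:
    def optimize(self):
        print("generic post-insert optimization")

    def optimize_with_size(self, data_size: int):
        # Default: ignore the size hint and keep the old behavior.
        self.optimize()


class SizeAwareClient(BaseSketch):
    def optimize_with_size(self, data_size: int):
        # A backend can pick cheaper or heavier index builds from the row count.
        if data_size > 10_000_000:
            print("large dataset: tune build settings first")
        self.optimize()


SizeAwareClient().optimize_with_size(50_000_000)
```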
```diff
--- a/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
+++ b/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py
@@ -1,14 +1,18 @@
 import logging
-from contextlib import contextmanager
 import time
-from
-from
-
+from collections.abc import Iterable
+from contextlib import contextmanager
+
 from opensearchpy import OpenSearch
-
+
+from ..api import IndexType, VectorDB
+from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
 
 log = logging.getLogger(__name__)
 
+WAITING_FOR_REFRESH_SEC = 30
+WAITING_FOR_FORCE_MERGE_SEC = 30
+
 
 class AWSOpenSearch(VectorDB):
     def __init__(
```
```diff
@@ -17,7 +21,7 @@ class AWSOpenSearch(VectorDB):
         db_config: dict,
         db_case_config: AWSOpenSearchIndexConfig,
         index_name: str = "vdb_bench_index",  # must be lowercase
-        id_col_name: str = "
+        id_col_name: str = "_id",
         vector_col_name: str = "embedding",
         drop_old: bool = False,
         **kwargs,
```
```diff
@@ -27,9 +31,7 @@
         self.case_config = db_case_config
         self.index_name = index_name
         self.id_col_name = id_col_name
-        self.category_col_names = [
-            f"scalar-{categoryCount}" for categoryCount in [2, 5, 10, 100, 1000]
-        ]
+        self.category_col_names = [f"scalar-{categoryCount}" for categoryCount in [2, 5, 10, 100, 1000]]
         self.vector_col_name = vector_col_name
 
         log.info(f"AWS_OpenSearch client config: {self.db_config}")
```
```diff
@@ -46,39 +48,32 @@ class AWSOpenSearch(VectorDB):
         return AWSOpenSearchConfig
 
     @classmethod
-    def case_config_cls(
-        cls, index_type: IndexType | None = None
-    ) -> AWSOpenSearchIndexConfig:
+    def case_config_cls(cls, index_type: IndexType | None = None) -> AWSOpenSearchIndexConfig:
         return AWSOpenSearchIndexConfig
 
     def _create_index(self, client: OpenSearch):
         settings = {
             "index": {
                 "knn": True,
-
-                # "refresh_interval": "600s",
-            }
+            },
         }
         mappings = {
             "properties": {
-
-                **{
-                    categoryCol: {"type": "keyword"}
-                    for categoryCol in self.category_col_names
-                },
+                **{categoryCol: {"type": "keyword"} for categoryCol in self.category_col_names},
                 self.vector_col_name: {
                     "type": "knn_vector",
                     "dimension": self.dim,
                     "method": self.case_config.index_param(),
                 },
-            }
+            },
         }
         try:
             client.indices.create(
-                index=self.index_name,
+                index=self.index_name,
+                body={"settings": settings, "mappings": mappings},
             )
         except Exception as e:
-            log.warning(f"Failed to create index: {self.index_name} error: {
+            log.warning(f"Failed to create index: {self.index_name} error: {e!s}")
             raise e from None
 
     @contextmanager
```
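For readers unfamiliar with the OpenSearch k-NN mapping, the body assembled by `_create_index()` looks roughly like this — dimension, engine, and parameter values are illustrative placeholders, not values from the diff:

```python
# Illustrative request body for OpenSearch k-NN index creation.
body = {
    "settings": {"index": {"knn": True}},
    "mappings": {
        "properties": {
            "scalar-2": {"type": "keyword"},  # one keyword field per category column
            "embedding": {
                "type": "knn_vector",
                "dimension": 768,  # placeholder; the client uses self.dim
                "method": {  # produced by case_config.index_param()
                    "name": "hnsw",
                    "space_type": "l2",
                    "engine": "faiss",
                    "parameters": {"ef_construction": 256, "m": 16, "ef_search": 256},
                },
            },
        },
    },
}
# client.indices.create(index="vdb_bench_index", body=body)  # needs a live cluster
```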
```diff
@@ -87,7 +82,6 @@ class AWSOpenSearch(VectorDB):
         self.client = OpenSearch(**self.db_config)
 
         yield
-        # self.client.transport.close()
         self.client = None
         del self.client
 
```
```diff
@@ -102,16 +96,20 @@
 
         insert_data = []
         for i in range(len(embeddings)):
-            insert_data.append(
+            insert_data.append(
+                {"index": {"_index": self.index_name, self.id_col_name: metadata[i]}},
+            )
             insert_data.append({self.vector_col_name: embeddings[i]})
         try:
             resp = self.client.bulk(insert_data)
             log.info(f"AWS_OpenSearch adding documents: {len(resp['items'])}")
             resp = self.client.indices.stats(self.index_name)
-            log.info(
+            log.info(
+                f"Total document count in index: {resp['_all']['primaries']['indexing']['index_total']}",
+            )
             return (len(embeddings), None)
         except Exception as e:
-            log.warning(f"Failed to insert data: {self.index_name} error: {
+            log.warning(f"Failed to insert data: {self.index_name} error: {e!s}")
             time.sleep(10)
             return self.insert_embeddings(embeddings, metadata)
 
```
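The bulk payload assembled above alternates action and source lines, and with `id_col_name="_id"` the document id rides in the action metadata. A toy payload in that shape:

```python
# Toy bulk payload in the shape insert_embeddings() builds.
insert_data = [
    {"index": {"_index": "vdb_bench_index", "_id": 0}},
    {"embedding": [0.12, 0.34, 0.56]},
    {"index": {"_index": "vdb_bench_index", "_id": 1}},
    {"embedding": [0.78, 0.90, 0.11]},
]
# resp = client.bulk(insert_data)  # requires a live OpenSearch client
```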
```diff
@@ -136,20 +134,23 @@ class AWSOpenSearch(VectorDB):
         body = {
             "size": k,
             "query": {"knn": {self.vector_col_name: {"vector": query, "k": k}}},
-            **({"filter": {"range": {self.id_col_name: {"gt": filters["id"]}}}} if filters else {})
+            **({"filter": {"range": {self.id_col_name: {"gt": filters["id"]}}}} if filters else {}),
         }
         try:
-            resp = self.client.search(
+            resp = self.client.search(
+                index=self.index_name,
+                body=body,
+                size=k,
+                _source=False,
+                docvalue_fields=[self.id_col_name],
+                stored_fields="_none_",
+            )
             log.info(f'Search took: {resp["took"]}')
             log.info(f'Search shards: {resp["_shards"]}')
             log.info(f'Search hits total: {resp["hits"]["total"]}')
-
-            #result = [int(d["_id"]) for d in resp["hits"]["hits"]]
-            # log.info(f'success! length={len(res)}')
-
-            return result
+            return [int(h["fields"][self.id_col_name][0]) for h in resp["hits"]["hits"]]
         except Exception as e:
-            log.warning(f"Failed to search: {self.index_name} error: {
+            log.warning(f"Failed to search: {self.index_name} error: {e!s}")
             raise e from None
 
     def optimize(self):
```
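Note the changed result handling: with `_source=False` and `docvalue_fields`, ids come back under each hit's `fields` key instead of `_source`. A sketch of the query shape and id extraction — dimension and names are placeholders, and `client` is assumed to be a live `OpenSearch` handle:

```python
# Sketch of the k-NN query and docvalue-based id extraction.
k = 10
query_vector = [0.1] * 768  # placeholder dimension
body = {
    "size": k,
    "query": {"knn": {"embedding": {"vector": query_vector, "k": k}}},
}
# resp = client.search(index="vdb_bench_index", body=body, size=k,
#                      _source=False, docvalue_fields=["_id"], stored_fields="_none_")
# ids = [int(h["fields"]["_id"][0]) for h in resp["hits"]["hits"]]
```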
```diff
@@ -164,37 +165,35 @@ class AWSOpenSearch(VectorDB):
 
     def _refresh_index(self):
         log.debug(f"Starting refresh for index {self.index_name}")
-        SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC = 30
         while True:
             try:
-                log.info(
+                log.info("Starting the Refresh Index..")
                 self.client.indices.refresh(index=self.index_name)
                 break
             except Exception as e:
                 log.info(
-                    f"Refresh errored out. Sleeping for {
-
+                    f"Refresh errored out. Sleeping for {WAITING_FOR_REFRESH_SEC} sec and then Retrying : {e}",
+                )
+                time.sleep(WAITING_FOR_REFRESH_SEC)
                 continue
         log.debug(f"Completed refresh for index {self.index_name}")
 
     def _do_force_merge(self):
         log.debug(f"Starting force merge for index {self.index_name}")
-        force_merge_endpoint = f
-        force_merge_task_id = self.client.transport.perform_request(
-        SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
+        force_merge_endpoint = f"/{self.index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false"
+        force_merge_task_id = self.client.transport.perform_request("POST", force_merge_endpoint)["task"]
         while True:
-            time.sleep(
+            time.sleep(WAITING_FOR_FORCE_MERGE_SEC)
             task_status = self.client.tasks.get(task_id=force_merge_task_id)
-            if task_status[
+            if task_status["completed"]:
                 break
         log.debug(f"Completed force merge for index {self.index_name}")
 
     def _load_graphs_to_memory(self):
         if self.case_config.engine != AWSOS_Engine.lucene:
             log.info("Calling warmup API to load graphs into memory")
-            warmup_endpoint = f
-            self.client.transport.perform_request(
+            warmup_endpoint = f"/_plugins/_knn/warmup/{self.index_name}"
+            self.client.transport.perform_request("GET", warmup_endpoint)
 
     def ready_to_load(self):
         """ready_to_load will be called before load in load cases."""
-        pass
```
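For reference, the raw REST round-trips behind `_do_force_merge()` and `_load_graphs_to_memory()`, sketched with a placeholder index name (the calls themselves are commented out since they need a live cluster):

```python
# Sketch of the REST calls, assuming `client` is an opensearchpy.OpenSearch.
index = "vdb_bench_index"  # placeholder

# 1. Async force-merge down to one segment; returns a task id to poll.
endpoint = f"/{index}/_forcemerge?max_num_segments=1&wait_for_completion=false"
# task_id = client.transport.perform_request("POST", endpoint)["task"]
# client.tasks.get(task_id=task_id)["completed"]  # poll until True

# 2. k-NN plugin warmup, loading graphs into memory (skipped for the Lucene engine).
# client.transport.perform_request("GET", f"/_plugins/_knn/warmup/{index}")
```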
```diff
--- a/vectordb_bench/backend/clients/aws_opensearch/cli.py
+++ b/vectordb_bench/backend/clients/aws_opensearch/cli.py
@@ -14,22 +14,20 @@ from .. import DB
 
 
 class AWSOpenSearchTypedDict(TypedDict):
-    host: Annotated[
-        str, click.option("--host", type=str, help="Db host", required=True)
-    ]
+    host: Annotated[str, click.option("--host", type=str, help="Db host", required=True)]
     port: Annotated[int, click.option("--port", type=int, default=443, help="Db Port")]
     user: Annotated[str, click.option("--user", type=str, default="admin", help="Db User")]
     password: Annotated[str, click.option("--password", type=str, help="Db password")]
 
 
-class AWSOpenSearchHNSWTypedDict(CommonTypedDict, AWSOpenSearchTypedDict, HNSWFlavor2):
-    ...
+class AWSOpenSearchHNSWTypedDict(CommonTypedDict, AWSOpenSearchTypedDict, HNSWFlavor2): ...
 
 
 @cli.command()
 @click_parameter_decorators_from_typed_dict(AWSOpenSearchHNSWTypedDict)
 def AWSOpenSearch(**parameters: Unpack[AWSOpenSearchHNSWTypedDict]):
     from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
+
     run(
         db=DB.AWSOpenSearch,
         db_config=AWSOpenSearchConfig(
```
```diff
@@ -38,7 +36,6 @@ def AWSOpenSearch(**parameters: Unpack[AWSOpenSearchHNSWTypedDict]):
             user=parameters["user"],
             password=SecretStr(parameters["password"]),
         ),
-        db_case_config=AWSOpenSearchIndexConfig(
-        ),
+        db_case_config=AWSOpenSearchIndexConfig(),
         **parameters,
     )
```
```diff
--- a/vectordb_bench/backend/clients/aws_opensearch/config.py
+++ b/vectordb_bench/backend/clients/aws_opensearch/config.py
@@ -1,10 +1,13 @@
 import logging
 from enum import Enum
-from pydantic import SecretStr, BaseModel
 
-from
+from pydantic import BaseModel, SecretStr
+
+from ..api import DBCaseConfig, DBConfig, MetricType
 
 log = logging.getLogger(__name__)
+
+
 class AWSOpenSearchConfig(DBConfig, BaseModel):
     host: str = ""
     port: int = 443
```
```diff
@@ -13,7 +16,7 @@ class AWSOpenSearchConfig(DBConfig, BaseModel):
 
     def to_dict(self) -> dict:
         return {
-            "hosts": [{
+            "hosts": [{"host": self.host, "port": self.port}],
             "http_auth": (self.user, self.password.get_secret_value()),
             "use_ssl": True,
             "http_compress": True,
```
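The dict returned by `to_dict()` feeds straight into the `opensearchpy` constructor; a sketch with placeholder host and credentials:

```python
from opensearchpy import OpenSearch

# Placeholder values; the real ones come from AWSOpenSearchConfig.to_dict().
conf = {
    "hosts": [{"host": "example.us-west-2.es.amazonaws.com", "port": 443}],
    "http_auth": ("admin", "secret"),
    "use_ssl": True,
    "http_compress": True,
}
# client = OpenSearch(**conf)
```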
```diff
@@ -40,25 +43,26 @@ class AWSOpenSearchIndexConfig(BaseModel, DBCaseConfig):
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.IP:
             return "innerproduct"
-
+        if self.metric_type == MetricType.COSINE:
             if self.engine == AWSOS_Engine.faiss:
-                log.info(
+                log.info(
+                    "Using innerproduct because faiss doesn't support cosine as metric type for Opensearch",
+                )
                 return "innerproduct"
             return "cosinesimil"
         return "l2"
 
     def index_param(self) -> dict:
-
+        return {
             "name": "hnsw",
             "space_type": self.parse_metric(),
             "engine": self.engine.value,
             "parameters": {
                 "ef_construction": self.efConstruction,
                 "m": self.M,
-                "ef_search": self.efSearch
-            }
+                "ef_search": self.efSearch,
+            },
         }
-        return params
 
     def search_param(self) -> dict:
         return {}
```
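Concretely, `index_param()` now returns the HNSW method block that `_create_index()` embeds into the mapping. For the faiss engine with a COSINE metric (mapped to `innerproduct` above) it would look like this, with illustrative parameter values:

```python
# Example return value of index_param(); parameter values are illustrative.
index_method = {
    "name": "hnsw",
    "space_type": "innerproduct",
    "engine": "faiss",
    "parameters": {"ef_construction": 256, "m": 16, "ef_search": 256},
}
```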
```diff
--- a/vectordb_bench/backend/clients/aws_opensearch/run.py
+++ b/vectordb_bench/backend/clients/aws_opensearch/run.py
@@ -1,12 +1,16 @@
-import
+import logging
+import random
+import time
+
 from opensearchpy import OpenSearch
-from opensearch_dsl import Search, Document, Text, Keyword
 
-
+log = logging.getLogger(__name__)
+
+_HOST = "xxxxxx.us-west-2.es.amazonaws.com"
 _PORT = 443
-_AUTH = (
+_AUTH = ("admin", "xxxxxx")  # For testing only. Don't store credentials in code.
 
-_INDEX_NAME =
+_INDEX_NAME = "my-dsl-index"
 _BATCH = 100
 _ROWS = 100
 _DIM = 128
```
```diff
@@ -14,25 +18,24 @@ _TOPK = 10
 
 
 def create_client():
-
-    hosts=[{
-    http_compress=True,
+    return OpenSearch(
+        hosts=[{"host": _HOST, "port": _PORT}],
+        http_compress=True,  # enables gzip compression for request bodies
         http_auth=_AUTH,
         use_ssl=True,
         verify_certs=True,
         ssl_assert_hostname=False,
         ssl_show_warn=False,
     )
-    return client
 
 
-def create_index(client, index_name):
+def create_index(client: OpenSearch, index_name: str):
     settings = {
         "index": {
             "knn": True,
             "number_of_shards": 1,
             "refresh_interval": "5s",
-        }
+        },
     }
     mappings = {
         "properties": {
```
```diff
@@ -46,41 +49,46 @@ def create_index(client, index_name):
                     "parameters": {
                         "ef_construction": 256,
                         "m": 16,
-                    }
-                }
-            }
-        }
+                    },
+                },
+            },
+        },
     }
 
-    response = client.indices.create(
-
-
+    response = client.indices.create(
+        index=index_name,
+        body={"settings": settings, "mappings": mappings},
+    )
+    log.info("\nCreating index:")
+    log.info(response)
 
 
-def delete_index(client, index_name):
+def delete_index(client: OpenSearch, index_name: str):
     response = client.indices.delete(index=index_name)
-
-
+    log.info("\nDeleting index:")
+    log.info(response)
 
 
-def bulk_insert(client, index_name):
+def bulk_insert(client: OpenSearch, index_name: str):
     # Perform bulk operations
-    ids =
+    ids = list(range(_ROWS))
     vec = [[random.random() for _ in range(_DIM)] for _ in range(_ROWS)]
 
     docs = []
     for i in range(0, _ROWS, _BATCH):
         docs.clear()
-        for j in range(
-            docs.append({"index": {"_index": index_name, "_id": ids[i+j]}})
-            docs.append({"embedding": vec[i+j]})
+        for j in range(_BATCH):
+            docs.append({"index": {"_index": index_name, "_id": ids[i + j]}})
+            docs.append({"embedding": vec[i + j]})
         response = client.bulk(docs)
-
+        log.info(f"Adding documents: {len(response['items'])}, {response['errors']}")
         response = client.indices.stats(index_name)
-
+        log.info(
+            f'Total document count in index: { response["_all"]["primaries"]["indexing"]["index_total"] }',
+        )
 
 
-def search(client, index_name):
+def search(client: OpenSearch, index_name: str):
     # Search for the document.
     search_body = {
         "size": _TOPK,
```
```diff
@@ -89,53 +97,55 @@ def search(client, index_name):
                 "embedding": {
                     "vector": [random.random() for _ in range(_DIM)],
                     "k": _TOPK,
-                }
-            }
-        }
+                },
+            },
+        },
     }
     while True:
         response = client.search(index=index_name, body=search_body)
-
-
-
+        log.info(f'\nSearch took: {response["took"]}')
+        log.info(f'\nSearch shards: {response["_shards"]}')
+        log.info(f'\nSearch hits total: {response["hits"]["total"]}')
         result = response["hits"]["hits"]
         if len(result) != 0:
-
+            log.info("\nSearch results:")
             for hit in response["hits"]["hits"]:
-
+                log.info(hit["_id"], hit["_score"])
             break
-
-
-
-
-
-
-
-
-
+        log.info("\nSearch not ready, sleep 1s")
+        time.sleep(1)
+
+
+SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
+WAITINT_FOR_REFRESH_SEC = 30
+
+
+def optimize_index(client: OpenSearch, index_name: str):
+    log.info(f"Starting force merge for index {index_name}")
+    force_merge_endpoint = f"/{index_name}/_forcemerge?max_num_segments=1&wait_for_completion=false"
+    force_merge_task_id = client.transport.perform_request("POST", force_merge_endpoint)["task"]
     while True:
         time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
         task_status = client.tasks.get(task_id=force_merge_task_id)
-        if task_status[
+        if task_status["completed"]:
             break
-
+    log.info(f"Completed force merge for index {index_name}")
 
 
-def refresh_index(client, index_name):
-
-    SECONDS_WAITING_FOR_REFRESH_API_CALL_SEC = 30
+def refresh_index(client: OpenSearch, index_name: str):
+    log.info(f"Starting refresh for index {index_name}")
     while True:
         try:
-
+            log.info("Starting the Refresh Index..")
             client.indices.refresh(index=index_name)
             break
         except Exception as e:
-
-            f"Refresh errored out. Sleeping for {
-
+            log.info(
+                f"Refresh errored out. Sleeping for {WAITINT_FOR_REFRESH_SEC} sec and then Retrying : {e}",
+            )
+            time.sleep(WAITINT_FOR_REFRESH_SEC)
             continue
-
-
+    log.info(f"Completed refresh for index {index_name}")
 
 
 def main():
```
```diff
@@ -148,9 +158,9 @@ def main():
         search(client, _INDEX_NAME)
         delete_index(client, _INDEX_NAME)
     except Exception as e:
-
+        log.info(e)
         delete_index(client, _INDEX_NAME)
 
 
-if __name__ ==
+if __name__ == "__main__":
     main()
```
```diff
--- a/vectordb_bench/backend/clients/chroma/chroma.py
+++ b/vectordb_bench/backend/clients/chroma/chroma.py
@@ -1,55 +1,55 @@
-import
-import logging
+import logging
 from contextlib import contextmanager
 from typing import Any
-
+
+import chromadb
+
+from ..api import DBCaseConfig, VectorDB
 
 log = logging.getLogger(__name__)
+
+
 class ChromaClient(VectorDB):
-    """Chroma client for VectorDB.
+    """Chroma client for VectorDB.
     To set up Chroma in docker, see https://docs.trychroma.com/usage-guide
     or the instructions in tests/test_chroma.py
 
     To change to running in process, modify the HttpClient() in __init__() and init().
-    """
+    """
 
     def __init__(
-
-
-
-
-
-
-
-        ):
-
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: DBCaseConfig,
+        drop_old: bool = False,
+        **kwargs,
+    ):
         self.db_config = db_config
         self.case_config = db_case_config
-        self.collection_name =
+        self.collection_name = "example2"
 
-        client = chromadb.HttpClient(host=self.db_config["host"],
-                                     port=self.db_config["port"])
+        client = chromadb.HttpClient(host=self.db_config["host"], port=self.db_config["port"])
         assert client.heartbeat() is not None
         if drop_old:
             try:
-                client.reset()
-            except:
+                client.reset()  # Reset the database
+            except Exception:
                 drop_old = False
         log.info(f"Chroma client drop_old collection: {self.collection_name}")
 
     @contextmanager
     def init(self) -> None:
-        """
+        """create and destory connections to database.
 
         Examples:
             >>> with self.init():
             >>> self.insert_embeddings()
         """
-        #create connection
-        self.client = chromadb.HttpClient(host=self.db_config["host"],
-
-
-        self.collection = self.client.get_or_create_collection('example2')
+        # create connection
+        self.client = chromadb.HttpClient(host=self.db_config["host"], port=self.db_config["port"])
+
+        self.collection = self.client.get_or_create_collection("example2")
         yield
         self.client = None
         self.collection = None
```
```diff
@@ -79,12 +79,12 @@ class ChromaClient(VectorDB):
         Returns:
             (int, Exception): number of embeddings inserted and exception if any
         """
-        ids=[str(i) for i in metadata]
-        metadata = [{"id": int(i)} for i in metadata]
+        ids = [str(i) for i in metadata]
+        metadata = [{"id": int(i)} for i in metadata]
         if len(embeddings) > 0:
             self.collection.add(embeddings=embeddings, ids=ids, metadatas=metadata)
         return len(embeddings), None
-
+
     def search_embedding(
         self,
         query: list[float],
```
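Chroma expects string ids plus one metadata dict per document, which is why the insert path stringifies the integer ids. Toy values showing the shapes involved:

```python
# Toy illustration of the id/metadata shapes passed to collection.add().
metadata_ids = [0, 1]
ids = [str(i) for i in metadata_ids]                 # ["0", "1"]
metadatas = [{"id": int(i)} for i in metadata_ids]   # [{"id": 0}, {"id": 1}]
embeddings = [[0.1, 0.2], [0.3, 0.4]]
# self.collection.add(embeddings=embeddings, ids=ids, metadatas=metadatas)
```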
```diff
@@ -100,17 +100,19 @@ class ChromaClient(VectorDB):
             kwargs: other arguments
 
         Returns:
-            Dict {ids: list[list[int]],
-                  embedding: list[list[float]]
+            Dict {ids: list[list[int]],
+                  embedding: list[list[float]]
                   distance: list[list[float]]}
         """
         if filters:
             # assumes benchmark test filters of format: {'metadata': '>=10000', 'id': 10000}
             id_value = filters.get("id")
-            results = self.collection.query(
-
-
-
+            results = self.collection.query(
+                query_embeddings=query,
+                n_results=k,
+                where={"id": {"$gt": id_value}},
+            )
+            # return list of id's in results
+            return [int(i) for i in results.get("ids")[0]]
         results = self.collection.query(query_embeddings=query, n_results=k)
-        return [int(i) for i in results.get(
-
+        return [int(i) for i in results.get("ids")[0]]
```