PyPI - vectordb-bench - Versions diffs - 0.0.10__tar.gz → 0.0.12__tar.gz - Mend

vectordb-bench 0.0.10tar.gz → 0.0.12tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146) hide show

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/.env.example RENAMED Viewed

@@ -6,6 +6,6 @@
 # NUM_PER_BATCH=
 # DEFAULT_DATASET_URL=
-DATASET_LOCAL_DIR="/tmp/vector_db_bench/dataset"
+DATASET_LOCAL_DIR="/tmp/vectordb_bench/dataset"
 # DROP_OLD = True

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vectordb-bench
-Version: 0.0.10
+Version: 0.0.12
 Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
 Author-email: XuanYang-cn <xuan.yang@zilliz.com>
 Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -10,6 +10,7 @@ Classifier: Operating System :: OS Independent
 Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Requires-Dist: click
 Requires-Dist: pytz
 Requires-Dist: streamlit-autorefresh
 Requires-Dist: streamlit!=1.34.0
@@ -40,6 +41,9 @@ Requires-Dist: redis; extra == "all"
 Requires-Dist: chromadb; extra == "all"
 Requires-Dist: psycopg2; extra == "all"
 Requires-Dist: psycopg; extra == "all"
+Requires-Dist: psycopg-binary; extra == "all"
+Requires-Dist: opensearch-dsl==2.1.0; extra == "all"
+Requires-Dist: opensearch-py==2.6.0; extra == "all"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
 Provides-Extra: pinecone
@@ -49,14 +53,18 @@ Requires-Dist: weaviate-client; extra == "weaviate"
 Provides-Extra: elastic
 Requires-Dist: elasticsearch; extra == "elastic"
 Provides-Extra: pgvector
-Requires-Dist: pgvector; extra == "pgvector"
 Requires-Dist: psycopg; extra == "pgvector"
+Requires-Dist: psycopg-binary; extra == "pgvector"
+Requires-Dist: pgvector; extra == "pgvector"
 Provides-Extra: pgvecto-rs
 Requires-Dist: psycopg2; extra == "pgvecto-rs"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
 Provides-Extra: chromadb
 Requires-Dist: chromadb; extra == "chromadb"
+Provides-Extra: awsopensearch
+Requires-Dist: awsopensearch; extra == "awsopensearch"
+Provides-Extra: zilliz-cloud
 # VectorDBBench: A Benchmark Tool for VectorDB
@@ -87,24 +95,134 @@ pip install vectordb-bench[pinecone]
 ```
 All supported database clients
-|Optional database client|install command|
-|---------------|---------------|
-|pymilvus(*default*)|`pip install vectordb-bench`|
-|all|`pip install vectordb-bench[all]`|
-|qdrant|`pip install vectordb-bench[qdrant]`|
-|pinecone|`pip install vectordb-bench[pinecone]`|
-|weaviate|`pip install vectordb-bench[weaviate]`|
-|elastic|`pip install vectordb-bench[elastic]`|
-|pgvector|`pip install vectordb-bench[pgvector]`|
-|pgvecto.rs|`pip install vectordb-bench[pgvecto_rs]`|
-|redis|`pip install vectordb-bench[redis]`|
-|chromadb|`pip install vectordb-bench[chromadb]`|
+| Optional database client | install command                             |
+|--------------------------|---------------------------------------------|
+| pymilvus(*default*)      | `pip install vectordb-bench`                |
+| all                      | `pip install vectordb-bench[all]`           |
+| qdrant                   | `pip install vectordb-bench[qdrant]`        |
+| pinecone                 | `pip install vectordb-bench[pinecone]`      |
+| weaviate                 | `pip install vectordb-bench[weaviate]`      |
+| elastic                  | `pip install vectordb-bench[elastic]`       |
+| pgvector                 | `pip install vectordb-bench[pgvector]`      |
+| pgvecto.rs               | `pip install vectordb-bench[pgvecto_rs]`    |
+| redis                    | `pip install vectordb-bench[redis]`         |
+| chromadb                 | `pip install vectordb-bench[chromadb]`      |
+| awsopensearch            | `pip install vectordb-bench[awsopensearch]` |
 ### Run
 ``` shell
 init_bench
 ```
+OR:
+### Run from the command line.
+``` shell
+vectordbbench [OPTIONS] COMMAND [ARGS]...
+```
+To list the clients that are runnable via the commandline option, execute: `vectordbbench --help`
+``` text
+$ vectordbbench --help
+Usage: vectordbbench [OPTIONS] COMMAND [ARGS]...
+Options:
+  --help  Show this message and exit.
+Commands:
+  pgvectorhnsw
+  pgvectorivfflat
+  test
+  weaviate
+```
+To list the options for each command, execute `vectordbbench [command] --help`
+```text
+$ vectordbbench pgvectorhnsw --help
+Usage: vectordbbench pgvectorhnsw [OPTIONS]
+Options:
+  --config-file PATH              Read configuration from yaml file
+  --drop-old / --skip-drop-old    Drop old or skip  [default: drop-old]
+  --load / --skip-load            Load or skip  [default: load]
+  --search-serial / --skip-search-serial
+                                  Search serial or skip  [default: search-
+                                  serial]
+  --search-concurrent / --skip-search-concurrent
+                                  Search concurrent or skip  [default: search-
+                                  concurrent]
+  --case-type [CapacityDim128|CapacityDim960|Performance768D100M|Performance768D10M|Performance768D1M|Performance768D10M1P|Performance768D1M1P|Performance768D10M99P|Performance768D1M99P|Performance1536D500K|Performance1536D5M|Performance1536D500K1P|Performance1536D5M1P|Performance1536D500K99P|Performance1536D5M99P|Performance1536D50K]
+                                  Case type
+  --db-label TEXT                 Db label, default: date in ISO format
+                                  [default: 2024-05-20T20:26:31.113290]
+  --dry-run                       Print just the configuration and exit
+                                  without running the tasks
+  --k INTEGER                     K value for number of nearest neighbors to
+                                  search  [default: 100]
+  --concurrency-duration INTEGER  Adjusts the duration in seconds of each
+                                  concurrency search  [default: 30]
+  --num-concurrency TEXT          Comma-separated list of concurrency values
+                                  to test during concurrent search  [default:
+                                  1,10,20]
+  --user-name TEXT                Db username  [required]
+  --password TEXT                 Db password  [required]
+  --host TEXT                     Db host  [required]
+  --db-name TEXT                  Db name  [required]
+  --maintenance-work-mem TEXT     Sets the maximum memory to be used for
+                                  maintenance operations (index creation). Can
+                                  be entered as string with unit like '64GB'
+                                  or as an integer number of KB.This will set
+                                  the parameters:
+                                  max_parallel_maintenance_workers,
+                                  max_parallel_workers &
+                                  table(parallel_workers)
+  --max-parallel-workers INTEGER  Sets the maximum number of parallel
+                                  processes per maintenance operation (index
+                                  creation)
+  --m INTEGER                     hnsw m
+  --ef-construction INTEGER       hnsw ef-construction
+  --ef-search INTEGER             hnsw ef-search
+  --help                          Show this message and exit.
+```
+#### Using a configuration file.
+The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
+By default, configuration files are expected to be in vectordb_bench/config-files/; this can be overridden by setting
+the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file.
+The required format is:
+```yaml
+commandname:
+   parameter_name: parameter_value
+   parameter_name: parameter_value
+```
+Example:
+```yaml
+pgvectorhnsw:
+  db_label: pgConfigTest
+  user_name: vectordbbench
+  password: vectordbbench
+  db_name:  vectordbbench
+  host: localhost
+  m: 16
+  ef_construction: 128
+  ef_search: 128
+milvushnsw:
+  skip_search_serial: True
+  case_type: Performance1536D50K
+  uri: http://localhost:19530
+  m: 16
+  ef_construction: 128
+  ef_search: 128
+  drop_old: False
+  load: False
+```
+> Notes:
+> - Options passed on the command line will override the configuration file.
+> - Parameter names use an _ not -
 ## What is VectorDBBench
 VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
@@ -232,6 +350,24 @@ Case No. | Case Type | Dataset Size  | Filtering Rate | Results |
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
+#### Custom Dataset for Performance case
+Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
+![image](fig/custom_dataset.png)
+![image](fig/custom_case_run_test.png)
+We have strict requirements for the dataset format; please follow them.
+- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
+  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
+- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
+- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
 ## Goals
 Our goals of this benchmark are:
 ### Reproducibility & Usability
@@ -280,6 +416,7 @@ class NewDBCaseConfig(DBCaseConfig):
     # Implement optional case-specific configuration fields
     # ...
 ```
 **Step 3: Importing the DB Client and Updating Initialization**
 In this final step, you will import your DB client into clients/__init__.py and update the initialization process.
@@ -318,6 +455,83 @@ class DB(Enum):
             return NewClientCaseConfig
 ```
+**Step 4: Implement new_client/cli.py and vectordb_bench/cli/vectordbbench.py**
+In this (optional, but encouraged) step you will enable the test to be run from the command line.
+1. Navigate to the vectordb_bench/backend/clients/"client" directory.
+2. Inside the "client" folder, create a cli.py file.
+Using zilliz as an example cli.py:
+```python
+from typing import Annotated, Unpack
+import click
+import os
+from pydantic import SecretStr
+from vectordb_bench.cli.cli import (
+    CommonTypedDict,
+    cli,
+    click_parameter_decorators_from_typed_dict,
+    run,
+)
+from vectordb_bench.backend.clients import DB
+class ZillizTypedDict(CommonTypedDict):
+    uri: Annotated[
+        str, click.option("--uri", type=str, help="uri connection string", required=True)
+    ]
+    user_name: Annotated[
+        str, click.option("--user-name", type=str, help="Db username", required=True)
+    ]
+    password: Annotated[
+        str,
+        click.option("--password",
+                     type=str,
+                     help="Zilliz password",
+                     default=lambda: os.environ.get("ZILLIZ_PASSWORD", ""),
+                     show_default="$ZILLIZ_PASSWORD",
+                     ),
+    ]
+    level: Annotated[
+        str,
+        click.option("--level", type=str, help="Zilliz index level", required=False),
+    ]
+@cli.command()
+@click_parameter_decorators_from_typed_dict(ZillizTypedDict)
+def ZillizAutoIndex(**parameters: Unpack[ZillizTypedDict]):
+    from .config import ZillizCloudConfig, AutoIndexConfig
+    run(
+        db=DB.ZillizCloud,
+        db_config=ZillizCloudConfig(
+            db_label=parameters["db_label"],
+            uri=SecretStr(parameters["uri"]),
+            user=parameters["user_name"],
+            password=SecretStr(parameters["password"]),
+        ),
+        db_case_config=AutoIndexConfig(
+            params={parameters["level"]},
+        ),
+        **parameters,
+    )
+```
+3. Update cli by adding:
+   1. Add database specific options as an Annotated TypedDict, see ZillizTypedDict above.
+   2. Add index configuration specific options as an Annotated TypedDict. (example: vectordb_bench/backend/clients/pgvector/cli.py)
+      1. May not be needed if there is only one index config.
+      2. Repeat for each index configuration, nesting them if possible.
+   2. Add an index config specific function for each index type, see Zilliz above. The function name, in lowercase, will be the command name passed to the vectordbbench command.
+   3. Update db_config and db_case_config to match client requirements
+   4. Continue to add new functions for each index config.
+   5. Import the client cli module and command to vectordb_bench/cli/vectordbbench.py (for databases with multiple commands (index configs), this only needs to be done for one command)
+> cli modules with multiple index configs:
+> - pgvector: vectordb_bench/backend/clients/pgvector/cli.py
+> - milvus: vectordb_bench/backend/clients/milvus/cli.py
 That's it! You have successfully added a new DB client to the vectordb_bench project.
 ## Rules

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/README.md RENAMED Viewed

@@ -27,24 +27,134 @@ pip install vectordb-bench[pinecone]
 ```
 All supported database clients
-|Optional database client|install command|
-|---------------|---------------|
-|pymilvus(*default*)|`pip install vectordb-bench`|
-|all|`pip install vectordb-bench[all]`|
-|qdrant|`pip install vectordb-bench[qdrant]`|
-|pinecone|`pip install vectordb-bench[pinecone]`|
-|weaviate|`pip install vectordb-bench[weaviate]`|
-|elastic|`pip install vectordb-bench[elastic]`|
-|pgvector|`pip install vectordb-bench[pgvector]`|
-|pgvecto.rs|`pip install vectordb-bench[pgvecto_rs]`|
-|redis|`pip install vectordb-bench[redis]`|
-|chromadb|`pip install vectordb-bench[chromadb]`|
+| Optional database client | install command                             |
+|--------------------------|---------------------------------------------|
+| pymilvus(*default*)      | `pip install vectordb-bench`                |
+| all                      | `pip install vectordb-bench[all]`           |
+| qdrant                   | `pip install vectordb-bench[qdrant]`        |
+| pinecone                 | `pip install vectordb-bench[pinecone]`      |
+| weaviate                 | `pip install vectordb-bench[weaviate]`      |
+| elastic                  | `pip install vectordb-bench[elastic]`       |
+| pgvector                 | `pip install vectordb-bench[pgvector]`      |
+| pgvecto.rs               | `pip install vectordb-bench[pgvecto_rs]`    |
+| redis                    | `pip install vectordb-bench[redis]`         |
+| chromadb                 | `pip install vectordb-bench[chromadb]`      |
+| awsopensearch            | `pip install vectordb-bench[awsopensearch]` |
 ### Run
 ``` shell
 init_bench
 ```
+OR:
+### Run from the command line.
+``` shell
+vectordbbench [OPTIONS] COMMAND [ARGS]...
+```
+To list the clients that are runnable via the commandline option, execute: `vectordbbench --help`
+``` text
+$ vectordbbench --help
+Usage: vectordbbench [OPTIONS] COMMAND [ARGS]...
+Options:
+  --help  Show this message and exit.
+Commands:
+  pgvectorhnsw
+  pgvectorivfflat
+  test
+  weaviate
+```
+To list the options for each command, execute `vectordbbench [command] --help`
+```text
+$ vectordbbench pgvectorhnsw --help
+Usage: vectordbbench pgvectorhnsw [OPTIONS]
+Options:
+  --config-file PATH              Read configuration from yaml file
+  --drop-old / --skip-drop-old    Drop old or skip  [default: drop-old]
+  --load / --skip-load            Load or skip  [default: load]
+  --search-serial / --skip-search-serial
+                                  Search serial or skip  [default: search-
+                                  serial]
+  --search-concurrent / --skip-search-concurrent
+                                  Search concurrent or skip  [default: search-
+                                  concurrent]
+  --case-type [CapacityDim128|CapacityDim960|Performance768D100M|Performance768D10M|Performance768D1M|Performance768D10M1P|Performance768D1M1P|Performance768D10M99P|Performance768D1M99P|Performance1536D500K|Performance1536D5M|Performance1536D500K1P|Performance1536D5M1P|Performance1536D500K99P|Performance1536D5M99P|Performance1536D50K]
+                                  Case type
+  --db-label TEXT                 Db label, default: date in ISO format
+                                  [default: 2024-05-20T20:26:31.113290]
+  --dry-run                       Print just the configuration and exit
+                                  without running the tasks
+  --k INTEGER                     K value for number of nearest neighbors to
+                                  search  [default: 100]
+  --concurrency-duration INTEGER  Adjusts the duration in seconds of each
+                                  concurrency search  [default: 30]
+  --num-concurrency TEXT          Comma-separated list of concurrency values
+                                  to test during concurrent search  [default:
+                                  1,10,20]
+  --user-name TEXT                Db username  [required]
+  --password TEXT                 Db password  [required]
+  --host TEXT                     Db host  [required]
+  --db-name TEXT                  Db name  [required]
+  --maintenance-work-mem TEXT     Sets the maximum memory to be used for
+                                  maintenance operations (index creation). Can
+                                  be entered as string with unit like '64GB'
+                                  or as an integer number of KB.This will set
+                                  the parameters:
+                                  max_parallel_maintenance_workers,
+                                  max_parallel_workers &
+                                  table(parallel_workers)
+  --max-parallel-workers INTEGER  Sets the maximum number of parallel
+                                  processes per maintenance operation (index
+                                  creation)
+  --m INTEGER                     hnsw m
+  --ef-construction INTEGER       hnsw ef-construction
+  --ef-search INTEGER             hnsw ef-search
+  --help                          Show this message and exit.
+```
+#### Using a configuration file.
+The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
+By default, configuration files are expected to be in vectordb_bench/config-files/; this can be overridden by setting
+the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file.
+The required format is:
+```yaml
+commandname:
+   parameter_name: parameter_value
+   parameter_name: parameter_value
+```
+Example:
+```yaml
+pgvectorhnsw:
+  db_label: pgConfigTest
+  user_name: vectordbbench
+  password: vectordbbench
+  db_name:  vectordbbench
+  host: localhost
+  m: 16
+  ef_construction: 128
+  ef_search: 128
+milvushnsw:
+  skip_search_serial: True
+  case_type: Performance1536D50K
+  uri: http://localhost:19530
+  m: 16
+  ef_construction: 128
+  ef_search: 128
+  drop_old: False
+  load: False
+```
+> Notes:
+> - Options passed on the command line will override the configuration file.
+> - Parameter names use an _ not -
 ## What is VectorDBBench
 VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
@@ -172,6 +282,24 @@ Case No. | Case Type | Dataset Size  | Filtering Rate | Results |
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
+#### Custom Dataset for Performance case
+Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
+![image](fig/custom_dataset.png)
+![image](fig/custom_case_run_test.png)
+We have strict requirements for the dataset format; please follow them.
+- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
+  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
+- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
+- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
 ## Goals
 Our goals of this benchmark are:
 ### Reproducibility & Usability
@@ -220,6 +348,7 @@ class NewDBCaseConfig(DBCaseConfig):
     # Implement optional case-specific configuration fields
     # ...
 ```
 **Step 3: Importing the DB Client and Updating Initialization**
 In this final step, you will import your DB client into clients/__init__.py and update the initialization process.
@@ -258,6 +387,83 @@ class DB(Enum):
             return NewClientCaseConfig
 ```
+**Step 4: Implement new_client/cli.py and vectordb_bench/cli/vectordbbench.py**
+In this (optional, but encouraged) step you will enable the test to be run from the command line.
+1. Navigate to the vectordb_bench/backend/clients/"client" directory.
+2. Inside the "client" folder, create a cli.py file.
+Using zilliz as an example cli.py:
+```python
+from typing import Annotated, Unpack
+import click
+import os
+from pydantic import SecretStr
+from vectordb_bench.cli.cli import (
+    CommonTypedDict,
+    cli,
+    click_parameter_decorators_from_typed_dict,
+    run,
+)
+from vectordb_bench.backend.clients import DB
+class ZillizTypedDict(CommonTypedDict):
+    uri: Annotated[
+        str, click.option("--uri", type=str, help="uri connection string", required=True)
+    ]
+    user_name: Annotated[
+        str, click.option("--user-name", type=str, help="Db username", required=True)
+    ]
+    password: Annotated[
+        str,
+        click.option("--password",
+                     type=str,
+                     help="Zilliz password",
+                     default=lambda: os.environ.get("ZILLIZ_PASSWORD", ""),
+                     show_default="$ZILLIZ_PASSWORD",
+                     ),
+    ]
+    level: Annotated[
+        str,
+        click.option("--level", type=str, help="Zilliz index level", required=False),
+    ]
+@cli.command()
+@click_parameter_decorators_from_typed_dict(ZillizTypedDict)
+def ZillizAutoIndex(**parameters: Unpack[ZillizTypedDict]):
+    from .config import ZillizCloudConfig, AutoIndexConfig
+    run(
+        db=DB.ZillizCloud,
+        db_config=ZillizCloudConfig(
+            db_label=parameters["db_label"],
+            uri=SecretStr(parameters["uri"]),
+            user=parameters["user_name"],
+            password=SecretStr(parameters["password"]),
+        ),
+        db_case_config=AutoIndexConfig(
+            params={parameters["level"]},
+        ),
+        **parameters,
+    )
+```
+3. Update cli by adding:
+   1. Add database specific options as an Annotated TypedDict, see ZillizTypedDict above.
+   2. Add index configuration specific options as an Annotated TypedDict. (example: vectordb_bench/backend/clients/pgvector/cli.py)
+      1. May not be needed if there is only one index config.
+      2. Repeat for each index configuration, nesting them if possible.
+   2. Add an index config specific function for each index type, see Zilliz above. The function name, in lowercase, will be the command name passed to the vectordbbench command.
+   3. Update db_config and db_case_config to match client requirements
+   4. Continue to add new functions for each index config.
+   5. Import the client cli module and command to vectordb_bench/cli/vectordbbench.py (for databases with multiple commands (index configs), this only needs to be done for one command)
+> cli modules with multiple index configs:
+> - pgvector: vectordb_bench/backend/clients/pgvector/cli.py
+> - milvus: vectordb_bench/backend/clients/milvus/cli.py
 That's it! You have successfully added a new DB client to the vectordb_bench project.
 ## Rules

vectordb_bench-0.0.12/fig/custom_case_run_test.png ADDED Viewed

Binary file

vectordb_bench-0.0.12/fig/custom_dataset.png ADDED Viewed

Binary file

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/pyproject.toml RENAMED Viewed

@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
 [tool.setuptools.packages.find]
 where = ["."]
-include = ["vectordb_bench"]
+include = ["vectordb_bench", "vectordb_bench.cli"]
 [project]
 name = "vectordb-bench"
@@ -24,6 +24,7 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
+    "click",
     "pytz",
     "streamlit-autorefresh",
     "streamlit!=1.34.0",
@@ -60,21 +61,27 @@ all = [
     "chromadb",
     "psycopg2",
     "psycopg",
+    "psycopg-binary",
+    "opensearch-dsl==2.1.0",
+    "opensearch-py==2.6.0",
 ]
 qdrant = [ "qdrant-client" ]
 pinecone = [ "pinecone-client" ]
 weaviate = [ "weaviate-client" ]
 elastic = [ "elasticsearch" ]
-pgvector = [ "pgvector", "psycopg" ]
+pgvector = [ "psycopg", "psycopg-binary", "pgvector" ]
 pgvecto_rs = [ "psycopg2" ]
 redis = [ "redis" ]
 chromadb = [ "chromadb" ]
+awsopensearch = [ "awsopensearch" ]
+zilliz_cloud = []
 [project.urls]
 "repository" = "https://github.com/zilliztech/VectorDBBench"
 [project.scripts]
 init_bench = "vectordb_bench.__main__:main"
+vectordbbench = "vectordb_bench.cli.vectordbbench:cli"
 [tool.setuptools_scm]

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/vectordb_bench/__init__.py RENAMED Viewed

@@ -1,11 +1,13 @@
-import environs
 import inspect
 import pathlib
-from . import log_util
+import environs
+from . import log_util
 env = environs.Env()
-env.read_env(".env")
+env.read_env(".env", False)
 class config:
     ALIYUN_OSS_URL = "assets.zilliz.com.cn/benchmark/"
@@ -19,9 +21,21 @@ class config:
     DROP_OLD = env.bool("DROP_OLD", True)
     USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
-    NUM_CONCURRENCY = [1, 5, 10, 15, 20, 25, 30, 35]
-    RESULTS_LOCAL_DIR = pathlib.Path(__file__).parent.joinpath("results")
+    NUM_CONCURRENCY = env.list("NUM_CONCURRENCY",  [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100], subcast=int )
+    CONCURRENCY_DURATION = 30
+    RESULTS_LOCAL_DIR = env.path(
+        "RESULTS_LOCAL_DIR", pathlib.Path(__file__).parent.joinpath("results")
+    )
+    CONFIG_LOCAL_DIR = env.path(
+        "CONFIG_LOCAL_DIR", pathlib.Path(__file__).parent.joinpath("config-files")
+    )
+    K_DEFAULT = 100  # default return top k nearest neighbors during search
+    CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")
     CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
     LOAD_TIMEOUT_DEFAULT        = 2.5 * 3600 # 2.5h

{vectordb_bench-0.0.10 → vectordb_bench-0.0.12}/vectordb_bench/backend/assembler.py RENAMED Viewed

@@ -14,7 +14,7 @@ class Assembler:
     def assemble(cls, run_id , task: TaskConfig, source: DatasetSource) -> CaseRunner:
         c_cls = task.case_config.case_id.case_cls
-        c = c_cls()
+        c = c_cls(task.case_config.custom_case)
         if type(task.db_case_config) != EmptyDBCaseConfig:
             task.db_case_config.metric_type = c.dataset.data.metric_type

vectordb-bench 0.0.10__tar.gz → 0.0.12__tar.gz

vectordb-bench 0.0.10tar.gz → 0.0.12tar.gz