PyPI - vectordb-bench - Versions diffs - 0.0.29__tar.gz → 1.0.0__tar.gz - Mend

vectordb-bench 0.0.29tar.gz → 1.0.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (234) hide show

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/.github/workflows/pull_request.yml RENAMED Viewed

@@ -4,6 +4,7 @@ on:
   pull_request:
     branches:
       - main
+      - vdbbench_*
 jobs:
   build:

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vectordb-bench
-Version: 0.0.29
+Version: 1.0.0
 Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
 Author-email: XuanYang-cn <xuan.yang@zilliz.com>
 Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -21,7 +21,7 @@ Requires-Dist: oss2
 Requires-Dist: psutil
 Requires-Dist: polars
 Requires-Dist: plotly
-Requires-Dist: environs<14.1.0
+Requires-Dist: environs
 Requires-Dist: pydantic<v2
 Requires-Dist: scikit-learn
 Requires-Dist: pymilvus
@@ -53,6 +53,7 @@ Requires-Dist: PyMySQL; extra == "all"
 Requires-Dist: clickhouse-connect; extra == "all"
 Requires-Dist: pyvespa; extra == "all"
 Requires-Dist: lancedb; extra == "all"
+Requires-Dist: mysql-connector-python; extra == "all"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
 Provides-Extra: pinecone
@@ -90,6 +91,8 @@ Provides-Extra: vespa
 Requires-Dist: pyvespa; extra == "vespa"
 Provides-Extra: lancedb
 Requires-Dist: lancedb; extra == "lancedb"
+Provides-Extra: oceanbase
+Requires-Dist: mysql-connector-python; extra == "oceanbase"
 Dynamic: license-file
 # VectorDBBench(VDBBench): A Benchmark Tool for VectorDB
@@ -151,6 +154,7 @@ All the database client supported
 | mongodb                  | `pip install vectordb-bench[mongodb]`       |
 | tidb                     | `pip install vectordb-bench[tidb]`          |
 | vespa                    | `pip install vectordb-bench[vespa]`         |
+| oceanbase                | `pip install vectordb-bench[oceanbase]`     |
 ### Run
@@ -295,12 +299,81 @@ Options:
   --force-merge-enabled BOOLEAN   Whether to perform force merge operation
   --flush-threshold-size TEXT     Size threshold for flushing the transaction
                                   log
+  --engine TEXT                   type of engine to use valid values [faiss, lucene]
   # Memory Management
   --cb-threshold TEXT             k-NN Memory circuit breaker threshold
+  # Quantization Type
+  --quantization-type TEXT        which type of quantization to use valid values [fp32, fp16]
+  --help                          Show this message and exit.
+  ```
+### Run OceanBase from command line
+Execute tests for the index types: HNSW, HNSW_SQ, or HNSW_BQ.
+```shell
+vectordbbench oceanbasehnsw --host xxx --port xxx --user root@mysql_tenant --database test \
+--m 16 --ef-construction 200 --case-type Performance1536D50K \
+--index-type HNSW --ef-search 100
+```
+To list the options for oceanbase, execute `vectordbbench oceanbasehnsw --help`. The following are some OceanBase-specific command-line options.
+```text
+$ vectordbbench oceanbasehnsw --help
+Usage: vectordbbench oceanbasehnsw [OPTIONS]
+Options:
+  [...]
+  --host TEXT                     OceanBase host
+  --user TEXT                     OceanBase username  [required]
+  --password TEXT                 OceanBase database password
+  --database TEXT                 DataBase name  [required]
+  --port INTEGER                  OceanBase port  [required]
+  --m INTEGER                     hnsw m  [required]
+  --ef-construction INTEGER       hnsw ef-construction  [required]
+  --ef-search INTEGER             hnsw ef-search  [required]
+  --index-type [HNSW|HNSW_SQ|HNSW_BQ]
+                                  Type of index to use. Supported values:
+                                  HNSW, HNSW_SQ, HNSW_BQ  [required]
   --help                          Show this message and exit.
   ```
+Execute tests for the index types: IVF_FLAT, IVF_SQ8, or IVF_PQ.
+```shell
+vectordbbench oceanbaseivf --host xxx --port xxx --user root@mysql_tenant --database test \
+--nlist 1000 --sample_per_nlist 256 --case-type Performance768D1M \
+--index-type IVF_FLAT --ivf_nprobes 100
+```
+To list the options for oceanbase, execute `vectordbbench oceanbaseivf --help`. The following are some OceanBase-specific command-line options.
+```text
+$ vectordbbench oceanbaseivf --help
+Usage: vectordbbench oceanbaseivf [OPTIONS]
+Options:
+  [...]
+  --host TEXT                     OceanBase host
+  --user TEXT                     OceanBase username  [required]
+  --password TEXT                 OceanBase database password
+  --database TEXT                 DataBase name  [required]
+  --port INTEGER                  OceanBase port  [required]
+  --index-type [IVF_FLAT|IVF_SQ8|IVF_PQ]
+                                  Type of index to use. Supported values:
+                                  IVF_FLAT, IVF_SQ8, IVF_PQ  [required]
+  --nlist INTEGER                 Number of cluster centers  [required]
+  --sample_per_nlist INTEGER      The cluster centers are calculated by total
+                                  sampling sample_per_nlist * nlist vectors
+                                  [required]
+  --ivf_nprobes TEXT              How many clustering centers to search during
+                                  the query  [required]
+  --m INTEGER                     The number of sub-vectors that each data
+                                  vector is divided into during IVF-PQ
+  --help                          Show this message and exit.
+  ```
 #### Using a configuration file.
 The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
@@ -339,6 +412,49 @@ milvushnsw:
 > - Options passed on the command line will override the configuration file*
 > - Parameter names use an _ not -
+#### Using a batch configuration file.
+The vectordbbench command can read a batch configuration file to run all the test cases in the yaml formatted configuration file.
+By default, configuration files are expected to be in vectordb_bench/config-files/; this can be overridden by setting
+the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file.
+The required format is:
+```yaml
+commandname:
+  - parameter_name: parameter_value
+    another_parameter_name: parameter_value
+```
+Example:
+```yaml
+pgvectorhnsw:
+  - db_label: pgConfigTest
+    user_name: vectordbbench
+    password: vectordbbench
+    db_name:  vectordbbench
+    host: localhost
+    m: 16
+    ef_construction: 128
+    ef_search: 128
+milvushnsw:
+  - skip_search_serial: True
+    case_type: Performance1536D50K
+    uri: http://localhost:19530
+    m: 16
+    ef_construction: 128
+    ef_search: 128
+    drop_old: False
+    load: False
+```
+> Notes:
+> - Options can only be passed through configuration files
+> - Parameter names use an _ not -
+How to use?
+```shell
+vectordbbench batchcli --batch-config-file <your-yaml-configuration-file>
+```
 ## Leaderboard
 ### Introduction
 To facilitate the presentation of test results and provide a comprehensive performance analysis report, we offer a [leaderboard page](https://zilliz.com/benchmark). It allows us to choose from QPS, QP$, and latency metrics, and provides a comprehensive assessment of a system's performance based on the test results of various cases and a set of scoring mechanisms (to be introduced later). On this leaderboard, we can select the systems and models to be compared, and filter out cases we do not want to consider. Comprehensive scores are always ranked from best to worst, and the specific test results of each query will be presented in the list below.
@@ -407,52 +523,35 @@ The standard benchmark results displayed here include all 15 cases that we curre
 All standard benchmark results are generated by a client running on an 8 core, 32 GB host, which is located in the same region as the server being tested. The client host is equipped with an `Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz` processor. Also all the servers for the open-source systems tested in our benchmarks run on hosts with the same type of processor.
 ### Run Test Page
-![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/f3135a29-8f12-4aac-bbb3-f2f55e2a2ff0)
-This is the page to run a test:
 1. Initially, you select the systems to be tested - multiple selections are allowed. Once selected, corresponding forms will pop up to gather necessary information for using the chosen databases. The db_label is used to differentiate different instances of the same system. We recommend filling in the host size or instance type here (as we do in our standard results).
 2. The next step is to select the test cases you want to perform. You can select multiple cases at once, and a form to collect corresponding parameters will appear.
 3. Finally, you'll need to provide a task label to distinguish different test results. Using the same label for different tests will result in the previous results being overwritten.
 Now we can only run one task at the same time.
+![image](fig/run_test_select_db.png)
+![image](fig/run_test_select_case.png)
+![image](fig/run_test_submit.png)
 ## Module
 ### Code Structure
 ![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/8c06512e-5419-4381-b084-9c93aed59639)
 ### Client
-Our client module is designed with flexibility and extensibility in mind, aiming to integrate APIs from different systems seamlessly. As of now, it supports Milvus, Zilliz Cloud, Elastic Search, Pinecone, Qdrant Cloud, Weaviate Cloud, PgVector, Redis, and Chroma. Stay tuned for more options, as we are consistently working on extending our reach to other systems.
+Our client module is designed with flexibility and extensibility in mind, aiming to integrate APIs from different systems seamlessly. As of now, it supports Milvus, Zilliz Cloud, Elastic Search, Pinecone, Qdrant Cloud, Weaviate Cloud, PgVector, Redis, Chroma, etc. Stay tuned for more options, as we are consistently working on extending our reach to other systems.
 ### Benchmark Cases
-We've developed an array of 15 comprehensive benchmark cases to test vector databases' various capabilities, each designed to give you a different piece of the puzzle. These cases are categorized into three main types:
+We've developed lots of comprehensive benchmark cases to test vector databases' various capabilities, each designed to give you a different piece of the puzzle. These cases are categorized into four main types:
 #### Capacity Case
 - **Large Dim:** Tests the database's loading capacity by inserting large-dimension vectors (GIST 100K vectors, 960 dimensions) until fully loaded. The final number of inserted vectors is reported.
 - **Small Dim:** Similar to the Large Dim case but uses small-dimension vectors (SIFT 500K vectors, 128 dimensions).
 #### Search Performance Case
 - **XLarge Dataset:** Measures search performance with a massive dataset (LAION 100M vectors, 768 dimensions) at varying parallel levels. The results include index building time, recall, latency, and maximum QPS.
-- **Large Dataset:** Similar to the XLarge Dataset case, but uses a slightly smaller dataset (10M-768dim, 5M-1536dim).
-- **Medium Dataset:** A case using a medium dataset (1M-768dim, 500K-1536dim).
+- **Large Dataset:** Similar to the XLarge Dataset case, but uses a slightly smaller dataset (10M-1024dim, 10M-768dim, 5M-1536dim).
+- **Medium Dataset:** A case using a medium dataset (1M-1024dim, 1M-768dim, 500K-1536dim).
+- **Small Dataset:** For development (100K-768dim, 50K-1536dim).
 #### Filtering Search Performance Case
-- **Large Dataset, Low Filtering Rate:** Evaluates search performance with a large dataset (10M-768dim, 5M-1536dim) under a low filtering rate (1% vectors) at different parallel levels.
-- **Medium Dataset, Low Filtering Rate:** This case uses a medium dataset (1M-768dim, 500K-1536dim) with a similar low filtering rate.
-- **Large Dataset, High Filtering Rate:** It tests with a large dataset (10M-768dim, 5M-1536dim) but under a high filtering rate (99% vectors).
-- **Medium Dataset, High Filtering Rate:** This case uses a medium dataset (1M-768dim, 500K-1536dim) with a high filtering rate.
-For a quick reference, here is a table summarizing the key aspects of each case:
-Case No. | Case Type | Dataset Size  | Filtering Rate | Results |
-|----------|-----------|--------------|----------------|---------|
-1 | Capacity Case | SIFT 500K vectors, 128 dimensions | N/A | Number of inserted vectors |
-2 | Capacity Case | GIST 100K vectors, 960 dimensions | N/A | Number of inserted vectors |
-3 | Search Performance Case | LAION 100M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-4 | Search Performance Case | Cohere 10M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-5 | Search Performance Case | Cohere 1M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-6 | Filtering Search Performance Case | Cohere 10M vectors, 768 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-7 | Filtering Search Performance Case | Cohere 1M vectors, 768 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-8 | Filtering Search Performance Case | Cohere 10M vectors, 768 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-9 | Filtering Search Performance Case | Cohere 1M vectors, 768 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-10 | Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-11 | Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-12 | Filtering Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-13 | Filtering Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-14 | Filtering Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-15 | Filtering Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
+- **Int-Filter Cases:** Evaluates search performance with int-based filter expressions (e.g., "id >= 2,000").
+- **Label-Filter Cases:** Evaluates search performance with label-based filter expressions (e.g., "color == 'red'"). The test includes randomly generated labels to simulate real-world filtering scenarios.
+#### Streaming Cases
+- **Insertion-Under-Load Case:** Evaluates search performance while maintaining a constant insertion workload. VectorDBBench applies a steady stream of insert requests at a fixed rate to simulate real-world scenarios where search operations must perform reliably under continuous data ingestion.
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/README.md RENAMED Viewed

@@ -57,6 +57,7 @@ All the database client supported
 | mongodb                  | `pip install vectordb-bench[mongodb]`       |
 | tidb                     | `pip install vectordb-bench[tidb]`          |
 | vespa                    | `pip install vectordb-bench[vespa]`         |
+| oceanbase                | `pip install vectordb-bench[oceanbase]`     |
 ### Run
@@ -201,12 +202,81 @@ Options:
   --force-merge-enabled BOOLEAN   Whether to perform force merge operation
   --flush-threshold-size TEXT     Size threshold for flushing the transaction
                                   log
+  --engine TEXT                   type of engine to use valid values [faiss, lucene]
   # Memory Management
   --cb-threshold TEXT             k-NN Memory circuit breaker threshold
+  # Quantization Type
+  --quantization-type TEXT        which type of quantization to use valid values [fp32, fp16]
+  --help                          Show this message and exit.
+  ```
+### Run OceanBase from command line
+Execute tests for the index types: HNSW, HNSW_SQ, or HNSW_BQ.
+```shell
+vectordbbench oceanbasehnsw --host xxx --port xxx --user root@mysql_tenant --database test \
+--m 16 --ef-construction 200 --case-type Performance1536D50K \
+--index-type HNSW --ef-search 100
+```
+To list the options for oceanbase, execute `vectordbbench oceanbasehnsw --help`. The following are some OceanBase-specific command-line options.
+```text
+$ vectordbbench oceanbasehnsw --help
+Usage: vectordbbench oceanbasehnsw [OPTIONS]
+Options:
+  [...]
+  --host TEXT                     OceanBase host
+  --user TEXT                     OceanBase username  [required]
+  --password TEXT                 OceanBase database password
+  --database TEXT                 DataBase name  [required]
+  --port INTEGER                  OceanBase port  [required]
+  --m INTEGER                     hnsw m  [required]
+  --ef-construction INTEGER       hnsw ef-construction  [required]
+  --ef-search INTEGER             hnsw ef-search  [required]
+  --index-type [HNSW|HNSW_SQ|HNSW_BQ]
+                                  Type of index to use. Supported values:
+                                  HNSW, HNSW_SQ, HNSW_BQ  [required]
   --help                          Show this message and exit.
   ```
+Execute tests for the index types: IVF_FLAT, IVF_SQ8, or IVF_PQ.
+```shell
+vectordbbench oceanbaseivf --host xxx --port xxx --user root@mysql_tenant --database test \
+--nlist 1000 --sample_per_nlist 256 --case-type Performance768D1M \
+--index-type IVF_FLAT --ivf_nprobes 100
+```
+To list the options for oceanbase, execute `vectordbbench oceanbaseivf --help`. The following are some OceanBase-specific command-line options.
+```text
+$ vectordbbench oceanbaseivf --help
+Usage: vectordbbench oceanbaseivf [OPTIONS]
+Options:
+  [...]
+  --host TEXT                     OceanBase host
+  --user TEXT                     OceanBase username  [required]
+  --password TEXT                 OceanBase database password
+  --database TEXT                 DataBase name  [required]
+  --port INTEGER                  OceanBase port  [required]
+  --index-type [IVF_FLAT|IVF_SQ8|IVF_PQ]
+                                  Type of index to use. Supported values:
+                                  IVF_FLAT, IVF_SQ8, IVF_PQ  [required]
+  --nlist INTEGER                 Number of cluster centers  [required]
+  --sample_per_nlist INTEGER      The cluster centers are calculated by total
+                                  sampling sample_per_nlist * nlist vectors
+                                  [required]
+  --ivf_nprobes TEXT              How many clustering centers to search during
+                                  the query  [required]
+  --m INTEGER                     The number of sub-vectors that each data
+                                  vector is divided into during IVF-PQ
+  --help                          Show this message and exit.
+  ```
 #### Using a configuration file.
 The vectordbbench command can optionally read some or all the options from a yaml formatted configuration file.
@@ -245,6 +315,49 @@ milvushnsw:
 > - Options passed on the command line will override the configuration file*
 > - Parameter names use an _ not -
+#### Using a batch configuration file.
+The vectordbbench command can read a batch configuration file to run all the test cases in the yaml formatted configuration file.
+By default, configuration files are expected to be in vectordb_bench/config-files/; this can be overridden by setting
+the environment variable CONFIG_LOCAL_DIR or by passing the full path to the file.
+The required format is:
+```yaml
+commandname:
+  - parameter_name: parameter_value
+    another_parameter_name: parameter_value
+```
+Example:
+```yaml
+pgvectorhnsw:
+  - db_label: pgConfigTest
+    user_name: vectordbbench
+    password: vectordbbench
+    db_name:  vectordbbench
+    host: localhost
+    m: 16
+    ef_construction: 128
+    ef_search: 128
+milvushnsw:
+  - skip_search_serial: True
+    case_type: Performance1536D50K
+    uri: http://localhost:19530
+    m: 16
+    ef_construction: 128
+    ef_search: 128
+    drop_old: False
+    load: False
+```
+> Notes:
+> - Options can only be passed through configuration files
+> - Parameter names use an _ not -
+How to use?
+```shell
+vectordbbench batchcli --batch-config-file <your-yaml-configuration-file>
+```
 ## Leaderboard
 ### Introduction
 To facilitate the presentation of test results and provide a comprehensive performance analysis report, we offer a [leaderboard page](https://zilliz.com/benchmark). It allows us to choose from QPS, QP$, and latency metrics, and provides a comprehensive assessment of a system's performance based on the test results of various cases and a set of scoring mechanisms (to be introduced later). On this leaderboard, we can select the systems and models to be compared, and filter out cases we do not want to consider. Comprehensive scores are always ranked from best to worst, and the specific test results of each query will be presented in the list below.
@@ -313,52 +426,35 @@ The standard benchmark results displayed here include all 15 cases that we curre
 All standard benchmark results are generated by a client running on an 8 core, 32 GB host, which is located in the same region as the server being tested. The client host is equipped with an `Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz` processor. Also all the servers for the open-source systems tested in our benchmarks run on hosts with the same type of processor.
 ### Run Test Page
-![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/f3135a29-8f12-4aac-bbb3-f2f55e2a2ff0)
-This is the page to run a test:
 1. Initially, you select the systems to be tested - multiple selections are allowed. Once selected, corresponding forms will pop up to gather necessary information for using the chosen databases. The db_label is used to differentiate different instances of the same system. We recommend filling in the host size or instance type here (as we do in our standard results).
 2. The next step is to select the test cases you want to perform. You can select multiple cases at once, and a form to collect corresponding parameters will appear.
 3. Finally, you'll need to provide a task label to distinguish different test results. Using the same label for different tests will result in the previous results being overwritten.
 Now we can only run one task at the same time.
+![image](fig/run_test_select_db.png)
+![image](fig/run_test_select_case.png)
+![image](fig/run_test_submit.png)
 ## Module
 ### Code Structure
 ![image](https://github.com/zilliztech/VectorDBBench/assets/105927039/8c06512e-5419-4381-b084-9c93aed59639)
 ### Client
-Our client module is designed with flexibility and extensibility in mind, aiming to integrate APIs from different systems seamlessly. As of now, it supports Milvus, Zilliz Cloud, Elastic Search, Pinecone, Qdrant Cloud, Weaviate Cloud, PgVector, Redis, and Chroma. Stay tuned for more options, as we are consistently working on extending our reach to other systems.
+Our client module is designed with flexibility and extensibility in mind, aiming to integrate APIs from different systems seamlessly. As of now, it supports Milvus, Zilliz Cloud, Elastic Search, Pinecone, Qdrant Cloud, Weaviate Cloud, PgVector, Redis, Chroma, etc. Stay tuned for more options, as we are consistently working on extending our reach to other systems.
 ### Benchmark Cases
-We've developed an array of 15 comprehensive benchmark cases to test vector databases' various capabilities, each designed to give you a different piece of the puzzle. These cases are categorized into three main types:
+We've developed lots of comprehensive benchmark cases to test vector databases' various capabilities, each designed to give you a different piece of the puzzle. These cases are categorized into four main types:
 #### Capacity Case
 - **Large Dim:** Tests the database's loading capacity by inserting large-dimension vectors (GIST 100K vectors, 960 dimensions) until fully loaded. The final number of inserted vectors is reported.
 - **Small Dim:** Similar to the Large Dim case but uses small-dimension vectors (SIFT 500K vectors, 128 dimensions).
 #### Search Performance Case
 - **XLarge Dataset:** Measures search performance with a massive dataset (LAION 100M vectors, 768 dimensions) at varying parallel levels. The results include index building time, recall, latency, and maximum QPS.
-- **Large Dataset:** Similar to the XLarge Dataset case, but uses a slightly smaller dataset (10M-768dim, 5M-1536dim).
-- **Medium Dataset:** A case using a medium dataset (1M-768dim, 500K-1536dim).
+- **Large Dataset:** Similar to the XLarge Dataset case, but uses a slightly smaller dataset (10M-1024dim, 10M-768dim, 5M-1536dim).
+- **Medium Dataset:** A case using a medium dataset (1M-1024dim, 1M-768dim, 500K-1536dim).
+- **Small Dataset:** For development (100K-768dim, 50K-1536dim).
 #### Filtering Search Performance Case
-- **Large Dataset, Low Filtering Rate:** Evaluates search performance with a large dataset (10M-768dim, 5M-1536dim) under a low filtering rate (1% vectors) at different parallel levels.
-- **Medium Dataset, Low Filtering Rate:** This case uses a medium dataset (1M-768dim, 500K-1536dim) with a similar low filtering rate.
-- **Large Dataset, High Filtering Rate:** It tests with a large dataset (10M-768dim, 5M-1536dim) but under a high filtering rate (99% vectors).
-- **Medium Dataset, High Filtering Rate:** This case uses a medium dataset (1M-768dim, 500K-1536dim) with a high filtering rate.
-For a quick reference, here is a table summarizing the key aspects of each case:
-Case No. | Case Type | Dataset Size  | Filtering Rate | Results |
-|----------|-----------|--------------|----------------|---------|
-1 | Capacity Case | SIFT 500K vectors, 128 dimensions | N/A | Number of inserted vectors |
-2 | Capacity Case | GIST 100K vectors, 960 dimensions | N/A | Number of inserted vectors |
-3 | Search Performance Case | LAION 100M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-4 | Search Performance Case | Cohere 10M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-5 | Search Performance Case | Cohere 1M vectors, 768 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-6 | Filtering Search Performance Case | Cohere 10M vectors, 768 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-7 | Filtering Search Performance Case | Cohere 1M vectors, 768 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-8 | Filtering Search Performance Case | Cohere 10M vectors, 768 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-9 | Filtering Search Performance Case | Cohere 1M vectors, 768 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-10 | Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-11 | Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | N/A | Index building time, recall, latency, maximum QPS |
-12 | Filtering Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-13 | Filtering Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | 1% vectors | Index building time, recall, latency, maximum QPS |
-14 | Filtering Search Performance Case | OpenAI generated 500K vectors, 1536 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
-15 | Filtering Search Performance Case | OpenAI generated 5M vectors, 1536 dimensions | 99% vectors | Index building time, recall, latency, maximum QPS |
+- **Int-Filter Cases:** Evaluates search performance with int-based filter expressions (e.g., "id >= 2,000").
+- **Label-Filter Cases:** Evaluates search performance with label-based filter expressions (e.g., "color == 'red'"). The test includes randomly generated labels to simulate real-world filtering scenarios.
+#### Streaming Cases
+- **Insertion-Under-Load Case:** Evaluates search performance while maintaining a constant insertion workload. VectorDBBench applies a steady stream of insert requests at a fixed rate to simulate real-world scenarios where search operations must perform reliably under continuous data ingestion.
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.

vectordb_bench-1.0.0/fig/homepage/bar-chart.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/concurrent.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/custom.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/label_filter.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/qp$.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/run_test.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/streaming.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/homepage/table.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/run_test_select_case.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/run_test_select_db.png ADDED Viewed

Binary file

vectordb_bench-1.0.0/fig/run_test_submit.png ADDED Viewed

Binary file

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/install/requirements_py3.11.txt RENAMED Viewed

@@ -24,3 +24,4 @@ scikit-learn
 pymilvus
 clickhouse_connect
 pyvespa
+mysql-connector-python

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/pyproject.toml RENAMED Viewed

@@ -35,7 +35,7 @@ dependencies = [
     "psutil",
     "polars",
     "plotly",
-    "environs<14.1.0",
+    "environs",
     "pydantic<v2",
     "scikit-learn",
     "pymilvus", # with pandas, numpy, ujson
@@ -73,6 +73,7 @@ all = [
     "clickhouse-connect",
     "pyvespa",
     "lancedb",
+    "mysql-connector-python",
 ]
 qdrant          = [ "qdrant-client" ]
@@ -96,6 +97,7 @@ tidb            = [ "PyMySQL" ]
 clickhouse      = [ "clickhouse-connect" ]
 vespa           = [ "pyvespa" ]
 lancedb         = [ "lancedb" ]
+oceanbase       = [ "mysql-connector-python" ]
 [project.urls]
 "repository" = "https://github.com/zilliztech/VectorDBBench"

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/vectordb_bench/__init__.py RENAMED Viewed

@@ -18,37 +18,16 @@ class config:
     DEFAULT_DATASET_URL = env.str("DEFAULT_DATASET_URL", AWS_S3_URL)
     DATASET_LOCAL_DIR = env.path("DATASET_LOCAL_DIR", "/tmp/vectordb_bench/dataset")
     NUM_PER_BATCH = env.int("NUM_PER_BATCH", 100)
+    TIME_PER_BATCH = 1  # 1s. for streaming insertion.
+    MAX_INSERT_RETRY = 5
+    MAX_SEARCH_RETRY = 5
+    LOAD_MAX_TRY_COUNT = 10
     DROP_OLD = env.bool("DROP_OLD", True)
     USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
-    NUM_CONCURRENCY = env.list(
-        "NUM_CONCURRENCY",
-        [
-            1,
-            5,
-            10,
-            15,
-            20,
-            25,
-            30,
-            35,
-            40,
-            45,
-            50,
-            55,
-            60,
-            65,
-            70,
-            75,
-            80,
-            85,
-            90,
-            95,
-            100,
-        ],
-        subcast=int,
-    )
+    NUM_CONCURRENCY = env.list("NUM_CONCURRENCY", [1, 5, 10, 20, 30, 40, 60, 80], subcast=int)
     CONCURRENCY_DURATION = 30
@@ -68,6 +47,7 @@ class config:
     CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600  # 24h
     LOAD_TIMEOUT_DEFAULT = 24 * 3600  # 24h
+    LOAD_TIMEOUT_768D_100K = 24 * 3600  # 24h
     LOAD_TIMEOUT_768D_1M = 24 * 3600  # 24h
     LOAD_TIMEOUT_768D_10M = 240 * 3600  # 10d
     LOAD_TIMEOUT_768D_100M = 2400 * 3600  # 100d
@@ -75,7 +55,11 @@ class config:
     LOAD_TIMEOUT_1536D_500K = 24 * 3600  # 24h
     LOAD_TIMEOUT_1536D_5M = 240 * 3600  # 10d
+    LOAD_TIMEOUT_1024D_1M = 24 * 3600  # 24h
+    LOAD_TIMEOUT_1024D_10M = 240 * 3600  # 10d
     OPTIMIZE_TIMEOUT_DEFAULT = 24 * 3600  # 24h
+    OPTIMIZE_TIMEOUT_768D_100K = 24 * 3600  # 24h
     OPTIMIZE_TIMEOUT_768D_1M = 24 * 3600  # 24h
     OPTIMIZE_TIMEOUT_768D_10M = 240 * 3600  # 10d
     OPTIMIZE_TIMEOUT_768D_100M = 2400 * 3600  # 100d
@@ -83,6 +67,9 @@ class config:
     OPTIMIZE_TIMEOUT_1536D_500K = 24 * 3600  # 24h
     OPTIMIZE_TIMEOUT_1536D_5M = 240 * 3600  # 10d
+    OPTIMIZE_TIMEOUT_1024D_1M = 24 * 3600  # 24h
+    OPTIMIZE_TIMEOUT_1024D_10M = 240 * 3600  # 10d
     def display(self) -> str:
         return [
             i

{vectordb_bench-0.0.29 → vectordb_bench-1.0.0}/vectordb_bench/backend/assembler.py RENAMED Viewed

@@ -1,7 +1,8 @@
 import logging
-from vectordb_bench.backend.clients import EmptyDBCaseConfig
+from vectordb_bench.backend.clients import DB, EmptyDBCaseConfig
 from vectordb_bench.backend.data_source import DatasetSource
+from vectordb_bench.backend.filter import FilterOp
 from vectordb_bench.models import TaskConfig
 from .cases import CaseLabel
@@ -10,6 +11,13 @@ from .task_runner import CaseRunner, RunningStatus, TaskRunner
 log = logging.getLogger(__name__)
+class FilterNotSupportedError(ValueError):
+    """Raised when a filter type is not supported by a vector database."""
+    def __init__(self, db_name: str, filter_type: FilterOp):
+        super().__init__(f"{filter_type} Filter test is not supported by {db_name}.")
 class Assembler:
     @classmethod
     def assemble(cls, run_id: str, task: TaskConfig, source: DatasetSource) -> CaseRunner:
@@ -39,25 +47,30 @@ class Assembler:
         runners = [cls.assemble(run_id, task, source) for task in tasks]
         load_runners = [r for r in runners if r.ca.label == CaseLabel.Load]
         perf_runners = [r for r in runners if r.ca.label == CaseLabel.Performance]
+        streaming_runners = [r for r in runners if r.ca.label == CaseLabel.Streaming]
         # group by db
-        db2runner = {}
+        db2runner: dict[DB, list[CaseRunner]] = {}
         for r in perf_runners:
             db = r.config.db
             if db not in db2runner:
                 db2runner[db] = []
             db2runner[db].append(r)
-        # check dbclient installed
-        for k in db2runner:
-            _ = k.init_cls
+        # check
+        for db, runners in db2runner.items():
+            db_instance = db.init_cls
+            for runner in runners:
+                if not db_instance.filter_supported(runner.ca.filters):
+                    raise FilterNotSupportedError(db.value, runner.ca.filters.type)
         # sort by dataset size
         for _, runner in db2runner.items():
-            runner.sort(key=lambda x: x.ca.dataset.data.size)
+            runner.sort(key=lambda x: (x.ca.dataset.data.size, 0 if x.ca.filters.type == FilterOp.StrEqual else 1))
         all_runners = []
         all_runners.extend(load_runners)
+        all_runners.extend(streaming_runners)
         for v in db2runner.values():
             all_runners.extend(v)

vectordb-bench 0.0.29__tar.gz → 1.0.0__tar.gz

vectordb-bench 0.0.29__tar.gz → 1.0.0__tar.gz