vectordb-bench 0.0.11__tar.gz → 0.0.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.gitignore +1 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/PKG-INFO +46 -15
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/README.md +33 -12
- vectordb_bench-0.0.13/fig/custom_case_run_test.png +0 -0
- vectordb_bench-0.0.13/fig/custom_dataset.png +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/install/requirements_py3.11.txt +1 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/pyproject.toml +8 -2
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/__init__.py +1 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/assembler.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/cases.py +64 -18
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/__init__.py +35 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/api.py +21 -1
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +159 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/cli.py +44 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/config.py +58 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/run.py +125 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/cli.py +88 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/config.py +54 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/memorydb.py +254 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/cli.py +154 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/config.py +162 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +292 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvectorscale/config.py +111 -0
- vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +272 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/dataset.py +27 -5
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/vectordbbench.py +7 -0
- vectordb_bench-0.0.13/vectordb_bench/custom/custom_case.json +18 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/charts.py +6 -6
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/data.py +18 -11
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/expanderStyle.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/filters.py +20 -13
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/headerIcon.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/priceTable.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/stPageConfig.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/concurrent/charts.py +26 -29
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/displayCustomCase.py +31 -0
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/displaypPrams.py +11 -0
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/getCustomConfig.py +40 -0
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/initStyle.py +15 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/autoRefresh.py +1 -1
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/caseSelector.py +115 -0
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/dbConfigSetting.py +77 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/dbSelector.py +2 -14
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/generateTasks.py +3 -5
- vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/initStyle.py +16 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/submitTask.py +1 -1
- {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/dbCaseConfigs.py +311 -40
- {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/styles.py +2 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/concurrent.py +11 -18
- vectordb_bench-0.0.13/vectordb_bench/frontend/pages/custom.py +64 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/quries_per_dollar.py +5 -5
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/run_test.py +4 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/tables.py +2 -2
- vectordb_bench-0.0.13/vectordb_bench/frontend/utils.py +22 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/vdb_benchmark.py +3 -3
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/models.py +26 -10
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/getLeaderboardData.py +1 -1
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/PKG-INFO +46 -15
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/SOURCES.txt +22 -3
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/requires.txt +15 -2
- vectordb_bench-0.0.11/vectordb_bench/backend/clients/pgvecto_rs/config.py +0 -127
- vectordb_bench-0.0.11/vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +0 -192
- vectordb_bench-0.0.11/vectordb_bench/frontend/components/run_test/caseSelector.py +0 -93
- vectordb_bench-0.0.11/vectordb_bench/frontend/components/run_test/dbConfigSetting.py +0 -59
- vectordb_bench-0.0.11/vectordb_bench/frontend/utils.py +0 -6
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.devcontainer/Dockerfile +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.devcontainer/devcontainer.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.env.example +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.github/workflows/publish_package_on_release.yml +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.github/workflows/pull_request.yml +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.ruff.toml +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/Dockerfile +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/LICENSE +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/Makefile +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/OWNERS +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/install.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/setup.cfg +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/conftest.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/pytest.ini +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_bench_runner.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_chroma.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_data_source.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_dataset.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_elasticsearch_cloud.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_models.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_redis.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_utils.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/ut_cases.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/__main__.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/__init__.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/chroma/chroma.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/chroma/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/elastic_cloud/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/milvus.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/pgvector.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pinecone/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pinecone/pinecone.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/qdrant_cloud/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/redis.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/test.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/config.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/data_source.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/result_collector.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/__init__.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/mp_runner.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/serial_runner.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/task_runner.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/utils.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/base.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/__init__.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/cli.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/config-files/sample_config.yml +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/footer.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/nav.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/get_results/saveAsImage.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/hideSidebar.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/tables/data.py +0 -0
- {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/dbPrices.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/interface.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/log_util.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/metric.py +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ElasticCloud/result_20230727_standard_elasticcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ElasticCloud/result_20230808_standard_elasticcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Milvus/result_20230727_standard_milvus.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Milvus/result_20230808_standard_milvus.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Pinecone/result_20230727_standard_pinecone.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Pinecone/result_20230808_standard_pinecone.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/QdrantCloud/result_20230727_standard_qdrantcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/QdrantCloud/result_20230808_standard_qdrantcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/WeaviateCloud/result_20230727_standard_weaviatecloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/WeaviateCloud/result_20230808_standard_weaviatecloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/dbPrices.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/leaderboard.json +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/dependency_links.txt +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/entry_points.txt +0 -0
- {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vectordb-bench
-Version: 0.0.11
+Version: 0.0.13
 Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
 Author-email: XuanYang-cn <xuan.yang@zilliz.com>
 Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -36,12 +36,14 @@ Requires-Dist: pinecone-client; extra == "all"
 Requires-Dist: weaviate-client; extra == "all"
 Requires-Dist: elasticsearch; extra == "all"
 Requires-Dist: pgvector; extra == "all"
+Requires-Dist: pgvecto_rs[psycopg3]>=0.2.1; extra == "all"
 Requires-Dist: sqlalchemy; extra == "all"
 Requires-Dist: redis; extra == "all"
 Requires-Dist: chromadb; extra == "all"
-Requires-Dist: psycopg2; extra == "all"
 Requires-Dist: psycopg; extra == "all"
 Requires-Dist: psycopg-binary; extra == "all"
+Requires-Dist: opensearch-dsl==2.1.0; extra == "all"
+Requires-Dist: opensearch-py==2.6.0; extra == "all"
 Provides-Extra: qdrant
 Requires-Dist: qdrant-client; extra == "qdrant"
 Provides-Extra: pinecone
@@ -54,12 +56,20 @@ Provides-Extra: pgvector
 Requires-Dist: psycopg; extra == "pgvector"
 Requires-Dist: psycopg-binary; extra == "pgvector"
 Requires-Dist: pgvector; extra == "pgvector"
+Provides-Extra: pgvectorscale
+Requires-Dist: psycopg; extra == "pgvectorscale"
+Requires-Dist: psycopg-binary; extra == "pgvectorscale"
+Requires-Dist: pgvector; extra == "pgvectorscale"
 Provides-Extra: pgvecto-rs
-Requires-Dist:
+Requires-Dist: pgvecto_rs[psycopg3]>=0.2.1; extra == "pgvecto-rs"
 Provides-Extra: redis
 Requires-Dist: redis; extra == "redis"
+Provides-Extra: memorydb
+Requires-Dist: memorydb; extra == "memorydb"
 Provides-Extra: chromadb
 Requires-Dist: chromadb; extra == "chromadb"
+Provides-Extra: awsopensearch
+Requires-Dist: awsopensearch; extra == "awsopensearch"
 Provides-Extra: zilliz-cloud
 
 # VectorDBBench: A Benchmark Tool for VectorDB
@@ -91,18 +101,21 @@ pip install vectordb-bench[pinecone]
 ```
 All the database client supported
 
-|Optional database client|install command|
-|
-|pymilvus(*default*)
-|all
-|qdrant
-|pinecone
-|weaviate
-|elastic
-|pgvector
-|pgvecto.rs
-|
-|
+| Optional database client | install command                             |
+|--------------------------|---------------------------------------------|
+| pymilvus(*default*)      | `pip install vectordb-bench`                |
+| all                      | `pip install vectordb-bench[all]`           |
+| qdrant                   | `pip install vectordb-bench[qdrant]`        |
+| pinecone                 | `pip install vectordb-bench[pinecone]`      |
+| weaviate                 | `pip install vectordb-bench[weaviate]`      |
+| elastic                  | `pip install vectordb-bench[elastic]`       |
+| pgvector                 | `pip install vectordb-bench[pgvector]`      |
+| pgvecto.rs               | `pip install vectordb-bench[pgvecto_rs]`    |
+| pgvectorscale            | `pip install vectordb-bench[pgvectorscale]` |
+| redis                    | `pip install vectordb-bench[redis]`         |
+| memorydb                 | `pip install vectordb-bench[memorydb]`      |
+| chromadb                 | `pip install vectordb-bench[chromadb]`      |
+| awsopensearch            | `pip install vectordb-bench[awsopensearch]` |
 
 ### Run
 
@@ -345,6 +358,24 @@ Case No. | Case Type | Dataset Size | Filtering Rate | Results |
 
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
 
+#### Custom Dataset for Performance case
+
+Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
+
+
+
+
+We have strict requirements for the data set format, please follow them.
+- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
+  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
+
+- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
+
+- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
+
+
 ## Goals
 Our goals of this benchmark are:
 ### Reproducibility & Usability
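Below is a minimal sketch (not part of this package) of producing Parquet files that satisfy the custom-dataset requirements described in the hunk above. The sizes, dimension, metric (plain L2 for the ground truth), and output file names are placeholder assumptions.

```python
# Illustrative sketch only: write train/test/neighbors Parquet files in the
# layout the /custom page expects. Sizes, dim, and metric are arbitrary here.
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

n_train, n_test, dim, k = 2_000, 100, 128, 10
rng = np.random.default_rng(0)
train = rng.random((n_train, dim), dtype=np.float32)
test = rng.random((n_test, dim), dtype=np.float32)

# train.parquet: incrementing int `id` + `emb` as an array of float32
pq.write_table(pa.table({
    "id": pa.array(list(range(n_train)), type=pa.int64()),
    "emb": pa.array(train.tolist(), type=pa.list_(pa.float32())),
}), "train.parquet")

# test.parquet: same schema for the query vectors
pq.write_table(pa.table({
    "id": pa.array(list(range(n_test)), type=pa.int64()),
    "emb": pa.array(test.tolist(), type=pa.list_(pa.float32())),
}), "test.parquet")

# neighbors.parquet: ids of the k nearest train vectors per query
# (brute-force squared-L2 here, purely for illustration)
dists = ((test[:, None, :] - train[None, :, :]) ** 2).sum(axis=-1)
neighbors = np.argsort(dists, axis=1)[:, :k]
pq.write_table(pa.table({
    "id": pa.array(list(range(n_test)), type=pa.int64()),
    "neighbors_id": pa.array(neighbors.tolist(), type=pa.list_(pa.int64())),
}), "neighbors.parquet")
```

For split files, each chunk would use the same schema under the `train-[index]-of-[file_count].parquet` naming described above (or the `shuffle_train-...` variants when the shuffled option is enabled).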
@@ -27,18 +27,21 @@ pip install vectordb-bench[pinecone]
 ```
 All the database client supported
 
-|Optional database client|install command|
-|
-|pymilvus(*default*)
-|all
-|qdrant
-|pinecone
-|weaviate
-|elastic
-|pgvector
-|pgvecto.rs
-|
-|
+| Optional database client | install command                             |
+|--------------------------|---------------------------------------------|
+| pymilvus(*default*)      | `pip install vectordb-bench`                |
+| all                      | `pip install vectordb-bench[all]`           |
+| qdrant                   | `pip install vectordb-bench[qdrant]`        |
+| pinecone                 | `pip install vectordb-bench[pinecone]`      |
+| weaviate                 | `pip install vectordb-bench[weaviate]`      |
+| elastic                  | `pip install vectordb-bench[elastic]`       |
+| pgvector                 | `pip install vectordb-bench[pgvector]`      |
+| pgvecto.rs               | `pip install vectordb-bench[pgvecto_rs]`    |
+| pgvectorscale            | `pip install vectordb-bench[pgvectorscale]` |
+| redis                    | `pip install vectordb-bench[redis]`         |
+| memorydb                 | `pip install vectordb-bench[memorydb]`      |
+| chromadb                 | `pip install vectordb-bench[chromadb]`      |
+| awsopensearch            | `pip install vectordb-bench[awsopensearch]` |
 
 ### Run
 
@@ -281,6 +284,24 @@ Case No. | Case Type | Dataset Size | Filtering Rate | Results |
 
 Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
 
+#### Custom Dataset for Performance case
+
+Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
+
+
+
+
+We have strict requirements for the data set format, please follow them.
+- `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
+  - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
+  - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
+
+- `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
+
+- `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
+
+
 ## Goals
 Our goals of this benchmark are:
 ### Reproducibility & Usability
Binary file: fig/custom_case_run_test.png (new)
Binary file: fig/custom_dataset.png (new)
@@ -56,12 +56,14 @@ all = [
     "weaviate-client",
     "elasticsearch",
     "pgvector",
+    "pgvecto_rs[psycopg3]>=0.2.1",
     "sqlalchemy",
     "redis",
     "chromadb",
-    "psycopg2",
     "psycopg",
     "psycopg-binary",
+    "opensearch-dsl==2.1.0",
+    "opensearch-py==2.6.0",
 ]
 
 qdrant = [ "qdrant-client" ]
@@ -69,9 +71,12 @@ pinecone = [ "pinecone-client" ]
 weaviate = [ "weaviate-client" ]
 elastic = [ "elasticsearch" ]
 pgvector = [ "psycopg", "psycopg-binary", "pgvector" ]
-
+pgvectorscale = [ "psycopg", "psycopg-binary", "pgvector" ]
+pgvecto_rs = [ "pgvecto_rs[psycopg3]>=0.2.1" ]
 redis = [ "redis" ]
+memorydb = [ "memorydb" ]
 chromadb = [ "chromadb" ]
+awsopensearch = [ "awsopensearch" ]
 zilliz_cloud = []
 
 [project.urls]
@@ -80,4 +85,5 @@ zilliz_cloud = []
 [project.scripts]
 init_bench = "vectordb_bench.__main__:main"
 vectordbbench = "vectordb_bench.cli.vectordbbench:cli"
+
 [tool.setuptools_scm]
@@ -35,6 +35,7 @@ class config:
 
 
     K_DEFAULT = 100  # default return top k nearest neighbors during search
+    CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")
 
     CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600  # 24h
     LOAD_TIMEOUT_DEFAULT = 2.5 * 3600  # 2.5h
@@ -14,7 +14,7 @@ class Assembler:
     def assemble(cls, run_id , task: TaskConfig, source: DatasetSource) -> CaseRunner:
         c_cls = task.case_config.case_id.case_cls
 
-        c = c_cls()
+        c = c_cls(task.case_config.custom_case)
         if type(task.db_case_config) != EmptyDBCaseConfig:
             task.db_case_config.metric_type = c.dataset.data.metric_type
 
@@ -4,9 +4,13 @@ from enum import Enum, auto
 from typing import Type
 
 from vectordb_bench import config
+from vectordb_bench.backend.clients.api import MetricType
 from vectordb_bench.base import BaseModel
+from vectordb_bench.frontend.components.custom.getCustomConfig import (
+    CustomDatasetConfig,
+)
 
-from .dataset import Dataset, DatasetManager
+from .dataset import CustomDataset, Dataset, DatasetManager
 
 
 log = logging.getLogger(__name__)
@@ -44,25 +48,24 @@ class CaseType(Enum):
     Performance1536D50K = 50
 
     Custom = 100
+    PerformanceCustomDataset = 101
 
-    @property
     def case_cls(self, custom_configs: dict | None = None) -> Type["Case"]:
-        if
-
-
+        if custom_configs is None:
+            return type2case.get(self)()
+        else:
+            return type2case.get(self)(**custom_configs)
 
-
-
-        c = self.case_cls
+    def case_name(self, custom_configs: dict | None = None) -> str:
+        c = self.case_cls(custom_configs)
         if c is not None:
-            return c
+            return c.name
         raise ValueError("Case unsupported")
 
-
-
-        c = self.case_cls
+    def case_description(self, custom_configs: dict | None = None) -> str:
+        c = self.case_cls(custom_configs)
         if c is not None:
-            return c
+            return c.description
         raise ValueError("Case unsupported")
 
 
@@ -289,26 +292,69 @@ Results will show index building time, recall, and maximum QPS."""
     optimize_timeout: float | int | None = 15 * 60
 
 
+def metric_type_map(s: str) -> MetricType:
+    if s.lower() == "cosine":
+        return MetricType.COSINE
+    if s.lower() == "l2" or s.lower() == "euclidean":
+        return MetricType.L2
+    if s.lower() == "ip":
+        return MetricType.IP
+    err_msg = f"Not support metric_type: {s}"
+    log.error(err_msg)
+    raise RuntimeError(err_msg)
+
+
+class PerformanceCustomDataset(PerformanceCase):
+    case_id: CaseType = CaseType.PerformanceCustomDataset
+    name: str = "Performance With Custom Dataset"
+    description: str = ""
+    dataset: DatasetManager
+
+    def __init__(
+        self,
+        name,
+        description,
+        load_timeout,
+        optimize_timeout,
+        dataset_config,
+        **kwargs,
+    ):
+        dataset_config = CustomDatasetConfig(**dataset_config)
+        dataset = CustomDataset(
+            name=dataset_config.name,
+            size=dataset_config.size,
+            dim=dataset_config.dim,
+            metric_type=metric_type_map(dataset_config.metric_type),
+            use_shuffled=dataset_config.use_shuffled,
+            with_gt=dataset_config.with_gt,
+            dir=dataset_config.dir,
+            file_num=dataset_config.file_count,
+        )
+        super().__init__(
+            name=name,
+            description=description,
+            load_timeout=load_timeout,
+            optimize_timeout=optimize_timeout,
+            dataset=DatasetManager(data=dataset),
+        )
+
+
 type2case = {
     CaseType.CapacityDim960: CapacityDim960,
     CaseType.CapacityDim128: CapacityDim128,
-
     CaseType.Performance768D100M: Performance768D100M,
     CaseType.Performance768D10M: Performance768D10M,
     CaseType.Performance768D1M: Performance768D1M,
-
     CaseType.Performance768D10M1P: Performance768D10M1P,
     CaseType.Performance768D1M1P: Performance768D1M1P,
     CaseType.Performance768D10M99P: Performance768D10M99P,
     CaseType.Performance768D1M99P: Performance768D1M99P,
-
     CaseType.Performance1536D500K: Performance1536D500K,
     CaseType.Performance1536D5M: Performance1536D5M,
-
     CaseType.Performance1536D500K1P: Performance1536D500K1P,
     CaseType.Performance1536D5M1P: Performance1536D5M1P,
-
     CaseType.Performance1536D500K99P: Performance1536D500K99P,
     CaseType.Performance1536D5M99P: Performance1536D5M99P,
     CaseType.Performance1536D50K: Performance1536D50K,
+    CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
 }
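To make the flow above concrete, here is a hedged sketch of how a custom-case dict would be resolved through `CaseType.PerformanceCustomDataset`. The keys mirror the `PerformanceCustomDataset.__init__` parameters and the `CustomDatasetConfig` attributes accessed in this hunk; the values and the dataset directory are placeholders (constructing the case will read and validate the configured folder).

```python
# Sketch only: keys follow the constructor shown above, values are placeholders.
from vectordb_bench.backend.cases import CaseType

custom_configs = {
    "name": "My Custom Performance Case",
    "description": "local 768-dim dataset",
    "load_timeout": 36000,
    "optimize_timeout": 36000,
    "dataset_config": {
        "name": "my_dataset",
        "dir": "/data/my_dataset",   # folder holding train/test/neighbors parquet files
        "size": 1_000_000,
        "dim": 768,
        "metric_type": "cosine",     # resolved by metric_type_map() above
        "file_count": 1,
        "use_shuffled": False,
        "with_gt": True,
    },
}

# case_cls(custom_configs) forwards the dict as **kwargs to PerformanceCustomDataset
case = CaseType.PerformanceCustomDataset.case_cls(custom_configs)
print(CaseType.PerformanceCustomDataset.case_name(custom_configs))
```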
@@ -30,8 +30,11 @@ class DB(Enum):
     WeaviateCloud = "WeaviateCloud"
     PgVector = "PgVector"
     PgVectoRS = "PgVectoRS"
+    PgVectorScale = "PgVectorScale"
     Redis = "Redis"
+    MemoryDB = "MemoryDB"
     Chroma = "Chroma"
+    AWSOpenSearch = "OpenSearch"
     Test = "test"
 
 
@@ -69,15 +72,27 @@ class DB(Enum):
         if self == DB.PgVectoRS:
             from .pgvecto_rs.pgvecto_rs import PgVectoRS
             return PgVectoRS
+
+        if self == DB.PgVectorScale:
+            from .pgvectorscale.pgvectorscale import PgVectorScale
+            return PgVectorScale
 
         if self == DB.Redis:
             from .redis.redis import Redis
             return Redis
+
+        if self == DB.MemoryDB:
+            from .memorydb.memorydb import MemoryDB
+            return MemoryDB
 
         if self == DB.Chroma:
             from .chroma.chroma import ChromaClient
             return ChromaClient
 
+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.aws_opensearch import AWSOpenSearch
+            return AWSOpenSearch
+
     @property
     def config_cls(self) -> Type[DBConfig]:
         """Import while in use"""
@@ -113,14 +128,26 @@ class DB(Enum):
             from .pgvecto_rs.config import PgVectoRSConfig
             return PgVectoRSConfig
 
+        if self == DB.PgVectorScale:
+            from .pgvectorscale.config import PgVectorScaleConfig
+            return PgVectorScaleConfig
+
         if self == DB.Redis:
             from .redis.config import RedisConfig
             return RedisConfig
+
+        if self == DB.MemoryDB:
+            from .memorydb.config import MemoryDBConfig
+            return MemoryDBConfig
 
         if self == DB.Chroma:
             from .chroma.config import ChromaConfig
             return ChromaConfig
 
+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.config import AWSOpenSearchConfig
+            return AWSOpenSearchConfig
+
     def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
         if self == DB.Milvus:
             from .milvus.config import _milvus_case_config
@@ -150,6 +177,14 @@ class DB(Enum):
             from .pgvecto_rs.config import _pgvecto_rs_case_config
             return _pgvecto_rs_case_config.get(index_type)
 
+        if self == DB.AWSOpenSearch:
+            from .aws_opensearch.config import AWSOpenSearchIndexConfig
+            return AWSOpenSearchIndexConfig
+
+        if self == DB.PgVectorScale:
+            from .pgvectorscale.config import _pgvectorscale_case_config
+            return _pgvectorscale_case_config.get(index_type)
+
         # DB.Pinecone, DB.Chroma, DB.Redis
         return EmptyDBCaseConfig
 
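A small illustrative sketch of what the lazy dispatch above buys: each vendor module is imported only when the corresponding `DB` member is actually used, so installing a single extra is enough for that backend.

```python
# Illustration only, based on the branches added in this hunk.
from vectordb_bench.backend.clients import DB

cfg_cls = DB.AWSOpenSearch.config_cls               # property: imports AWSOpenSearchConfig lazily
case_cfg_cls = DB.AWSOpenSearch.case_config_cls()   # -> AWSOpenSearchIndexConfig
mem_cfg_cls = DB.MemoryDB.config_cls                # -> MemoryDBConfig, imported only here
```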
@@ -15,6 +15,7 @@ class MetricType(str, Enum):
 class IndexType(str, Enum):
     HNSW = "HNSW"
     DISKANN = "DISKANN"
+    STREAMING_DISKANN = "DISKANN"
     IVFFlat = "IVF_FLAT"
     IVFSQ8 = "IVF_SQ8"
     Flat = "FLAT"
@@ -38,6 +39,22 @@ class DBConfig(ABC, BaseModel):
     """
 
     db_label: str = ""
+    version: str = ""
+    note: str = ""
+
+    @staticmethod
+    def common_short_configs() -> list[str]:
+        """
+        short input, such as `db_label`, `version`
+        """
+        return ["version", "db_label"]
+
+    @staticmethod
+    def common_long_configs() -> list[str]:
+        """
+        long input, such as `note`
+        """
+        return ["note"]
 
     @abstractmethod
     def to_dict(self) -> dict:
@@ -45,7 +62,10 @@ class DBConfig(ABC, BaseModel):
 
     @validator("*")
     def not_empty_field(cls, v, field):
-        if
+        if (
+            field.name in cls.common_short_configs()
+            or field.name in cls.common_long_configs()
+        ):
             return v
         if not v and isinstance(v, (str, SecretStr)):
             raise ValueError("Empty string!")
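A hedged sketch of what the relaxed validator means in practice: `version`, `db_label`, and `note` may stay empty, while other string fields still reject empty values. The subclass below is hypothetical and only meant to exercise the rule.

```python
# Hypothetical subclass for illustration; not part of the diff.
from vectordb_bench.backend.clients.api import DBConfig

class ExampleDBConfig(DBConfig):
    uri: str = ""

    def to_dict(self) -> dict:
        return {"uri": self.uri}

ExampleDBConfig(uri="http://localhost:19530", version="", note="")  # ok: version/note may be empty
# ExampleDBConfig(uri="")  # rejected by not_empty_field -> "Empty string!"
```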
@@ -0,0 +1,159 @@
+import logging
+from contextlib import contextmanager
+import time
+from typing import Iterable, Type
+from ..api import VectorDB, DBCaseConfig, DBConfig, IndexType
+from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
+from opensearchpy import OpenSearch
+from opensearchpy.helpers import bulk
+
+log = logging.getLogger(__name__)
+
+
+class AWSOpenSearch(VectorDB):
+    def __init__(
+        self,
+        dim: int,
+        db_config: dict,
+        db_case_config: AWSOpenSearchIndexConfig,
+        index_name: str = "vdb_bench_index",  # must be lowercase
+        id_col_name: str = "id",
+        vector_col_name: str = "embedding",
+        drop_old: bool = False,
+        **kwargs,
+    ):
+        self.dim = dim
+        self.db_config = db_config
+        self.case_config = db_case_config
+        self.index_name = index_name
+        self.id_col_name = id_col_name
+        self.category_col_names = [
+            f"scalar-{categoryCount}" for categoryCount in [2, 5, 10, 100, 1000]
+        ]
+        self.vector_col_name = vector_col_name
+
+        log.info(f"AWS_OpenSearch client config: {self.db_config}")
+        client = OpenSearch(**self.db_config)
+        if drop_old:
+            log.info(f"AWS_OpenSearch client drop old index: {self.index_name}")
+            is_existed = client.indices.exists(index=self.index_name)
+            if is_existed:
+                client.indices.delete(index=self.index_name)
+            self._create_index(client)
+
+    @classmethod
+    def config_cls(cls) -> AWSOpenSearchConfig:
+        return AWSOpenSearchConfig
+
+    @classmethod
+    def case_config_cls(
+        cls, index_type: IndexType | None = None
+    ) -> AWSOpenSearchIndexConfig:
+        return AWSOpenSearchIndexConfig
+
+    def _create_index(self, client: OpenSearch):
+        settings = {
+            "index": {
+                "knn": True,
+                # "number_of_shards": 5,
+                # "refresh_interval": "600s",
+            }
+        }
+        mappings = {
+            "properties": {
+                self.id_col_name: {"type": "integer"},
+                **{
+                    categoryCol: {"type": "keyword"}
+                    for categoryCol in self.category_col_names
+                },
+                self.vector_col_name: {
+                    "type": "knn_vector",
+                    "dimension": self.dim,
+                    "method": self.case_config.index_param(),
+                },
+            }
+        }
+        try:
+            client.indices.create(
+                index=self.index_name, body=dict(settings=settings, mappings=mappings)
+            )
+        except Exception as e:
+            log.warning(f"Failed to create index: {self.index_name} error: {str(e)}")
+            raise e from None
+
+    @contextmanager
+    def init(self) -> None:
+        """connect to elasticsearch"""
+        self.client = OpenSearch(**self.db_config)
+
+        yield
+        # self.client.transport.close()
+        self.client = None
+        del self.client
+
+    def insert_embeddings(
+        self,
+        embeddings: Iterable[list[float]],
+        metadata: list[int],
+        **kwargs,
+    ) -> tuple[int, Exception]:
+        """Insert the embeddings to the elasticsearch."""
+        assert self.client is not None, "should self.init() first"
+
+        insert_data = []
+        for i in range(len(embeddings)):
+            insert_data.append({"index": {"_index": self.index_name, "_id": metadata[i]}})
+            insert_data.append({self.vector_col_name: embeddings[i]})
+        try:
+            resp = self.client.bulk(insert_data)
+            log.info(f"AWS_OpenSearch adding documents: {len(resp['items'])}")
+            resp = self.client.indices.stats(self.index_name)
+            log.info(f"Total document count in index: {resp['_all']['primaries']['indexing']['index_total']}")
+            return (len(embeddings), None)
+        except Exception as e:
+            log.warning(f"Failed to insert data: {self.index_name} error: {str(e)}")
+            time.sleep(10)
+            return self.insert_embeddings(embeddings, metadata)
+
+    def search_embedding(
+        self,
+        query: list[float],
+        k: int = 100,
+        filters: dict | None = None,
+    ) -> list[int]:
+        """Get k most similar embeddings to query vector.
+
+        Args:
+            query(list[float]): query embedding to look up documents similar to.
+            k(int): Number of most similar embeddings to return. Defaults to 100.
+            filters(dict, optional): filtering expression to filter the data while searching.
+
+        Returns:
+            list[tuple[int, float]]: list of k most similar embeddings in (id, score) tuple to the query embedding.
+        """
+        assert self.client is not None, "should self.init() first"
+
+        body = {
+            "size": k,
+            "query": {"knn": {self.vector_col_name: {"vector": query, "k": k}}},
+        }
+        try:
+            resp = self.client.search(index=self.index_name, body=body)
+            log.info(f'Search took: {resp["took"]}')
+            log.info(f'Search shards: {resp["_shards"]}')
+            log.info(f'Search hits total: {resp["hits"]["total"]}')
+            result = [int(d["_id"]) for d in resp["hits"]["hits"]]
+            # log.info(f'success! length={len(res)}')
+
+            return result
+        except Exception as e:
+            log.warning(f"Failed to search: {self.index_name} error: {str(e)}")
+            raise e from None
+
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases."""
+        pass
+
+    def ready_to_load(self):
+        """ready_to_load will be called before load in load cases."""
+        pass