vectordb-bench 0.0.11__tar.gz → 0.0.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156)
  1. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.gitignore +1 -0
  2. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/PKG-INFO +46 -15
  3. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/README.md +33 -12
  4. vectordb_bench-0.0.13/fig/custom_case_run_test.png +0 -0
  5. vectordb_bench-0.0.13/fig/custom_dataset.png +0 -0
  6. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/install/requirements_py3.11.txt +1 -0
  7. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/pyproject.toml +8 -2
  8. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/__init__.py +1 -0
  9. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/assembler.py +1 -1
  10. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/cases.py +64 -18
  11. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/__init__.py +35 -0
  12. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/api.py +21 -1
  13. vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/aws_opensearch.py +159 -0
  14. vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/cli.py +44 -0
  15. vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/config.py +58 -0
  16. vectordb_bench-0.0.13/vectordb_bench/backend/clients/aws_opensearch/run.py +125 -0
  17. vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/cli.py +88 -0
  18. vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/config.py +54 -0
  19. vectordb_bench-0.0.13/vectordb_bench/backend/clients/memorydb/memorydb.py +254 -0
  20. vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/cli.py +154 -0
  21. vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/config.py +162 -0
  22. vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +292 -0
  23. vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvectorscale/config.py +111 -0
  24. vectordb_bench-0.0.13/vectordb_bench/backend/clients/pgvectorscale/pgvectorscale.py +272 -0
  25. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/dataset.py +27 -5
  26. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/vectordbbench.py +7 -0
  27. vectordb_bench-0.0.13/vectordb_bench/custom/custom_case.json +18 -0
  28. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/charts.py +6 -6
  29. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/data.py +18 -11
  30. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/expanderStyle.py +1 -1
  31. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/filters.py +20 -13
  32. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/headerIcon.py +1 -1
  33. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/priceTable.py +1 -1
  34. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/stPageConfig.py +1 -1
  35. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/concurrent/charts.py +26 -29
  36. vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/displayCustomCase.py +31 -0
  37. vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/displaypPrams.py +11 -0
  38. vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/getCustomConfig.py +40 -0
  39. vectordb_bench-0.0.13/vectordb_bench/frontend/components/custom/initStyle.py +15 -0
  40. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/autoRefresh.py +1 -1
  41. vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/caseSelector.py +115 -0
  42. vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/dbConfigSetting.py +77 -0
  43. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/dbSelector.py +2 -14
  44. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/generateTasks.py +3 -5
  45. vectordb_bench-0.0.13/vectordb_bench/frontend/components/run_test/initStyle.py +16 -0
  46. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/submitTask.py +1 -1
  47. {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/dbCaseConfigs.py +311 -40
  48. {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/styles.py +2 -0
  49. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/concurrent.py +11 -18
  50. vectordb_bench-0.0.13/vectordb_bench/frontend/pages/custom.py +64 -0
  51. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/quries_per_dollar.py +5 -5
  52. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/run_test.py +4 -0
  53. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/pages/tables.py +2 -2
  54. vectordb_bench-0.0.13/vectordb_bench/frontend/utils.py +22 -0
  55. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/vdb_benchmark.py +3 -3
  56. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/models.py +26 -10
  57. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/getLeaderboardData.py +1 -1
  58. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/PKG-INFO +46 -15
  59. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/SOURCES.txt +22 -3
  60. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/requires.txt +15 -2
  61. vectordb_bench-0.0.11/vectordb_bench/backend/clients/pgvecto_rs/config.py +0 -127
  62. vectordb_bench-0.0.11/vectordb_bench/backend/clients/pgvecto_rs/pgvecto_rs.py +0 -192
  63. vectordb_bench-0.0.11/vectordb_bench/frontend/components/run_test/caseSelector.py +0 -93
  64. vectordb_bench-0.0.11/vectordb_bench/frontend/components/run_test/dbConfigSetting.py +0 -59
  65. vectordb_bench-0.0.11/vectordb_bench/frontend/utils.py +0 -6
  66. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.devcontainer/Dockerfile +0 -0
  67. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.devcontainer/devcontainer.json +0 -0
  68. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.env.example +0 -0
  69. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.github/workflows/publish_package_on_release.yml +0 -0
  70. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.github/workflows/pull_request.yml +0 -0
  71. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/.ruff.toml +0 -0
  72. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/Dockerfile +0 -0
  73. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/LICENSE +0 -0
  74. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/Makefile +0 -0
  75. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/OWNERS +0 -0
  76. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/install.py +0 -0
  77. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/setup.cfg +0 -0
  78. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/conftest.py +0 -0
  79. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/pytest.ini +0 -0
  80. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_bench_runner.py +0 -0
  81. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_chroma.py +0 -0
  82. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_data_source.py +0 -0
  83. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_dataset.py +0 -0
  84. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_elasticsearch_cloud.py +0 -0
  85. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_models.py +0 -0
  86. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_redis.py +0 -0
  87. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/test_utils.py +0 -0
  88. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/tests/ut_cases.py +0 -0
  89. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/__main__.py +0 -0
  90. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/__init__.py +0 -0
  91. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/chroma/chroma.py +0 -0
  92. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/chroma/config.py +0 -0
  93. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/elastic_cloud/config.py +0 -0
  94. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py +0 -0
  95. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/cli.py +0 -0
  96. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/config.py +0 -0
  97. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/milvus/milvus.py +0 -0
  98. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/cli.py +0 -0
  99. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/config.py +0 -0
  100. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pgvector/pgvector.py +0 -0
  101. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pinecone/config.py +0 -0
  102. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/pinecone/pinecone.py +0 -0
  103. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/qdrant_cloud/config.py +0 -0
  104. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py +0 -0
  105. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/cli.py +0 -0
  106. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/config.py +0 -0
  107. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/redis/redis.py +0 -0
  108. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/cli.py +0 -0
  109. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/config.py +0 -0
  110. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/test/test.py +0 -0
  111. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/cli.py +0 -0
  112. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/config.py +0 -0
  113. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py +0 -0
  114. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/cli.py +0 -0
  115. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/config.py +0 -0
  116. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/clients/zilliz_cloud/zilliz_cloud.py +0 -0
  117. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/data_source.py +0 -0
  118. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/result_collector.py +0 -0
  119. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/__init__.py +0 -0
  120. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/mp_runner.py +0 -0
  121. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/runner/serial_runner.py +0 -0
  122. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/task_runner.py +0 -0
  123. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/backend/utils.py +0 -0
  124. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/base.py +0 -0
  125. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/__init__.py +0 -0
  126. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/cli/cli.py +0 -0
  127. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/config-files/sample_config.yml +0 -0
  128. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/footer.py +0 -0
  129. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/check_results/nav.py +0 -0
  130. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/get_results/saveAsImage.py +0 -0
  131. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/run_test/hideSidebar.py +0 -0
  132. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/frontend/components/tables/data.py +0 -0
  133. {vectordb_bench-0.0.11/vectordb_bench/frontend/const → vectordb_bench-0.0.13/vectordb_bench/frontend/config}/dbPrices.py +0 -0
  134. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/interface.py +0 -0
  135. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/log_util.py +0 -0
  136. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/metric.py +0 -0
  137. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ElasticCloud/result_20230727_standard_elasticcloud.json +0 -0
  138. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ElasticCloud/result_20230808_standard_elasticcloud.json +0 -0
  139. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Milvus/result_20230727_standard_milvus.json +0 -0
  140. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Milvus/result_20230808_standard_milvus.json +0 -0
  141. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/PgVector/result_20230727_standard_pgvector.json +0 -0
  142. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/PgVector/result_20230808_standard_pgvector.json +0 -0
  143. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Pinecone/result_20230727_standard_pinecone.json +0 -0
  144. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/Pinecone/result_20230808_standard_pinecone.json +0 -0
  145. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/QdrantCloud/result_20230727_standard_qdrantcloud.json +0 -0
  146. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/QdrantCloud/result_20230808_standard_qdrantcloud.json +0 -0
  147. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/WeaviateCloud/result_20230727_standard_weaviatecloud.json +0 -0
  148. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/WeaviateCloud/result_20230808_standard_weaviatecloud.json +0 -0
  149. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20230727_standard_zillizcloud.json +0 -0
  150. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20230808_standard_zillizcloud.json +0 -0
  151. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/ZillizCloud/result_20240105_standard_202401_zillizcloud.json +0 -0
  152. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/dbPrices.json +0 -0
  153. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench/results/leaderboard.json +0 -0
  154. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/dependency_links.txt +0 -0
  155. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/entry_points.txt +0 -0
  156. {vectordb_bench-0.0.11 → vectordb_bench-0.0.13}/vectordb_bench.egg-info/top_level.txt +0 -0
@@ -9,3 +9,4 @@ __MACOSX
9
9
  build/
10
10
  venv/
11
11
  .idea/
12
+ results/
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vectordb-bench
3
- Version: 0.0.11
3
+ Version: 0.0.13
4
4
  Summary: VectorDBBench is not just an offering of benchmark results for mainstream vector databases and cloud services, it's your go-to tool for the ultimate performance and cost-effectiveness comparison. Designed with ease-of-use in mind, VectorDBBench is devised to help users, even non-professionals, reproduce results or test new systems, making the hunt for the optimal choice amongst a plethora of cloud services and open-source vector databases a breeze.
5
5
  Author-email: XuanYang-cn <xuan.yang@zilliz.com>
6
6
  Project-URL: repository, https://github.com/zilliztech/VectorDBBench
@@ -36,12 +36,14 @@ Requires-Dist: pinecone-client; extra == "all"
36
36
  Requires-Dist: weaviate-client; extra == "all"
37
37
  Requires-Dist: elasticsearch; extra == "all"
38
38
  Requires-Dist: pgvector; extra == "all"
39
+ Requires-Dist: pgvecto_rs[psycopg3]>=0.2.1; extra == "all"
39
40
  Requires-Dist: sqlalchemy; extra == "all"
40
41
  Requires-Dist: redis; extra == "all"
41
42
  Requires-Dist: chromadb; extra == "all"
42
- Requires-Dist: psycopg2; extra == "all"
43
43
  Requires-Dist: psycopg; extra == "all"
44
44
  Requires-Dist: psycopg-binary; extra == "all"
45
+ Requires-Dist: opensearch-dsl==2.1.0; extra == "all"
46
+ Requires-Dist: opensearch-py==2.6.0; extra == "all"
45
47
  Provides-Extra: qdrant
46
48
  Requires-Dist: qdrant-client; extra == "qdrant"
47
49
  Provides-Extra: pinecone
@@ -54,12 +56,20 @@ Provides-Extra: pgvector
54
56
  Requires-Dist: psycopg; extra == "pgvector"
55
57
  Requires-Dist: psycopg-binary; extra == "pgvector"
56
58
  Requires-Dist: pgvector; extra == "pgvector"
59
+ Provides-Extra: pgvectorscale
60
+ Requires-Dist: psycopg; extra == "pgvectorscale"
61
+ Requires-Dist: psycopg-binary; extra == "pgvectorscale"
62
+ Requires-Dist: pgvector; extra == "pgvectorscale"
57
63
  Provides-Extra: pgvecto-rs
58
- Requires-Dist: psycopg2; extra == "pgvecto-rs"
64
+ Requires-Dist: pgvecto_rs[psycopg3]>=0.2.1; extra == "pgvecto-rs"
59
65
  Provides-Extra: redis
60
66
  Requires-Dist: redis; extra == "redis"
67
+ Provides-Extra: memorydb
68
+ Requires-Dist: memorydb; extra == "memorydb"
61
69
  Provides-Extra: chromadb
62
70
  Requires-Dist: chromadb; extra == "chromadb"
71
+ Provides-Extra: awsopensearch
72
+ Requires-Dist: awsopensearch; extra == "awsopensearch"
63
73
  Provides-Extra: zilliz-cloud
64
74
 
65
75
  # VectorDBBench: A Benchmark Tool for VectorDB
@@ -91,18 +101,21 @@ pip install vectordb-bench[pinecone]
91
101
  ```
92
102
  All supported database clients
93
103
 
94
- |Optional database client|install command|
95
- |---------------|---------------|
96
- |pymilvus(*default*)|`pip install vectordb-bench`|
97
- |all|`pip install vectordb-bench[all]`|
98
- |qdrant|`pip install vectordb-bench[qdrant]`|
99
- |pinecone|`pip install vectordb-bench[pinecone]`|
100
- |weaviate|`pip install vectordb-bench[weaviate]`|
101
- |elastic|`pip install vectordb-bench[elastic]`|
102
- |pgvector|`pip install vectordb-bench[pgvector]`|
103
- |pgvecto.rs|`pip install vectordb-bench[pgvecto_rs]`|
104
- |redis|`pip install vectordb-bench[redis]`|
105
- |chromadb|`pip install vectordb-bench[chromadb]`|
104
+ | Optional database client | install command |
105
+ |--------------------------|---------------------------------------------|
106
+ | pymilvus(*default*) | `pip install vectordb-bench` |
107
+ | all | `pip install vectordb-bench[all]` |
108
+ | qdrant | `pip install vectordb-bench[qdrant]` |
109
+ | pinecone | `pip install vectordb-bench[pinecone]` |
110
+ | weaviate | `pip install vectordb-bench[weaviate]` |
111
+ | elastic | `pip install vectordb-bench[elastic]` |
112
+ | pgvector | `pip install vectordb-bench[pgvector]` |
113
+ | pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
114
+ | pgvectorscale | `pip install vectordb-bench[pgvectorscale]` |
115
+ | redis | `pip install vectordb-bench[redis]` |
116
+ | memorydb | `pip install vectordb-bench[memorydb]` |
117
+ | chromadb | `pip install vectordb-bench[chromadb]` |
118
+ | awsopensearch | `pip install vectordb-bench[awsopensearch]` |
106
119
 
107
120
  ### Run
108
121
 
@@ -345,6 +358,24 @@ Case No. | Case Type | Dataset Size | Filtering Rate | Results |
345
358
 
346
359
  Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
347
360
 
361
+ #### Custom Dataset for Performance case
362
+
363
+ Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
364
+
365
+ ![image](fig/custom_dataset.png)
366
+ ![image](fig/custom_case_run_test.png)
367
+
368
+ We have strict requirements for the dataset format; please follow them.
369
+ - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
370
+ - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
371
+ - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
372
+ - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
373
+
374
+ - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
375
+
376
+ - `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
377
+
378
+
348
379
  ## Goals
349
380
  Our goals of this benchmark are:
350
381
  ### Reproducibility & Usability
@@ -27,18 +27,21 @@ pip install vectordb-bench[pinecone]
27
27
  ```
28
28
  All supported database clients
29
29
 
30
- |Optional database client|install command|
31
- |---------------|---------------|
32
- |pymilvus(*default*)|`pip install vectordb-bench`|
33
- |all|`pip install vectordb-bench[all]`|
34
- |qdrant|`pip install vectordb-bench[qdrant]`|
35
- |pinecone|`pip install vectordb-bench[pinecone]`|
36
- |weaviate|`pip install vectordb-bench[weaviate]`|
37
- |elastic|`pip install vectordb-bench[elastic]`|
38
- |pgvector|`pip install vectordb-bench[pgvector]`|
39
- |pgvecto.rs|`pip install vectordb-bench[pgvecto_rs]`|
40
- |redis|`pip install vectordb-bench[redis]`|
41
- |chromadb|`pip install vectordb-bench[chromadb]`|
30
+ | Optional database client | install command |
31
+ |--------------------------|---------------------------------------------|
32
+ | pymilvus(*default*) | `pip install vectordb-bench` |
33
+ | all | `pip install vectordb-bench[all]` |
34
+ | qdrant | `pip install vectordb-bench[qdrant]` |
35
+ | pinecone | `pip install vectordb-bench[pinecone]` |
36
+ | weaviate | `pip install vectordb-bench[weaviate]` |
37
+ | elastic | `pip install vectordb-bench[elastic]` |
38
+ | pgvector | `pip install vectordb-bench[pgvector]` |
39
+ | pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
40
+ | pgvectorscale | `pip install vectordb-bench[pgvectorscale]` |
41
+ | redis | `pip install vectordb-bench[redis]` |
42
+ | memorydb | `pip install vectordb-bench[memorydb]` |
43
+ | chromadb | `pip install vectordb-bench[chromadb]` |
44
+ | awsopensearch | `pip install vectordb-bench[awsopensearch]` |
42
45
 
43
46
  ### Run
44
47
 
@@ -281,6 +284,24 @@ Case No. | Case Type | Dataset Size | Filtering Rate | Results |
281
284
 
282
285
  Each case provides an in-depth examination of a vector database's abilities, providing you a comprehensive view of the database's performance.
283
286
 
287
+ #### Custom Dataset for Performance case
288
+
289
+ Through the `/custom` page, users can customize their own performance case using local datasets. After saving, the corresponding case can be selected from the `/run_test` page to perform the test.
290
+
291
+ ![image](fig/custom_dataset.png)
292
+ ![image](fig/custom_case_run_test.png)
293
+
294
+ We have strict requirements for the dataset format; please follow them.
295
+ - `Folder Path` - The path to the folder containing all the files. Please ensure that all files in the folder are in the `Parquet` format.
296
+ - Vectors data files: The file must be named `train.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
297
+ - Query test vectors: The file must be named `test.parquet` and should have two columns: `id` as an incrementing `int` and `emb` as an array of `float32`.
298
+ - Ground truth file: The file must be named `neighbors.parquet` and should have two columns: `id` corresponding to query vectors and `neighbors_id` as an array of `int`.
299
+
300
+ - `Train File Count` - If the vector file is too large, you can consider splitting it into multiple files. The naming format for the split files should be `train-[index]-of-[file_count].parquet`. For example, `train-01-of-10.parquet` represents the second file (0-indexed) among 10 split files.
301
+
302
+ - `Use Shuffled Data` - If you check this option, the vector data files need to be modified. VectorDBBench will load the data labeled with `shuffle`. For example, use `shuffle_train.parquet` instead of `train.parquet` and `shuffle_train-04-of-10.parquet` instead of `train-04-of-10.parquet`. The `id` column in the shuffled data can be in any order.
303
+
304
+
284
305
  ## Goals
285
306
  Our goals of this benchmark are:
286
307
  ### Reproducibility & Usability
@@ -5,6 +5,7 @@ pinecone-client
5
5
  weaviate-client
6
6
  elasticsearch
7
7
  pgvector
8
+ pgvecto_rs[psycopg3]>=0.2.1
8
9
  sqlalchemy
9
10
  redis
10
11
  chromadb
@@ -56,12 +56,14 @@ all = [
56
56
  "weaviate-client",
57
57
  "elasticsearch",
58
58
  "pgvector",
59
+ "pgvecto_rs[psycopg3]>=0.2.1",
59
60
  "sqlalchemy",
60
61
  "redis",
61
62
  "chromadb",
62
- "psycopg2",
63
63
  "psycopg",
64
64
  "psycopg-binary",
65
+ "opensearch-dsl==2.1.0",
66
+ "opensearch-py==2.6.0",
65
67
  ]
66
68
 
67
69
  qdrant = [ "qdrant-client" ]
@@ -69,9 +71,12 @@ pinecone = [ "pinecone-client" ]
69
71
  weaviate = [ "weaviate-client" ]
70
72
  elastic = [ "elasticsearch" ]
71
73
  pgvector = [ "psycopg", "psycopg-binary", "pgvector" ]
72
- pgvecto_rs = [ "psycopg2" ]
74
+ pgvectorscale = [ "psycopg", "psycopg-binary", "pgvector" ]
75
+ pgvecto_rs = [ "pgvecto_rs[psycopg3]>=0.2.1" ]
73
76
  redis = [ "redis" ]
77
+ memorydb = [ "memorydb" ]
74
78
  chromadb = [ "chromadb" ]
79
+ awsopensearch = [ "awsopensearch" ]
75
80
  zilliz_cloud = []
76
81
 
77
82
  [project.urls]
@@ -80,4 +85,5 @@ zilliz_cloud = []
80
85
  [project.scripts]
81
86
  init_bench = "vectordb_bench.__main__:main"
82
87
  vectordbbench = "vectordb_bench.cli.vectordbbench:cli"
88
+
83
89
  [tool.setuptools_scm]
@@ -35,6 +35,7 @@ class config:
35
35
 
36
36
 
37
37
  K_DEFAULT = 100 # default return top k nearest neighbors during search
38
+ CUSTOM_CONFIG_DIR = pathlib.Path(__file__).parent.joinpath("custom/custom_case.json")
38
39
 
39
40
  CAPACITY_TIMEOUT_IN_SECONDS = 24 * 3600 # 24h
40
41
  LOAD_TIMEOUT_DEFAULT = 2.5 * 3600 # 2.5h
@@ -14,7 +14,7 @@ class Assembler:
14
14
  def assemble(cls, run_id , task: TaskConfig, source: DatasetSource) -> CaseRunner:
15
15
  c_cls = task.case_config.case_id.case_cls
16
16
 
17
- c = c_cls()
17
+ c = c_cls(task.case_config.custom_case)
18
18
  if type(task.db_case_config) != EmptyDBCaseConfig:
19
19
  task.db_case_config.metric_type = c.dataset.data.metric_type
20
20
 
@@ -4,9 +4,13 @@ from enum import Enum, auto
4
4
  from typing import Type
5
5
 
6
6
  from vectordb_bench import config
7
+ from vectordb_bench.backend.clients.api import MetricType
7
8
  from vectordb_bench.base import BaseModel
9
+ from vectordb_bench.frontend.components.custom.getCustomConfig import (
10
+ CustomDatasetConfig,
11
+ )
8
12
 
9
- from .dataset import Dataset, DatasetManager
13
+ from .dataset import CustomDataset, Dataset, DatasetManager
10
14
 
11
15
 
12
16
  log = logging.getLogger(__name__)
@@ -44,25 +48,24 @@ class CaseType(Enum):
44
48
  Performance1536D50K = 50
45
49
 
46
50
  Custom = 100
51
+ PerformanceCustomDataset = 101
47
52
 
48
- @property
49
53
  def case_cls(self, custom_configs: dict | None = None) -> Type["Case"]:
50
- if self not in type2case:
51
- raise NotImplementedError(f"Case {self} has not implemented. You can add it manually to vectordb_bench.backend.cases.type2case or define a custom_configs['custom_cls']")
52
- return type2case[self]
54
+ if custom_configs is None:
55
+ return type2case.get(self)()
56
+ else:
57
+ return type2case.get(self)(**custom_configs)
53
58
 
54
- @property
55
- def case_name(self) -> str:
56
- c = self.case_cls
59
+ def case_name(self, custom_configs: dict | None = None) -> str:
60
+ c = self.case_cls(custom_configs)
57
61
  if c is not None:
58
- return c().name
62
+ return c.name
59
63
  raise ValueError("Case unsupported")
60
64
 
61
- @property
62
- def case_description(self) -> str:
63
- c = self.case_cls
65
+ def case_description(self, custom_configs: dict | None = None) -> str:
66
+ c = self.case_cls(custom_configs)
64
67
  if c is not None:
65
- return c().description
68
+ return c.description
66
69
  raise ValueError("Case unsupported")
67
70
 
68
71
 
@@ -289,26 +292,69 @@ Results will show index building time, recall, and maximum QPS."""
289
292
  optimize_timeout: float | int | None = 15 * 60
290
293
 
291
294
 
295
+ def metric_type_map(s: str) -> MetricType:
296
+ if s.lower() == "cosine":
297
+ return MetricType.COSINE
298
+ if s.lower() == "l2" or s.lower() == "euclidean":
299
+ return MetricType.L2
300
+ if s.lower() == "ip":
301
+ return MetricType.IP
302
+ err_msg = f"Not support metric_type: {s}"
303
+ log.error(err_msg)
304
+ raise RuntimeError(err_msg)
305
+
306
+
307
+ class PerformanceCustomDataset(PerformanceCase):
308
+ case_id: CaseType = CaseType.PerformanceCustomDataset
309
+ name: str = "Performance With Custom Dataset"
310
+ description: str = ""
311
+ dataset: DatasetManager
312
+
313
+ def __init__(
314
+ self,
315
+ name,
316
+ description,
317
+ load_timeout,
318
+ optimize_timeout,
319
+ dataset_config,
320
+ **kwargs,
321
+ ):
322
+ dataset_config = CustomDatasetConfig(**dataset_config)
323
+ dataset = CustomDataset(
324
+ name=dataset_config.name,
325
+ size=dataset_config.size,
326
+ dim=dataset_config.dim,
327
+ metric_type=metric_type_map(dataset_config.metric_type),
328
+ use_shuffled=dataset_config.use_shuffled,
329
+ with_gt=dataset_config.with_gt,
330
+ dir=dataset_config.dir,
331
+ file_num=dataset_config.file_count,
332
+ )
333
+ super().__init__(
334
+ name=name,
335
+ description=description,
336
+ load_timeout=load_timeout,
337
+ optimize_timeout=optimize_timeout,
338
+ dataset=DatasetManager(data=dataset),
339
+ )
340
+
341
+
292
342
  type2case = {
293
343
  CaseType.CapacityDim960: CapacityDim960,
294
344
  CaseType.CapacityDim128: CapacityDim128,
295
-
296
345
  CaseType.Performance768D100M: Performance768D100M,
297
346
  CaseType.Performance768D10M: Performance768D10M,
298
347
  CaseType.Performance768D1M: Performance768D1M,
299
-
300
348
  CaseType.Performance768D10M1P: Performance768D10M1P,
301
349
  CaseType.Performance768D1M1P: Performance768D1M1P,
302
350
  CaseType.Performance768D10M99P: Performance768D10M99P,
303
351
  CaseType.Performance768D1M99P: Performance768D1M99P,
304
-
305
352
  CaseType.Performance1536D500K: Performance1536D500K,
306
353
  CaseType.Performance1536D5M: Performance1536D5M,
307
-
308
354
  CaseType.Performance1536D500K1P: Performance1536D500K1P,
309
355
  CaseType.Performance1536D5M1P: Performance1536D5M1P,
310
-
311
356
  CaseType.Performance1536D500K99P: Performance1536D500K99P,
312
357
  CaseType.Performance1536D5M99P: Performance1536D5M99P,
313
358
  CaseType.Performance1536D50K: Performance1536D50K,
359
+ CaseType.PerformanceCustomDataset: PerformanceCustomDataset,
314
360
  }
@@ -30,8 +30,11 @@ class DB(Enum):
30
30
  WeaviateCloud = "WeaviateCloud"
31
31
  PgVector = "PgVector"
32
32
  PgVectoRS = "PgVectoRS"
33
+ PgVectorScale = "PgVectorScale"
33
34
  Redis = "Redis"
35
+ MemoryDB = "MemoryDB"
34
36
  Chroma = "Chroma"
37
+ AWSOpenSearch = "OpenSearch"
35
38
  Test = "test"
36
39
 
37
40
 
@@ -69,15 +72,27 @@ class DB(Enum):
69
72
  if self == DB.PgVectoRS:
70
73
  from .pgvecto_rs.pgvecto_rs import PgVectoRS
71
74
  return PgVectoRS
75
+
76
+ if self == DB.PgVectorScale:
77
+ from .pgvectorscale.pgvectorscale import PgVectorScale
78
+ return PgVectorScale
72
79
 
73
80
  if self == DB.Redis:
74
81
  from .redis.redis import Redis
75
82
  return Redis
83
+
84
+ if self == DB.MemoryDB:
85
+ from .memorydb.memorydb import MemoryDB
86
+ return MemoryDB
76
87
 
77
88
  if self == DB.Chroma:
78
89
  from .chroma.chroma import ChromaClient
79
90
  return ChromaClient
80
91
 
92
+ if self == DB.AWSOpenSearch:
93
+ from .aws_opensearch.aws_opensearch import AWSOpenSearch
94
+ return AWSOpenSearch
95
+
81
96
  @property
82
97
  def config_cls(self) -> Type[DBConfig]:
83
98
  """Import while in use"""
@@ -113,14 +128,26 @@ class DB(Enum):
113
128
  from .pgvecto_rs.config import PgVectoRSConfig
114
129
  return PgVectoRSConfig
115
130
 
131
+ if self == DB.PgVectorScale:
132
+ from .pgvectorscale.config import PgVectorScaleConfig
133
+ return PgVectorScaleConfig
134
+
116
135
  if self == DB.Redis:
117
136
  from .redis.config import RedisConfig
118
137
  return RedisConfig
138
+
139
+ if self == DB.MemoryDB:
140
+ from .memorydb.config import MemoryDBConfig
141
+ return MemoryDBConfig
119
142
 
120
143
  if self == DB.Chroma:
121
144
  from .chroma.config import ChromaConfig
122
145
  return ChromaConfig
123
146
 
147
+ if self == DB.AWSOpenSearch:
148
+ from .aws_opensearch.config import AWSOpenSearchConfig
149
+ return AWSOpenSearchConfig
150
+
124
151
  def case_config_cls(self, index_type: IndexType | None = None) -> Type[DBCaseConfig]:
125
152
  if self == DB.Milvus:
126
153
  from .milvus.config import _milvus_case_config
@@ -150,6 +177,14 @@ class DB(Enum):
150
177
  from .pgvecto_rs.config import _pgvecto_rs_case_config
151
178
  return _pgvecto_rs_case_config.get(index_type)
152
179
 
180
+ if self == DB.AWSOpenSearch:
181
+ from .aws_opensearch.config import AWSOpenSearchIndexConfig
182
+ return AWSOpenSearchIndexConfig
183
+
184
+ if self == DB.PgVectorScale:
185
+ from .pgvectorscale.config import _pgvectorscale_case_config
186
+ return _pgvectorscale_case_config.get(index_type)
187
+
153
188
  # DB.Pinecone, DB.Chroma, DB.Redis
154
189
  return EmptyDBCaseConfig
155
190
 
@@ -15,6 +15,7 @@ class MetricType(str, Enum):
15
15
  class IndexType(str, Enum):
16
16
  HNSW = "HNSW"
17
17
  DISKANN = "DISKANN"
18
+ STREAMING_DISKANN = "DISKANN"
18
19
  IVFFlat = "IVF_FLAT"
19
20
  IVFSQ8 = "IVF_SQ8"
20
21
  Flat = "FLAT"
@@ -38,6 +39,22 @@ class DBConfig(ABC, BaseModel):
38
39
  """
39
40
 
40
41
  db_label: str = ""
42
+ version: str = ""
43
+ note: str = ""
44
+
45
+ @staticmethod
46
+ def common_short_configs() -> list[str]:
47
+ """
48
+ short input, such as `db_label`, `version`
49
+ """
50
+ return ["version", "db_label"]
51
+
52
+ @staticmethod
53
+ def common_long_configs() -> list[str]:
54
+ """
55
+ long input, such as `note`
56
+ """
57
+ return ["note"]
41
58
 
42
59
  @abstractmethod
43
60
  def to_dict(self) -> dict:
@@ -45,7 +62,10 @@ class DBConfig(ABC, BaseModel):
45
62
 
46
63
  @validator("*")
47
64
  def not_empty_field(cls, v, field):
48
- if field.name == "db_label":
65
+ if (
66
+ field.name in cls.common_short_configs()
67
+ or field.name in cls.common_long_configs()
68
+ ):
49
69
  return v
50
70
  if not v and isinstance(v, (str, SecretStr)):
51
71
  raise ValueError("Empty string!")
@@ -0,0 +1,159 @@
1
+ import logging
2
+ from contextlib import contextmanager
3
+ import time
4
+ from typing import Iterable, Type
5
+ from ..api import VectorDB, DBCaseConfig, DBConfig, IndexType
6
+ from .config import AWSOpenSearchConfig, AWSOpenSearchIndexConfig
7
+ from opensearchpy import OpenSearch
8
+ from opensearchpy.helpers import bulk
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
+
13
class AWSOpenSearch(VectorDB):
    """Benchmark client for an OpenSearch k-NN index, built on `opensearchpy`.

    The constructor only prepares the index (optionally dropping and
    recreating it). The connection used by inserts and searches is opened by
    the `init()` context manager and released when that context exits.
    """

    # Bulk inserts are retried after a failure, but only a bounded number of
    # times, so a persistent outage is reported to the caller instead of
    # retrying (and growing the call stack) without limit.
    INSERT_MAX_ATTEMPTS = 30
    INSERT_RETRY_INTERVAL_SEC = 10

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: AWSOpenSearchIndexConfig,
        index_name: str = "vdb_bench_index",  # must be lowercase
        id_col_name: str = "id",
        vector_col_name: str = "embedding",
        drop_old: bool = False,
        **kwargs,
    ):
        """Store the connection/index settings and optionally rebuild the index.

        Args:
            dim: dimension of the vectors stored in the index.
            db_config: keyword arguments passed verbatim to `OpenSearch(...)`.
            db_case_config: index configuration; its `index_param()` supplies
                the k-NN `method` section of the mapping.
            index_name: name of the index to use (must be lowercase).
            id_col_name: name of the integer id field in the mapping.
            vector_col_name: name of the `knn_vector` field.
            drop_old: when true, delete the index if it exists and create a
                fresh one.
            **kwargs: accepted for call compatibility; not used here.
        """
        self.dim = dim
        self.db_config = db_config
        self.case_config = db_case_config
        self.index_name = index_name
        self.id_col_name = id_col_name
        self.category_col_names = [
            f"scalar-{categoryCount}" for categoryCount in [2, 5, 10, 100, 1000]
        ]
        self.vector_col_name = vector_col_name

        # NOTE(review): db_config is logged verbatim; confirm it cannot carry
        # credentials before keeping this at INFO level.
        log.info(f"AWS_OpenSearch client config: {self.db_config}")
        client = OpenSearch(**self.db_config)
        if drop_old:
            log.info(f"AWS_OpenSearch client drop old index: {self.index_name}")
            is_existed = client.indices.exists(index=self.index_name)
            if is_existed:
                client.indices.delete(index=self.index_name)
            self._create_index(client)

    @classmethod
    def config_cls(cls) -> Type[AWSOpenSearchConfig]:
        """Return the connection config class (the class itself, not an instance)."""
        return AWSOpenSearchConfig

    @classmethod
    def case_config_cls(
        cls, index_type: IndexType | None = None
    ) -> Type[AWSOpenSearchIndexConfig]:
        """Return the index config class; `index_type` does not affect the result."""
        return AWSOpenSearchIndexConfig

    def _create_index(self, client: OpenSearch):
        """Create the k-NN index with id, category and vector fields.

        Raises:
            Exception: whatever `client.indices.create` raises, after logging it.
        """
        settings = {
            "index": {
                "knn": True,
                # Optional tuning knobs left disabled:
                # "number_of_shards": 5,
                # "refresh_interval": "600s",
            }
        }
        mappings = {
            "properties": {
                self.id_col_name: {"type": "integer"},
                **{
                    categoryCol: {"type": "keyword"}
                    for categoryCol in self.category_col_names
                },
                self.vector_col_name: {
                    "type": "knn_vector",
                    "dimension": self.dim,
                    "method": self.case_config.index_param(),
                },
            }
        }
        try:
            client.indices.create(
                index=self.index_name, body=dict(settings=settings, mappings=mappings)
            )
        except Exception as e:
            log.warning(f"Failed to create index: {self.index_name} error: {str(e)}")
            raise e from None

    @contextmanager
    def init(self) -> None:
        """Open an OpenSearch client for the duration of the `with` block.

        The client is stored on `self.client` while the context is active and
        the attribute is removed again on exit, including when the body raises.
        """
        self.client = OpenSearch(**self.db_config)
        try:
            yield
        finally:
            # The transport is not closed explicitly; dropping the attribute
            # is the only cleanup performed.
            del self.client

    def insert_embeddings(
        self,
        embeddings: Iterable[list[float]],
        metadata: list[int],
        **kwargs,
    ) -> tuple[int, Exception | None]:
        """Bulk-index the embeddings, using `metadata[i]` as the document id.

        A failed attempt is logged and retried after a pause, up to
        `INSERT_MAX_ATTEMPTS` attempts in total.

        Args:
            embeddings: vectors to index.
            metadata: document ids, positionally aligned with `embeddings`.
            **kwargs: accepted for call compatibility; not used here.

        Returns:
            `(len(embeddings), None)` on success, or `(0, error)` carrying the
            last exception once every attempt has failed.
        """
        assert self.client is not None, "should self.init() first"

        # Bulk payload alternates an action line and the document source.
        # It is built once and reused across retries.
        insert_data = []
        for i, embedding in enumerate(embeddings):
            insert_data.append({"index": {"_index": self.index_name, "_id": metadata[i]}})
            insert_data.append({self.vector_col_name: embedding})

        last_error = None
        for attempt in range(1, self.INSERT_MAX_ATTEMPTS + 1):
            try:
                resp = self.client.bulk(insert_data)
                log.info(f"AWS_OpenSearch adding documents: {len(resp['items'])}")
                resp = self.client.indices.stats(self.index_name)
                log.info(f"Total document count in index: {resp['_all']['primaries']['indexing']['index_total']}")
                return (len(embeddings), None)
            except Exception as e:
                last_error = e
                log.warning(f"Failed to insert data: {self.index_name} error: {str(e)}")
                # No point sleeping after the final attempt.
                if attempt < self.INSERT_MAX_ATTEMPTS:
                    time.sleep(self.INSERT_RETRY_INTERVAL_SEC)
        return (0, last_error)

    def search_embedding(
        self,
        query: list[float],
        k: int = 100,
        filters: dict | None = None,
    ) -> list[int]:
        """Get the ids of the k most similar embeddings to the query vector.

        Args:
            query(list[float]): query embedding to look up documents similar to.
            k(int): Number of most similar embeddings to return. Defaults to 100.
            filters(dict, optional): accepted for interface compatibility; this
                implementation does not apply it to the query.

        Returns:
            list[int]: document ids of the hits, in the order returned by the
            search response.

        Raises:
            Exception: whatever the search call raises, after logging it.
        """
        assert self.client is not None, "should self.init() first"

        body = {
            "size": k,
            "query": {"knn": {self.vector_col_name: {"vector": query, "k": k}}},
        }
        try:
            resp = self.client.search(index=self.index_name, body=body)
            # Per-query diagnostics stay at DEBUG with lazy formatting so the
            # search hot path is not slowed down by logging.
            log.debug("Search took: %s", resp["took"])
            log.debug("Search shards: %s", resp["_shards"])
            log.debug("Search hits total: %s", resp["hits"]["total"])
            return [int(d["_id"]) for d in resp["hits"]["hits"]]
        except Exception as e:
            log.warning(f"Failed to search: {self.index_name} error: {str(e)}")
            raise e from None

    def optimize(self):
        """optimize will be called between insertion and search in performance cases.

        Nothing is done for this client.
        """

    def ready_to_load(self):
        """ready_to_load will be called before load in load cases.

        Nothing is done for this client.
        """