isage-middleware 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of isage-middleware might be problematic. Click here for more details.
- isage_middleware-0.1.3.dist-info/METADATA +115 -0
- isage_middleware-0.1.3.dist-info/RECORD +291 -0
- sage/__init__.py +56 -2
- sage/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/__init__.py +52 -79
- sage/middleware/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/__pycache__/_version.cpython-311.opt-2.pyc +0 -0
- sage/middleware/__pycache__/_version.cpython-311.pyc +0 -0
- sage/middleware/_version.py +35 -0
- sage/middleware/api/__init__.py +52 -18
- sage/middleware/api/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-311.pyc +0 -0
- sage/middleware/components/enterprise/__init__.py +56 -0
- sage/middleware/components/enterprise/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/enterprise/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/__init__.py +56 -0
- sage/middleware/components/neuromem/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/__pycache__/memory_manager.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/__pycache__/memory_manager.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/__pycache__/memory_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/__pycache__/memory_service.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__init__.py +56 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/base_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/base_collection.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/graph_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/graph_collection.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/kv_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/kv_collection.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/vdb_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/__pycache__/vdb_collection.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/memory_collection/base_collection.py +167 -0
- sage/middleware/components/neuromem/memory_collection/graph_collection.py +11 -0
- sage/middleware/components/neuromem/memory_collection/kv_collection.py +709 -0
- sage/middleware/components/neuromem/memory_collection/vdb_collection.py +922 -0
- sage/middleware/components/neuromem/memory_manager.py +401 -0
- sage/middleware/components/neuromem/memory_service.py +324 -0
- sage/middleware/components/neuromem/micro_service/__init__.py +56 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/neuromem_vdb.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/neuromem_vdb.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/neuromem_vdb_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/__pycache__/neuromem_vdb_service.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/micro_service/neuromem_vdb.py +198 -0
- sage/middleware/components/neuromem/micro_service/neuromem_vdb_service.py +118 -0
- sage/middleware/components/neuromem/search_engine/__init__.py +56 -0
- sage/middleware/components/neuromem/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/graph_index/__init__.py +56 -0
- sage/middleware/components/neuromem/search_engine/graph_index/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/graph_index/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/graph_index/__pycache__/base_graph_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/graph_index/__pycache__/base_graph_index.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/graph_index/base_graph_index.py +40 -0
- sage/middleware/components/neuromem/search_engine/hybird_index/__init__.py +56 -0
- sage/middleware/components/neuromem/search_engine/hybird_index/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/hybird_index/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__init__.py +56 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/base_kv_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/base_kv_index.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/bm25s_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/__pycache__/bm25s_index.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/kv_index/base_kv_index.py +76 -0
- sage/middleware/components/neuromem/search_engine/kv_index/bm25s_index.py +320 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__init__.py +56 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/base_vdb_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/base_vdb_index.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/faiss_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/__pycache__/faiss_index.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/base_vdb_index.py +53 -0
- sage/middleware/components/neuromem/search_engine/vdb_index/faiss_index.py +700 -0
- sage/middleware/components/neuromem/storage_engine/__init__.py +56 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/metadata_storage.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/metadata_storage.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/text_storage.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/text_storage.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/vector_storage.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/__pycache__/vector_storage.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__init__.py +56 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/base_kv_backend.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/base_kv_backend.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/dict_kv_backend.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/__pycache__/dict_kv_backend.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/base_kv_backend.py +65 -0
- sage/middleware/components/neuromem/storage_engine/kv_backend/dict_kv_backend.py +54 -0
- sage/middleware/components/neuromem/storage_engine/metadata_storage.py +260 -0
- sage/middleware/components/neuromem/storage_engine/text_storage.py +106 -0
- sage/middleware/components/neuromem/storage_engine/vector_storage.py +85 -0
- sage/middleware/components/neuromem/tests/__init__.py +56 -0
- sage/middleware/components/neuromem/tests/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/__pycache__/test_memory_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/__pycache__/test_memory_service.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/__init__.py +56 -0
- sage/middleware/components/neuromem/tests/core_test/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__init__.py +56 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/kv_collection_test.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/kv_collection_test.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/vdb_collection_test.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/__pycache__/vdb_collection_test.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/kv_collection_test.py +60 -0
- sage/middleware/components/neuromem/tests/core_test/collection_test/vdb_collection_test.py +88 -0
- sage/middleware/components/neuromem/tests/core_test/manager_test.py +154 -0
- sage/middleware/components/neuromem/tests/test_memory_service.py +293 -0
- sage/middleware/components/neuromem/utils/__init__.py +56 -0
- sage/middleware/components/neuromem/utils/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/utils/__pycache__/path_utils.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/neuromem/utils/__pycache__/path_utils.cpython-311.pyc +0 -0
- sage/middleware/components/neuromem/utils/path_utils.py +25 -0
- sage/middleware/components/sage_db/__init__.py +56 -0
- sage/middleware/components/sage_db/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/sage_db/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/sage_db/__pycache__/sage_db.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/sage_db/__pycache__/sage_db.cpython-311.pyc +0 -0
- sage/middleware/components/sage_db/python/__init__.py +56 -0
- sage/middleware/components/sage_db/python/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/sage_db/python/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/components/sage_db/python/__pycache__/sage_db.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/sage_db/python/__pycache__/sage_db.cpython-311.pyc +0 -0
- sage/middleware/components/sage_db/tests/__pycache__/test_python.cpython-311.opt-2.pyc +0 -0
- sage/middleware/components/sage_db/tests/__pycache__/test_python.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/api_usage_tutorial.py +3 -3
- sage/middleware/examples/dag_microservices_demo.py +7 -8
- sage/middleware/examples/microservices_integration_demo.py +8 -11
- sage/middleware/examples/microservices_registration_demo.py +8 -12
- sage/middleware/services/__init__.py +56 -0
- sage/middleware/services/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/graph/__init__.py +52 -4
- sage/middleware/services/graph/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-311.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-311.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-311.pyc +0 -0
- sage/middleware/services/graph/examples/graph_demo.py +3 -2
- sage/middleware/services/graph/graph_service.py +68 -0
- sage/middleware/services/graph/search_engine/__init__.py +56 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-311.pyc +0 -0
- sage/middleware/services/kv/__init__.py +52 -4
- sage/middleware/services/kv/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-311.pyc +0 -0
- sage/middleware/services/kv/examples/__pycache__/{kv_demo.cpython-313.opt-2.pyc → kv_demo.cpython-311.opt-2.pyc} +0 -0
- sage/middleware/services/kv/examples/__pycache__/{kv_demo.cpython-313.pyc → kv_demo.cpython-311.pyc} +0 -0
- sage/middleware/services/kv/examples/kv_demo.py +1 -1
- sage/middleware/services/kv/search_engine/__init__.py +56 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-311.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-311.pyc +0 -0
- sage/middleware/services/memory/__init__.py +52 -8
- sage/middleware/services/memory/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-311.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/{memory_demo.cpython-313.opt-2.pyc → memory_demo.cpython-311.opt-2.pyc} +0 -0
- sage/middleware/services/memory/examples/__pycache__/{memory_demo.cpython-313.pyc → memory_demo.cpython-311.pyc} +0 -0
- sage/middleware/services/memory/examples/dag_microservices_demo.py +8 -9
- sage/middleware/services/memory/examples/memory_demo.py +4 -4
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-311.pyc +0 -0
- sage/middleware/services/memory/memory_service.py +14 -11
- sage/middleware/services/memory/utils/__init__.py +56 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/__init__.py +52 -4
- sage/middleware/services/vdb/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/examples/__pycache__/{vdb_demo.cpython-313.opt-2.pyc → vdb_demo.cpython-311.opt-2.pyc} +0 -0
- sage/middleware/services/vdb/examples/__pycache__/{vdb_demo.cpython-313.pyc → vdb_demo.cpython-311.pyc} +0 -0
- sage/middleware/services/vdb/examples/vdb_demo.py +2 -2
- sage/middleware/services/vdb/search_engine/__init__.py +56 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/vdb_service.py +44 -41
- sage/middleware/utils/__init__.py +53 -2
- sage/middleware/utils/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__init__.py +52 -31
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-311.pyc +0 -0
- isage_middleware-0.1.1.dist-info/METADATA +0 -424
- isage_middleware-0.1.1.dist-info/RECORD +0 -182
- sage/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/api/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-313.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-313.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-313.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-313.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-313.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-313.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-313.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-313.pyc +0 -0
- sage/middleware/enterprise/__init__.py +0 -75
- sage/middleware/enterprise/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/enterprise/sage_db/__init__.py +0 -132
- sage/middleware/enterprise/sage_db/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/sage_db.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/sage_db.cpython-313.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__init__.py +0 -7
- sage/middleware/enterprise/sage_db/python/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/sage_db.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/sage_db.cpython-313.pyc +0 -0
- sage/middleware/enterprise/sage_db/tests/__pycache__/test_python.cpython-313.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/tests/__pycache__/test_python.cpython-313.pyc +0 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-313.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-313.pyc +0 -0
- sage/middleware/examples/__pycache__/dag_microservices_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/dag_microservices_demo.cpython-313.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-313.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_integration_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_integration_demo.cpython-313.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-313.pyc +0 -0
- sage/middleware/services/graph/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-313.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-313.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-313.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-313.pyc +0 -0
- sage/middleware/services/kv/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-313.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-313.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-313.pyc +0 -0
- sage/middleware/services/memory/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-313.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/dag_microservices_demo.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/dag_microservices_demo.cpython-313.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-313.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-313.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-313.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-313.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-313.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-313.pyc +0 -0
- sage/middleware/utils/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-313.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-313.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-313.pyc +0 -0
- {isage_middleware-0.1.1.dist-info → isage_middleware-0.1.3.dist-info}/WHEEL +0 -0
- {isage_middleware-0.1.1.dist-info → isage_middleware-0.1.3.dist-info}/top_level.txt +0 -0
- /sage/middleware/{enterprise → components}/sage_db/python/sage_db.py +0 -0
- /sage/middleware/{enterprise → components}/sage_db/sage_db.py +0 -0
- /sage/middleware/{enterprise → components}/sage_db/tests/test_python.py +0 -0
|
@@ -0,0 +1,922 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import yaml
|
|
4
|
+
import shutil
|
|
5
|
+
import inspect
|
|
6
|
+
import numpy as np
|
|
7
|
+
from typing import Optional, Dict, Any, List, Callable
|
|
8
|
+
from sage.common.utils.logging.custom_logger import CustomLogger
|
|
9
|
+
from sage.middleware.components.neuromem.memory_collection.base_collection import BaseMemoryCollection
|
|
10
|
+
from sage.middleware.components.neuromem.search_engine.vdb_index import index_factory
|
|
11
|
+
from sage.middleware.components.neuromem.utils.path_utils import get_default_data_dir
|
|
12
|
+
from sage.middleware.utils.embedding.embedding_api import apply_embedding_model
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class VDBMemoryCollection(BaseMemoryCollection):
|
|
16
|
+
"""
|
|
17
|
+
Memory collection with vector database support.
|
|
18
|
+
支持向量数据库功能的内存集合类。
|
|
19
|
+
|
|
20
|
+
支持两种初始化方式:
|
|
21
|
+
1. 通过声明VDBMemoryCollection(config, corpus)创建
|
|
22
|
+
2. 通过VDBMemoryCollection.load(name, vdb_path)恢复式创建
|
|
23
|
+
"""
|
|
24
|
+
def __init__(
    self,
    config: Dict[str, Any]
):
    """
    Initialize a VDBMemoryCollection from a configuration dictionary.

    Args:
        config: Collection settings. ``name`` is required. Optional keys read
            here are ``default_embedding_model``, ``default_dim``,
            ``default_topk``, ``default_vdb_backend`` and ``global_index``
            (a dict of settings for the index created at construction time).
            The dictionary is not modified.

    Raises:
        ValueError: If ``config`` has no ``name`` key.
    """
    # Create the logger first so the validation failure below can be logged.
    self.logger = CustomLogger()

    if "name" not in config:
        self.logger.error("config中必须包含'name'字段")
        raise ValueError("config中必须包含'name'字段")

    self.name = config["name"]
    super().__init__(self.name)

    # Collection-wide defaults, used whenever an index config omits a value.
    self.default_embedding_model_name = config.get("default_embedding_model", "default")
    self.default_embedding_model = apply_embedding_model(self.default_embedding_model_name)
    self.default_dim = config.get("default_dim", self.default_embedding_model.get_dim())
    self.default_topk = config.get("default_topk", 5)
    self.default_backend_type = config.get("default_vdb_backend", "FAISS")

    # index_name -> {embedding_model_name, dim, index, backend_type, topk,
    #                description, metadata_filter_func, metadata_conditions}
    self.index_info = {}
    # index_name -> embedding model instance, kept ready for encoding
    self.index_embedding_model = {}

    # Work on a copy: the defaults filled in below must not leak back into
    # the caller's config. ``or {}`` also tolerates an explicit None.
    global_index_config = dict(config.get("global_index") or {})
    global_index_config.setdefault("name", "global_index")
    global_index_config.setdefault("embedding_model", self.default_embedding_model_name)
    global_index_config.setdefault("dim", self.default_dim)
    global_index_config.setdefault("backend_type", self.default_backend_type)
    global_index_config.setdefault("topk", self.default_topk)

    self.create_index(config=global_index_config)
|
|
68
|
+
|
|
69
|
+
def batch_insert_data(self, data: List[str], metadatas: Optional[List[Dict[str, Any]]] = None):
    """
    Store a batch of items in the collection's storages without indexing them.

    Args:
        data: Items to store; each one is keyed by its stable id.
        metadatas: Optional list parallel to ``data``. Falsy entries are
            skipped; unknown metadata fields are registered before storing.

    Raises:
        ValueError: If ``metadatas`` is given and its length differs from ``data``.
    """
    item_count = len(data)
    self.logger.info(f"Batch inserting {len(data)} data items to storage")

    if metadatas is not None and len(metadatas) != item_count:
        raise ValueError("metadatas length must match data length")

    for position, entry in enumerate(data):
        entry_id = self._get_stable_id(entry)
        self.text_storage.store(entry_id, entry)

        entry_meta = metadatas[position] if metadatas else None
        if not entry_meta:
            continue

        # Register every metadata field the storage has not seen yet.
        meta_store = self.metadata_storage
        for field_name in entry_meta:
            if meta_store.has_field(field_name):
                continue
            meta_store.add_field(field_name)
        meta_store.store(entry_id, entry_meta)
|
|
93
|
+
|
|
94
|
+
def _serialize_func(self, func):
    """
    Turn a metadata filter callable into text that can be stored in JSON.

    For lambdas only the bare ``lambda ...`` expression is returned. The raw
    result of ``inspect.getsource`` is the whole statement containing the
    lambda (for example ``name = lambda m: ...`` or ``kw=lambda m: ...,``),
    which the deserializer cannot restore because it does not start with
    ``lambda``.

    Args:
        func: The callable to serialize, or None.

    Returns:
        None when ``func`` is None; the lambda expression text for lambdas;
        the stripped source for other functions; ``str(func)`` when no source
        is available.
    """
    if func is None:
        return None

    try:
        source = inspect.getsource(func).strip()
    except (OSError, TypeError):
        # OSError: source cannot be retrieved; TypeError: built-in or
        # otherwise unsupported object.
        return str(func)

    if getattr(func, "__name__", None) != "<lambda>":
        return source

    # Local imports keep this block self-contained.
    import ast
    import re

    # Try each standalone "lambda" keyword and trim trailing characters until
    # the text parses as exactly one lambda expression.
    for match in re.finditer(r"\blambda\b", source):
        candidate = source[match.start():]
        for end in range(len(candidate), len("lambda"), -1):
            snippet = candidate[:end].rstrip()
            try:
                parsed = ast.parse(snippet, mode="eval")
            except (SyntaxError, ValueError):
                continue
            if isinstance(parsed.body, ast.Lambda):
                return snippet

    # Could not isolate the expression; fall back to the raw source.
    return source
|
|
104
|
+
|
|
105
|
+
def _deserialize_func(self, func_str):
    """
    Rebuild a metadata filter callable from its serialized text.

    Only text that parses as one single ``lambda`` expression is evaluated.
    Anything else (None, "", "None", non-strings, ``def`` source, malformed
    text) yields a permissive filter that accepts every metadata dict.

    Args:
        func_str: Serialized filter text as produced when the collection was
            stored, or None.

    Returns:
        A callable taking one metadata argument.
    """
    def accept_all(m):
        return True

    if not isinstance(func_str, str):
        return accept_all

    text = func_str.strip()
    if text in ("", "None") or not text.startswith("lambda"):
        return accept_all

    import ast  # local import keeps this block self-contained

    try:
        parsed = ast.parse(text, mode="eval")
    except (SyntaxError, ValueError):
        return accept_all

    # Text such as "lambda m: True, something()" is a tuple: evaluating it
    # would run the extra expressions and return a non-callable.
    if not isinstance(parsed.body, ast.Lambda):
        return accept_all

    # SECURITY NOTE: eval() executes text read from the stored collection
    # config. The AST check above limits it to creating a lambda, but default
    # argument expressions still run at creation time and the body runs when
    # the filter is called. Only load collections from trusted locations.
    try:
        return eval(text)
    except Exception:
        # Best effort: a filter that cannot be rebuilt must not break loading.
        return accept_all
|
|
121
|
+
|
|
122
|
+
def store(self, store_path: Optional[str] = None):
    """
    Persist the collection to disk.

    Everything is written below ``<base>/vdb_collection/<collection name>``:
    ``text_storage.json``, ``metadata_storage.json``, one sub-directory per
    index under ``indexes/`` and a ``config.json`` describing the collection
    defaults and every index.

    Args:
        store_path: Base directory. When None, ``get_default_data_dir()`` is used.

    Returns:
        A dict ``{"collection_path": <collection directory>}``.
    """
    self.logger.debug(f"VDBMemoryCollection: store called")

    # A caller-supplied path wins over the default data directory.
    base_dir = get_default_data_dir() if store_path is None else store_path
    collection_dir = os.path.join(base_dir, "vdb_collection", self.name)
    os.makedirs(collection_dir, exist_ok=True)

    # 1. Raw texts and metadata.
    text_file = os.path.join(collection_dir, "text_storage.json")
    self.text_storage.store_to_disk(text_file)
    metadata_file = os.path.join(collection_dir, "metadata_storage.json")
    self.metadata_storage.store_to_disk(metadata_file)

    # 2. Each index goes into its own directory, and its JSON-safe
    #    description is collected for config.json.
    indexes_dir = os.path.join(collection_dir, "indexes")
    os.makedirs(indexes_dir, exist_ok=True)

    saved_index_info = {}
    for index_name, info in self.index_info.items():
        backend_index = info["index"]
        index_dir = os.path.join(indexes_dir, index_name)
        os.makedirs(index_dir, exist_ok=True)
        backend_index.store(index_dir)

        record = {}
        record["embedding_model_name"] = info.get("embedding_model_name", "default")
        record["dim"] = info.get("dim", self.default_dim)
        record["backend_type"] = info.get("backend_type", "FAISS")
        record["topk"] = info.get("topk", self.default_topk)
        record["description"] = info.get("description", "")
        # The class name is what load() uses to pick the backend.
        record["index_type"] = backend_index.__class__.__name__
        record["metadata_filter_func"] = self._serialize_func(info.get("metadata_filter_func"))
        record["metadata_conditions"] = info.get("metadata_conditions", {})
        saved_index_info[index_name] = record

    # 3. Collection-level configuration.
    config = {
        "name": self.name,
        "default_embedding_model_name": self.default_embedding_model_name,
        "default_dim": self.default_dim,
        "default_topk": self.default_topk,
        "default_backend_type": self.default_backend_type,
        "indexes": saved_index_info,
    }
    config_file = os.path.join(collection_dir, "config.json")
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    return {"collection_path": collection_dir}
|
|
172
|
+
|
|
173
|
+
@classmethod
def load(cls, name: str, vdb_path: Optional[str] = None):
    """
    Load a VDBMemoryCollection from disk.

    Args:
        name: Collection name.
        vdb_path: Directory of this specific collection (the one containing
            ``config.json``). When None,
            ``<default data dir>/vdb_collection/<name>`` is used.

    Returns:
        The restored collection instance.

    Raises:
        FileNotFoundError: If ``config.json`` is missing from the load path.
        NotImplementedError: If an index listed in the config cannot be
            loaded; the underlying error is chained as ``__cause__``.
    """
    if vdb_path is None:
        load_path = os.path.join(get_default_data_dir(), "vdb_collection", name)
    else:
        load_path = vdb_path

    # load_path now points at the directory of the collection itself.
    config_path = os.path.join(load_path, "config.json")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"No config found for collection at {config_path}")

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Older key names (without the "default_" prefix) are still accepted.
    instance = cls(
        config={
            "name": name,
            "default_embedding_model": config.get("default_embedding_model_name", config.get("embedding_model_name", "mockembedder")),
            "default_dim": config.get("default_dim", config.get("dim", 128)),
            "default_topk": config.get("default_topk", 5),
            "default_vdb_backend": config.get("default_backend_type", config.get("backend_type", "FAISS"))
        }
    )

    instance.text_storage.load_from_disk(os.path.join(load_path, "text_storage.json"))
    instance.metadata_storage.load_from_disk(os.path.join(load_path, "metadata_storage.json"))

    # Drop the index that __init__ created, together with its embedding
    # model, so only what was actually persisted remains.
    instance.index_info.clear()
    instance.index_embedding_model.clear()

    indexes_dir = os.path.join(load_path, "indexes")
    for index_name, idx_info in config.get("indexes", {}).items():
        idx_type = idx_info["index_type"]
        idx_path = os.path.join(indexes_dir, index_name)

        try:
            # The backend name is the class name without "Index", upper-cased
            # ("FaissIndex" -> "FAISS").
            backend_type = idx_type.replace("Index", "").upper()
            idx = index_factory.load_index(index_name, backend_type, idx_path)
        except Exception as e:
            # Keep the exception type callers may already catch, but report
            # and chain the real cause instead of hiding it.
            raise NotImplementedError(
                f"Failed to load index '{index_name}' with index_type {idx_type}: {e}"
            ) from e

        embedding_model_name = idx_info.get("embedding_model_name", "default")

        instance.index_info[index_name] = {
            "embedding_model_name": embedding_model_name,
            "dim": idx_info.get("dim", config.get("default_dim", config.get("dim", 128))),
            "index": idx,
            "backend_type": idx_info.get("backend_type", "FAISS"),
            "topk": idx_info.get("topk", 5),
            "description": idx_info.get("description", ""),
            "metadata_filter_func": instance._deserialize_func(idx_info.get("metadata_filter_func")),
            "metadata_conditions": idx_info.get("metadata_conditions", {}),
        }

        # Recreate the embedding model that belongs to this index.
        instance.index_embedding_model[index_name] = apply_embedding_model(embedding_model_name)

    return instance
|
|
250
|
+
|
|
251
|
+
@staticmethod
def clear(name, clear_path=None):
    """
    Delete a stored collection's directory tree.

    The directory removed is ``<clear_path>/vdb_collection/<name>``. The
    outcome is printed; no exception is propagated.

    Args:
        name: Collection name.
        clear_path: Base directory. When None, ``get_default_data_dir()`` is used.
    """
    base_dir = get_default_data_dir() if clear_path is None else clear_path
    collection_dir = os.path.join(base_dir, "vdb_collection", name)

    try:
        shutil.rmtree(collection_dir)
    except FileNotFoundError:
        print(f"Collection does not exist: {collection_dir}")
    except Exception as e:
        # Best effort: report any other failure instead of raising.
        print(f"Failed to clear: {e}")
    else:
        print(f"Cleared collection: {collection_dir}")
|
|
263
|
+
|
|
264
|
+
def create_index(
    self,
    config: Optional[dict] = None,
    metadata_filter_func: Optional[Callable[[Dict[str, Any]], bool]] = None,
    **metadata_conditions
):
    """
    Create a new, empty vector index and register it on the collection.

    Args:
        config: Index settings; ``name`` is required. ``backend_type``,
            ``description``, ``embedding_model``, ``dim`` and ``topk`` fall
            back to the collection defaults. Every key other than
            ``backend_type``, ``description``, ``embedding_model`` and
            ``topk`` is forwarded to the index factory.
        metadata_filter_func: Optional predicate recorded with the index.
        **metadata_conditions: Key/value conditions recorded with the index.

    Returns:
        1 when the index was created; 0 when ``config`` is None, has no
        ``name``, or names an index that already exists.

    Raises:
        Exception: Whatever the index factory raises is logged and re-raised.
    """
    # Guard 1: a config with a name is mandatory.
    if config is None:
        self.logger.warning("config不能为空,无法创建索引")
        return 0
    if "name" not in config:
        self.logger.warning("config中必须包含'name'字段,无法创建索引")
        return 0

    index_name = config["name"]

    # Guard 2: existing indexes are never overwritten.
    if index_name in self.index_info:
        self.logger.warning(f"索引 '{index_name}' 已存在,无法重复创建")
        return 0

    # Collection-level settings, with collection defaults as fallback.
    backend_type = config.get("backend_type", self.default_backend_type)
    description = config.get("description", f"Index for {index_name}")
    embedding_model_name = config.get("embedding_model", self.default_embedding_model_name)
    dim = config.get("dim", self.default_dim)
    topk = config.get("topk", self.default_topk)

    embedding_model = apply_embedding_model(embedding_model_name)

    # Only index-level settings are handed to the factory; the keys listed
    # here are kept by the collection.
    collection_only = ("backend_type", "description", "embedding_model", "topk")
    index_config = {"name": index_name, "dim": dim}
    index_config.update(
        {key: value for key, value in config.items() if key not in collection_only}
    )

    try:
        index = index_factory.create_index_from_config(config=index_config)

        record = dict(
            embedding_model_name=embedding_model_name,
            dim=dim,
            index=index,
            backend_type=backend_type,
            topk=topk,
            description=description,
            metadata_filter_func=metadata_filter_func,
            metadata_conditions=metadata_conditions,
        )
        self.index_info[index_name] = record
        self.index_embedding_model[index_name] = embedding_model

        self.logger.info(f"成功创建索引 '{index_name}',后端类型: {backend_type}")
        return 1  # success
    except Exception as e:
        self.logger.error(f"Failed to create index {index_name} with backend {backend_type}: {e}")
        raise
|
|
338
|
+
|
|
339
|
+
# 直接删除某个索引
|
|
340
|
+
def delete_index(self, index_name: str):
    """
    Remove an index and its embedding model from the collection.

    Args:
        index_name: Name of the index to remove.

    Raises:
        ValueError: If no index with that name is registered.
    """
    if index_name not in self.index_info:
        raise ValueError(f"Index '{index_name}' does not exist.")

    self.index_info.pop(index_name)
    # The embedding model entry is optional, so a missing one is ignored.
    self.index_embedding_model.pop(index_name, None)
|
|
352
|
+
|
|
353
|
+
# 列举索引信息
|
|
354
|
+
def list_index(self, *index_names) -> List[Dict[str, str]]:
    """
    List indexes together with their descriptions.

    Args:
        *index_names: Names to look up. With no names, every index is listed.
            Unknown names are logged as a warning and left out.

    Returns:
        A list of ``{"name": ..., "description": ...}`` dicts, in the order of
        the requested names (or of registration when no names are given).
    """
    if not index_names:
        # No selection: describe every registered index.
        return [
            {"name": name, "description": info["description"]}
            for name, info in self.index_info.items()
        ]

    result = []
    for name in index_names:
        if name not in self.index_info:
            self.logger.warning(f"索引 '{name}' 不存在")
            continue
        result.append({"name": name, "description": self.index_info[name]["description"]})
    return result
|
|
379
|
+
|
|
380
|
+
# 单条文本插入(指定索引,否则全局)
|
|
381
|
+
def insert(
    self,
    raw_text: str,
    metadata: Optional[Dict[str, Any]] = None,
    *index_names
) -> str:
    """
    Store one text (with optional metadata) and add it to one or more indexes.

    The target indexes are validated before anything is written, so a failed
    insert leaves storage untouched.

    Args:
        raw_text: Text to store and embed.
        metadata: Optional metadata; unknown fields are registered first.
        *index_names: Target index names. With none given, ``global_index``
            is used and created on demand.

    Returns:
        The stable id of the text, or ``"0"`` if any requested index does
        not exist. An index whose dimension does not match the embedding is
        skipped with a warning, and the stable id is still returned.
    """
    self.logger.debug(f"VDBMemoryCollection: insert called")

    # Without explicit targets, fall back to the global index.
    if not index_names:
        if "global_index" not in self.index_info:
            self.logger.info(f"创建全局索引: global_index")
            self.create_index(config={"name": "global_index"})
        index_names = ("global_index",)

    # De-duplicate while keeping the caller's order.
    target_indexes = list(dict.fromkeys(index_names))

    # Validate BEFORE writing, so a rejected insert leaves no orphan data.
    for index_name in target_indexes:
        if index_name not in self.index_info:
            self.logger.warning(f"指定的索引 '{index_name}' 不存在,插入操作失败")
            return "0"  # "0" signals failure

    stable_id = self._get_stable_id(raw_text)
    self.text_storage.store(stable_id, raw_text)

    if metadata:
        # Register every metadata field the storage has not seen yet.
        for field_name in metadata.keys():
            if not self.metadata_storage.has_field(field_name):
                self.metadata_storage.add_field(field_name)
        self.metadata_storage.store(stable_id, metadata)

    for index_name in target_indexes:
        # Each index is encoded with its own embedding model.
        if index_name in self.index_embedding_model:
            embedding_model = self.index_embedding_model[index_name]
        else:
            self.logger.warning(f"索引 '{index_name}' 没有对应的embedding模型,使用默认模型")
            embedding_model = self.default_embedding_model

        embedding = embedding_model.encode(raw_text)

        # Objects exposing detach()/cpu() (e.g. PyTorch tensors) are moved to
        # NumPy first; everything then becomes a fresh float32 array.
        if hasattr(embedding, "detach") and hasattr(embedding, "cpu"):
            embedding = embedding.detach().cpu().numpy()
        embedding = np.array(embedding, dtype=np.float32)

        expected_dim = self.index_info[index_name]["dim"]
        if embedding.shape[-1] != expected_dim:
            self.logger.warning(f"索引 '{index_name}' 要求维度 {expected_dim},但embedding维度为 {embedding.shape[-1]},跳过插入")
            continue

        index = self.index_info[index_name]["index"]
        index.insert(embedding, stable_id)

    return stable_id
|
|
453
|
+
|
|
454
|
+
# 单条文本更新(指定索引更新)
|
|
455
|
+
def update(
    self,
    former_data: str,
    new_data: str,
    new_metadata: Optional[Dict[str, Any]] = None,
    *index_names: str
) -> str:
    """
    Replace a stored text with new text.

    The old entry is removed from storage and from every index, then the new
    text is inserted through ``insert``. The target indexes are checked
    first, so a bad index name cannot destroy the original entry.

    Args:
        former_data: Text currently stored.
        new_data: Replacement text.
        new_metadata: Metadata for the new text. The old metadata is not
            carried over.
        *index_names: Indexes to insert the new text into; passed to ``insert``.

    Returns:
        The value returned by ``insert``, or ``"0"`` (nothing changed) if a
        requested index does not exist.

    Raises:
        ValueError: If ``former_data`` is not in text storage.
    """
    old_id = self._get_stable_id(former_data)
    if not self.text_storage.has(old_id):
        raise ValueError("Original data not found.")

    # Validate before deleting: insert() rejects unknown indexes with "0",
    # and by then the original entry would already be gone.
    for index_name in index_names:
        if index_name not in self.index_info:
            self.logger.warning(
                f"Index '{index_name}' does not exist; update aborted, original data kept"
            )
            return "0"

    self.text_storage.delete(old_id)
    self.metadata_storage.delete(old_id)

    # The old entry may be in any index, so remove it from all of them.
    for index_info in self.index_info.values():
        index_info["index"].delete(old_id)

    return self.insert(new_data, new_metadata, *index_names)
|
|
473
|
+
|
|
474
|
+
# 单条文本删除(全索引删除)
|
|
475
|
+
def delete(self, raw_text: str):
    """
    Remove a text from text storage, metadata storage and every index.

    Args:
        raw_text: The text to remove; its stable id is the key everywhere.
    """
    target_id = self._get_stable_id(raw_text)

    self.text_storage.delete(target_id)
    self.metadata_storage.delete(target_id)

    # Every registered index is asked to drop the id.
    for entry in self.index_info.values():
        entry["index"].delete(target_id)
|
|
482
|
+
|
|
483
|
+
def retrieve(
    self,
    raw_data: str,
    topk: Optional[int] = None,
    index_name: Optional[str] = None,
    threshold: Optional[float] = None,
    with_metadata: bool = False,
    metadata_filter_func: Optional[Callable[[Dict[str, Any]], bool]] = None,
    **metadata_conditions
):
    """
    Search an index for the entries most similar to ``raw_data``.

    Args:
        raw_data: Query text, encoded with the index's embedding model.
        topk: Maximum number of results. When None, the index's own ``topk``
            is used, falling back to the collection default.
        index_name: Index to search. When None, ``global_index`` is used and
            created on demand.
        threshold: Passed through to the index's ``search``.
        with_metadata: Return ``{"text", "metadata"}`` dicts instead of texts.
        metadata_filter_func: Optional predicate applied to the hits through
            ``filter_ids``.
        **metadata_conditions: Key/value conditions applied through ``filter_ids``.

    Returns:
        A list of at most ``topk`` texts, or dicts when ``with_metadata`` is set.

    Raises:
        ValueError: If the requested index does not exist.
    """
    self.logger.debug(f"VDBMemoryCollection: retrieve called")

    # Without an explicit index, use the global one and create it if needed.
    if index_name is None:
        index_name = "global_index"
        if index_name not in self.index_info:
            self.logger.info(f"Creating global index: {index_name}")
            config = {"name": index_name}
            self.create_index(config=config, metadata_filter_func=metadata_filter_func, **metadata_conditions)

    if index_name not in self.index_info:
        raise ValueError(f"Index '{index_name}' does not exist.")

    info = self.index_info[index_name]

    if topk is None:
        # Prefer the topk stored with the index over the collection default.
        index_topk = info.get("topk")
        topk = int(index_topk if index_topk is not None else self.default_topk)

    if index_name in self.index_embedding_model:
        embedding_model = self.index_embedding_model[index_name]
    else:
        self.logger.warning(f"索引 '{index_name}' 没有对应的embedding模型,使用默认模型")
        embedding_model = self.default_embedding_model

    query_embedding = embedding_model.encode(raw_data)

    # Objects exposing detach()/cpu() (e.g. PyTorch tensors) are moved to
    # NumPy first; everything then becomes a fresh float32 array.
    if hasattr(query_embedding, "detach") and hasattr(query_embedding, "cpu"):
        query_embedding = query_embedding.detach().cpu().numpy()
    query_embedding = np.array(query_embedding, dtype=np.float32)

    sub_index = info["index"]
    # Fetch twice as many hits so enough remain after metadata filtering.
    search_topk = topk * 2
    top_k_ids, _distances = sub_index.search(query_embedding, topk=search_topk, threshold=threshold)

    # Flatten a batched (nested) result. len() is used instead of
    # truthiness because "if array:" raises for NumPy arrays with more than
    # one element.
    if len(top_k_ids) > 0 and isinstance(top_k_ids[0], (list, np.ndarray)):
        top_k_ids = top_k_ids[0]
    top_k_ids = [str(i) for i in top_k_ids]

    if metadata_filter_func or metadata_conditions:
        filtered_ids = self.filter_ids(top_k_ids, metadata_filter_func, **metadata_conditions)
    else:
        filtered_ids = top_k_ids

    # Return whatever was found, capped at topk.
    final_ids = filtered_ids[:topk]

    # Fewer hits than requested is worth noting but is not an error.
    if len(final_ids) < topk:
        self.logger.info(f"Retrieved {len(final_ids)} results (requested {topk})")

    if with_metadata:
        return [{"text": self.text_storage.get(i), "metadata": self.metadata_storage.get(i)} for i in final_ids]
    return [self.text_storage.get(i) for i in final_ids]
|
|
561
|
+
|
|
562
|
+
def update_index(self, index_name: str):
    """
    Rebuild an index from the data currently held in storage.

    The index is deleted, recreated with its recorded settings and metadata
    filter, and then filled in one batch with every stored item that passes
    that filter.

    Args:
        index_name: Name of the index to rebuild.

    Raises:
        ValueError: If the index does not exist.
    """
    if index_name not in self.index_info:
        raise ValueError(f"Index '{index_name}' does not exist.")

    # Snapshot the settings before the index is dropped.
    info = self.index_info[index_name]
    original_config = {"name": index_name}
    for config_key, info_key in (
        ("backend_type", "backend_type"),
        ("description", "description"),
        ("embedding_model", "embedding_model_name"),
        ("dim", "dim"),
        ("topk", "topk"),
    ):
        original_config[config_key] = info[info_key]
    keep_func = info.get("metadata_filter_func")
    keep_conditions = info.get("metadata_conditions", {})

    self.logger.info(f"开始更新索引: {index_name}")

    # Drop and recreate the index with the same settings.
    self.delete_index(index_name)
    self.create_index(
        config=original_config,
        metadata_filter_func=keep_func,
        **keep_conditions
    )

    all_ids = self.get_all_ids()
    if not all_ids:
        self.logger.warning("storage中没有数据,索引更新完成但为空")
        return

    def _passes_filter(item_id):
        # True when the item's metadata satisfies the recorded predicate and
        # every recorded key/value condition.
        metadata = self.metadata_storage.get(item_id)
        if keep_func and not keep_func(metadata or {}):
            return False
        for key, value in keep_conditions.items():
            if not metadata or metadata.get(key) != value:
                return False
        return True

    if keep_func or keep_conditions:
        filtered_ids = [item_id for item_id in all_ids if _passes_filter(item_id)]
    else:
        filtered_ids = all_ids

    embedding_model = self.index_embedding_model[index_name]

    # Encode every item that still has text.
    vectors = []
    valid_ids = []
    for item_id in filtered_ids:
        text = self.text_storage.get(item_id)
        if not text:
            continue

        embedding = embedding_model.encode(text)
        # Objects exposing detach()/cpu() (e.g. PyTorch tensors) are moved to
        # NumPy first; anything else that is not an array is converted.
        if hasattr(embedding, "detach") and hasattr(embedding, "cpu"):
            embedding = embedding.detach().cpu().numpy()
        if not isinstance(embedding, np.ndarray):
            embedding = np.array(embedding)

        vectors.append(embedding.astype(np.float32))
        valid_ids.append(item_id)

    if not (vectors and valid_ids):
        self.logger.warning(f"索引 '{index_name}' 更新完成,但没有找到符合条件的数据")
        return

    # Insert everything with one call to the index's batch_insert.
    index = self.index_info[index_name]["index"]
    result = index.batch_insert(vectors, valid_ids)
    self.logger.info(f"索引 '{index_name}' 更新完成,插入了 {result} 条数据")
|
|
656
|
+
|
|
657
|
+
if __name__ == "__main__":
|
|
658
|
+
import torch
|
|
659
|
+
from transformers import AutoTokenizer, AutoModel
|
|
660
|
+
import shutil
|
|
661
|
+
import tempfile
|
|
662
|
+
|
|
663
|
+
def colored(text, color):
|
|
664
|
+
colors = {"green": "\033[92m", "red": "\033[91m", "yellow": "\033[93m", "reset": "\033[0m"}
|
|
665
|
+
return colors.get(color, "") + str(text) + colors["reset"]
|
|
666
|
+
|
|
667
|
+
class MockEmbeddingModel:
|
|
668
|
+
def encode(self, text):
|
|
669
|
+
# 模拟embedding,将文本长度作为特征
|
|
670
|
+
return torch.tensor([float(len(text))] * 4) # 4维向量
|
|
671
|
+
|
|
672
|
+
def run_test():
|
|
673
|
+
print(colored("\n=== 开始VDBMemoryCollection测试 ===", "yellow"))
|
|
674
|
+
|
|
675
|
+
# 准备测试环境
|
|
676
|
+
test_name = "test_collection"
|
|
677
|
+
test_dir = tempfile.mkdtemp()
|
|
678
|
+
|
|
679
|
+
try:
|
|
680
|
+
# 1. 测试新的初始化方式
|
|
681
|
+
print(colored("\n1. 测试新的初始化方式", "yellow"))
|
|
682
|
+
|
|
683
|
+
# 方式1:通过config创建,使用mockembedder确保维度一致
|
|
684
|
+
config = {
|
|
685
|
+
"name": test_name,
|
|
686
|
+
"default_embedding_model": "mockembedder",
|
|
687
|
+
"default_dim": 128, # 与mockembedder的固定维度一致
|
|
688
|
+
"default_topk": 5,
|
|
689
|
+
"default_vdb_backend": "FAISS"
|
|
690
|
+
}
|
|
691
|
+
collection = VDBMemoryCollection(config=config)
|
|
692
|
+
print(colored("✓ 通过config初始化成功", "green"))
|
|
693
|
+
|
|
694
|
+
# 方式2:测试batch_insert_data
|
|
695
|
+
corpus = ["第一条文本", "第二条文本", "第三条文本"]
|
|
696
|
+
config_with_corpus = {
|
|
697
|
+
"name": f"{test_name}_corpus",
|
|
698
|
+
"default_embedding_model": "mockembedder",
|
|
699
|
+
"default_dim": 128,
|
|
700
|
+
"default_topk": 5,
|
|
701
|
+
"default_vdb_backend": "FAISS"
|
|
702
|
+
}
|
|
703
|
+
collection_with_corpus = VDBMemoryCollection(config=config_with_corpus)
|
|
704
|
+
collection_with_corpus.batch_insert_data(corpus)
|
|
705
|
+
# batch_insert_data 只存储数据,需要手动插入到索引
|
|
706
|
+
for text in corpus:
|
|
707
|
+
collection_with_corpus.insert(text)
|
|
708
|
+
print(colored("✓ 通过batch_insert_data成功", "green"))
|
|
709
|
+
|
|
710
|
+
# 2. 测试插入
|
|
711
|
+
print(colored("\n2. 测试数据插入", "yellow"))
|
|
712
|
+
texts = [
|
|
713
|
+
"这是第一条测试文本",
|
|
714
|
+
"这是第二条测试文本,带有metadata",
|
|
715
|
+
"这是第三条测试文本"
|
|
716
|
+
]
|
|
717
|
+
metadata = {"type": "test", "priority": "high"}
|
|
718
|
+
|
|
719
|
+
# 插入文本,不指定索引(应该会使用global_index)
|
|
720
|
+
id1 = collection.insert(texts[0])
|
|
721
|
+
# 插入文本,带metadata
|
|
722
|
+
id2 = collection.insert(texts[1], metadata=metadata)
|
|
723
|
+
# 插入文本到指定索引
|
|
724
|
+
collection.create_index(config={"name": "custom_index", "description": "自定义测试索引"})
|
|
725
|
+
id3 = collection.insert(texts[2], None, "custom_index")
|
|
726
|
+
|
|
727
|
+
print(colored("✓ 数据插入成功", "green"))
|
|
728
|
+
|
|
729
|
+
# 3. 测试检索
|
|
730
|
+
print(colored("\n3. 测试检索功能", "yellow"))
|
|
731
|
+
|
|
732
|
+
# 测试全局索引检索
|
|
733
|
+
results = collection.retrieve("测试文本", topk=2)
|
|
734
|
+
print(f"全局索引检索结果数量: {len(results)}")
|
|
735
|
+
assert len(results) > 0, "全局索引检索失败"
|
|
736
|
+
|
|
737
|
+
# 测试指定索引检索
|
|
738
|
+
results = collection.retrieve("测试文本", topk=2, index_name="custom_index")
|
|
739
|
+
print(f"自定义索引检索结果数量: {len(results)}")
|
|
740
|
+
assert len(results) > 0, "自定义索引检索失败"
|
|
741
|
+
|
|
742
|
+
# 测试带metadata的检索
|
|
743
|
+
results = collection.retrieve(
|
|
744
|
+
"测试文本",
|
|
745
|
+
topk=2,
|
|
746
|
+
with_metadata=True,
|
|
747
|
+
metadata_filter_func=lambda m: m and m.get("priority") == "high"
|
|
748
|
+
)
|
|
749
|
+
assert any(r.get("metadata", {}).get("priority") == "high" for r in results if isinstance(r, dict)), "metadata过滤失败"
|
|
750
|
+
|
|
751
|
+
print(colored("✓ 检索功能测试通过", "green"))
|
|
752
|
+
|
|
753
|
+
# 4. 测试更新和删除
|
|
754
|
+
print(colored("\n4. 测试更新和删除", "yellow"))
|
|
755
|
+
|
|
756
|
+
# 测试更新
|
|
757
|
+
new_text = "这是更新后的文本"
|
|
758
|
+
collection.update(texts[0], new_text)
|
|
759
|
+
results = collection.retrieve(new_text, topk=1)
|
|
760
|
+
assert results[0] == new_text, "更新操作失败"
|
|
761
|
+
|
|
762
|
+
# 测试删除
|
|
763
|
+
collection.delete(texts[1])
|
|
764
|
+
|
|
765
|
+
print(colored("✓ 更新和删除功能测试通过", "green"))
|
|
766
|
+
|
|
767
|
+
# 5. 测试持久化
|
|
768
|
+
print(colored("\n5. 测试持久化", "yellow"))
|
|
769
|
+
|
|
770
|
+
# 保存
|
|
771
|
+
save_path = os.path.join(test_dir, "save_test")
|
|
772
|
+
collection.store(save_path)
|
|
773
|
+
|
|
774
|
+
# 测试新的load方式
|
|
775
|
+
collection_dir = os.path.join(save_path, "vdb_collection", test_name)
|
|
776
|
+
loaded_collection = VDBMemoryCollection.load(test_name, collection_dir)
|
|
777
|
+
results = loaded_collection.retrieve("测试文本", topk=1)
|
|
778
|
+
assert len(results) > 0, "持久化后检索失败"
|
|
779
|
+
|
|
780
|
+
print(colored("✓ 持久化功能测试通过", "green"))
|
|
781
|
+
|
|
782
|
+
# 6. 测试batch_insert_data功能
|
|
783
|
+
print(colored("\n6. 测试batch_insert_data功能", "yellow"))
|
|
784
|
+
corpus_results = collection_with_corpus.retrieve("文本", topk=3)
|
|
785
|
+
print(f"从batch_insert_data的集合检索结果数量: {len(corpus_results)}")
|
|
786
|
+
assert len(corpus_results) > 0, "batch_insert_data集合检索失败"
|
|
787
|
+
print(colored("✓ batch_insert_data功能测试通过", "green"))
|
|
788
|
+
|
|
789
|
+
# 7. 测试update_index功能
|
|
790
|
+
print(colored("\n7. 测试update_index功能", "yellow"))
|
|
791
|
+
# 向collection中添加更多数据
|
|
792
|
+
collection.insert("测试update_index的文本1")
|
|
793
|
+
collection.insert("测试update_index的文本2")
|
|
794
|
+
# 更新global_index
|
|
795
|
+
collection.update_index("global_index")
|
|
796
|
+
results = collection.retrieve("update_index", topk=5)
|
|
797
|
+
print(f"update_index后检索结果数量: {len(results)}")
|
|
798
|
+
print(colored("✓ update_index功能测试通过", "green"))
|
|
799
|
+
|
|
800
|
+
# 8. 测试lambda函数在持久化后的工作情况
|
|
801
|
+
print(colored("\n8. 测试lambda函数持久化", "yellow"))
|
|
802
|
+
|
|
803
|
+
# 创建一个新的collection专门测试lambda
|
|
804
|
+
lambda_test_name = "lambda_test_collection"
|
|
805
|
+
lambda_config = {
|
|
806
|
+
"name": lambda_test_name,
|
|
807
|
+
"default_embedding_model": "mockembedder",
|
|
808
|
+
"default_dim": 128,
|
|
809
|
+
"default_topk": 5,
|
|
810
|
+
"default_vdb_backend": "FAISS"
|
|
811
|
+
}
|
|
812
|
+
lambda_collection = VDBMemoryCollection(config=lambda_config)
|
|
813
|
+
|
|
814
|
+
# 添加测试数据
|
|
815
|
+
test_data = [
|
|
816
|
+
("重要文档1", {"priority": "high", "category": "important"}),
|
|
817
|
+
("普通文档2", {"priority": "low", "category": "normal"}),
|
|
818
|
+
("重要文档3", {"priority": "high", "category": "important"}),
|
|
819
|
+
("普通文档4", {"priority": "medium", "category": "normal"}),
|
|
820
|
+
]
|
|
821
|
+
|
|
822
|
+
for text, metadata in test_data:
|
|
823
|
+
lambda_collection.insert(text, metadata)
|
|
824
|
+
|
|
825
|
+
# 创建一个带有lambda函数的索引
|
|
826
|
+
high_priority_filter = lambda m: m and m.get("priority") == "high"
|
|
827
|
+
lambda_collection.create_index(
|
|
828
|
+
config={
|
|
829
|
+
"name": "high_priority_index",
|
|
830
|
+
"description": "只包含高优先级文档的索引"
|
|
831
|
+
},
|
|
832
|
+
metadata_filter_func=high_priority_filter
|
|
833
|
+
)
|
|
834
|
+
|
|
835
|
+
# 向带lambda的索引插入数据
|
|
836
|
+
for text, metadata in test_data:
|
|
837
|
+
if high_priority_filter(metadata):
|
|
838
|
+
lambda_collection.insert(text, metadata, "high_priority_index")
|
|
839
|
+
|
|
840
|
+
# 测试保存前的lambda过滤效果
|
|
841
|
+
print("保存前测试lambda过滤...")
|
|
842
|
+
before_save_results = lambda_collection.retrieve(
|
|
843
|
+
"文档",
|
|
844
|
+
topk=10,
|
|
845
|
+
index_name="high_priority_index"
|
|
846
|
+
)
|
|
847
|
+
print(f"保存前高优先级索引检索到 {len(before_save_results)} 条结果")
|
|
848
|
+
assert len(before_save_results) == 2, f"预期2条高优先级结果,实际得到{len(before_save_results)}条"
|
|
849
|
+
|
|
850
|
+
# 保存collection
|
|
851
|
+
lambda_save_path = os.path.join(test_dir, "lambda_save_test")
|
|
852
|
+
lambda_collection.store(lambda_save_path)
|
|
853
|
+
print("lambda collection保存完成")
|
|
854
|
+
|
|
855
|
+
# 加载collection
|
|
856
|
+
lambda_collection_dir = os.path.join(lambda_save_path, "vdb_collection", lambda_test_name)
|
|
857
|
+
loaded_lambda_collection = VDBMemoryCollection.load(lambda_test_name, lambda_collection_dir)
|
|
858
|
+
print("lambda collection加载完成")
|
|
859
|
+
|
|
860
|
+
# 测试加载后的lambda过滤效果
|
|
861
|
+
print("加载后测试lambda过滤...")
|
|
862
|
+
after_load_results = loaded_lambda_collection.retrieve(
|
|
863
|
+
"文档",
|
|
864
|
+
topk=10,
|
|
865
|
+
index_name="high_priority_index"
|
|
866
|
+
)
|
|
867
|
+
print(f"加载后高优先级索引检索到 {len(after_load_results)} 条结果")
|
|
868
|
+
|
|
869
|
+
# 验证结果是否一致
|
|
870
|
+
assert len(after_load_results) == len(before_save_results), \
|
|
871
|
+
f"加载后结果数量不一致:保存前{len(before_save_results)}条,加载后{len(after_load_results)}条"
|
|
872
|
+
|
|
873
|
+
# 测试lambda函数在检索时的工作情况
|
|
874
|
+
print("测试加载后lambda函数在检索时的过滤效果...")
|
|
875
|
+
filtered_results = loaded_lambda_collection.retrieve(
|
|
876
|
+
"文档",
|
|
877
|
+
topk=10,
|
|
878
|
+
with_metadata=True,
|
|
879
|
+
metadata_filter_func=lambda m: m and m.get("priority") == "high"
|
|
880
|
+
)
|
|
881
|
+
|
|
882
|
+
high_priority_count = sum(1 for r in filtered_results
|
|
883
|
+
if isinstance(r, dict) and
|
|
884
|
+
r.get("metadata", {}).get("priority") == "high")
|
|
885
|
+
print(f"通过lambda过滤检索到 {high_priority_count} 条高优先级结果")
|
|
886
|
+
assert high_priority_count > 0, "lambda函数在加载后没有正确工作"
|
|
887
|
+
|
|
888
|
+
# 测试update_index是否能正确处理lambda函数
|
|
889
|
+
print("测试update_index对lambda函数的处理...")
|
|
890
|
+
loaded_lambda_collection.insert("新增重要文档", {"priority": "high", "category": "new"})
|
|
891
|
+
loaded_lambda_collection.update_index("high_priority_index")
|
|
892
|
+
|
|
893
|
+
updated_results = loaded_lambda_collection.retrieve(
|
|
894
|
+
"文档",
|
|
895
|
+
topk=10,
|
|
896
|
+
index_name="high_priority_index"
|
|
897
|
+
)
|
|
898
|
+
print(f"update_index后高优先级索引检索到 {len(updated_results)} 条结果")
|
|
899
|
+
# 应该包含新增的重要文档,所以结果应该增加
|
|
900
|
+
assert len(updated_results) >= len(after_load_results), \
|
|
901
|
+
"update_index后lambda函数没有正确处理新数据"
|
|
902
|
+
|
|
903
|
+
print(colored("✓ lambda函数持久化测试通过", "green"))
|
|
904
|
+
print(" - lambda函数序列化/反序列化正常")
|
|
905
|
+
print(" - 持久化后检索过滤功能正常")
|
|
906
|
+
print(" - update_index正确处理lambda函数")
|
|
907
|
+
|
|
908
|
+
print(colored("\n=== 所有测试通过!===", "green"))
|
|
909
|
+
|
|
910
|
+
except Exception as e:
|
|
911
|
+
print(colored(f"\n测试失败: {str(e)}", "red"))
|
|
912
|
+
import traceback
|
|
913
|
+
traceback.print_exc()
|
|
914
|
+
raise
|
|
915
|
+
finally:
|
|
916
|
+
# 清理测试数据
|
|
917
|
+
try:
|
|
918
|
+
shutil.rmtree(test_dir)
|
|
919
|
+
except:
|
|
920
|
+
pass
|
|
921
|
+
|
|
922
|
+
run_test()  # Execute the VDBMemoryCollection self-test sequence defined above.
|