llama-stack 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- llama_stack/cli/stack/_list_deps.py +11 -7
- llama_stack/cli/stack/run.py +3 -25
- llama_stack/core/access_control/datatypes.py +78 -0
- llama_stack/core/configure.py +2 -2
- {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
- llama_stack/core/connectors/connectors.py +162 -0
- llama_stack/core/conversations/conversations.py +61 -58
- llama_stack/core/datatypes.py +54 -8
- llama_stack/core/library_client.py +60 -13
- llama_stack/core/prompts/prompts.py +43 -42
- llama_stack/core/routers/datasets.py +20 -17
- llama_stack/core/routers/eval_scoring.py +143 -53
- llama_stack/core/routers/inference.py +20 -9
- llama_stack/core/routers/safety.py +30 -42
- llama_stack/core/routers/vector_io.py +15 -7
- llama_stack/core/routing_tables/models.py +42 -3
- llama_stack/core/routing_tables/scoring_functions.py +19 -19
- llama_stack/core/routing_tables/shields.py +20 -17
- llama_stack/core/routing_tables/vector_stores.py +8 -5
- llama_stack/core/server/auth.py +192 -17
- llama_stack/core/server/fastapi_router_registry.py +40 -5
- llama_stack/core/server/server.py +24 -5
- llama_stack/core/stack.py +54 -10
- llama_stack/core/storage/datatypes.py +9 -0
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/exec.py +2 -2
- llama_stack/core/utils/type_inspection.py +16 -2
- llama_stack/distributions/dell/config.yaml +4 -1
- llama_stack/distributions/dell/doc_template.md +209 -0
- llama_stack/distributions/dell/run-with-safety.yaml +4 -1
- llama_stack/distributions/nvidia/config.yaml +4 -1
- llama_stack/distributions/nvidia/doc_template.md +170 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
- llama_stack/distributions/oci/config.yaml +4 -1
- llama_stack/distributions/oci/doc_template.md +140 -0
- llama_stack/distributions/open-benchmark/config.yaml +9 -1
- llama_stack/distributions/postgres-demo/config.yaml +1 -1
- llama_stack/distributions/starter/build.yaml +62 -0
- llama_stack/distributions/starter/config.yaml +22 -3
- llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/starter/starter.py +13 -1
- llama_stack/distributions/starter-gpu/build.yaml +62 -0
- llama_stack/distributions/starter-gpu/config.yaml +22 -3
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/template.py +10 -2
- llama_stack/distributions/watsonx/config.yaml +4 -1
- llama_stack/log.py +1 -0
- llama_stack/models/llama/resources/dog.jpg +0 -0
- llama_stack/models/llama/resources/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/small_dog.jpg +0 -0
- llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +187 -60
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
- llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
- llama_stack/providers/inline/batches/reference/batches.py +2 -1
- llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
- llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
- llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
- llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
- llama_stack/providers/registry/agents.py +1 -0
- llama_stack/providers/registry/inference.py +1 -9
- llama_stack/providers/registry/vector_io.py +136 -16
- llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
- llama_stack/providers/remote/eval/nvidia/README.md +134 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
- llama_stack/providers/remote/files/s3/README.md +266 -0
- llama_stack/providers/remote/files/s3/config.py +5 -3
- llama_stack/providers/remote/files/s3/files.py +2 -2
- llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
- llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
- llama_stack/providers/remote/inference/openai/openai.py +2 -0
- llama_stack/providers/remote/inference/together/together.py +4 -0
- llama_stack/providers/remote/inference/vertexai/config.py +3 -3
- llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
- llama_stack/providers/remote/inference/vllm/config.py +37 -18
- llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
- llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
- llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
- llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
- llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
- llama_stack/providers/remote/safety/nvidia/README.md +78 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
- llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
- llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
- llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
- llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
- llama_stack/providers/remote/vector_io/oci/config.py +41 -0
- llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
- llama_stack/providers/utils/bedrock/client.py +3 -3
- llama_stack/providers/utils/bedrock/config.py +7 -7
- llama_stack/providers/utils/inference/__init__.py +0 -25
- llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
- llama_stack/providers/utils/inference/http_client.py +239 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
- llama_stack/providers/utils/inference/model_registry.py +148 -2
- llama_stack/providers/utils/inference/openai_compat.py +1 -158
- llama_stack/providers/utils/inference/openai_mixin.py +42 -2
- llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
- llama_stack/providers/utils/memory/vector_store.py +46 -19
- llama_stack/providers/utils/responses/responses_store.py +40 -6
- llama_stack/providers/utils/safety.py +114 -0
- llama_stack/providers/utils/tools/mcp.py +44 -3
- llama_stack/testing/api_recorder.py +9 -3
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/RECORD +135 -279
- llama_stack-0.5.0.dist-info/top_level.txt +1 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
- llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
- llama_stack/models/llama/hadamard_utils.py +0 -88
- llama_stack/models/llama/llama3/args.py +0 -74
- llama_stack/models/llama/llama3/generation.py +0 -378
- llama_stack/models/llama/llama3/model.py +0 -304
- llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
- llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
- llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
- llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
- llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama3/quantization/loader.py +0 -316
- llama_stack/models/llama/llama3_1/__init__.py +0 -12
- llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
- llama_stack/models/llama/llama3_1/prompts.py +0 -258
- llama_stack/models/llama/llama3_2/__init__.py +0 -5
- llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
- llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
- llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
- llama_stack/models/llama/llama3_3/__init__.py +0 -5
- llama_stack/models/llama/llama3_3/prompts.py +0 -259
- llama_stack/models/llama/llama4/args.py +0 -107
- llama_stack/models/llama/llama4/ffn.py +0 -58
- llama_stack/models/llama/llama4/moe.py +0 -214
- llama_stack/models/llama/llama4/preprocess.py +0 -435
- llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama4/quantization/loader.py +0 -226
- llama_stack/models/llama/llama4/vision/__init__.py +0 -5
- llama_stack/models/llama/llama4/vision/embedding.py +0 -210
- llama_stack/models/llama/llama4/vision/encoder.py +0 -412
- llama_stack/models/llama/quantize_impls.py +0 -316
- llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
- llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
- llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
- llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
- llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
- llama_stack-0.4.3.dist-info/top_level.txt +0 -2
- llama_stack_api/__init__.py +0 -945
- llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/admin/api.py +0 -72
- llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/admin/models.py +0 -113
- llama_stack_api/agents.py +0 -173
- llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/batches/api.py +0 -53
- llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/batches/models.py +0 -78
- llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/common/errors.py +0 -95
- llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/common/responses.py +0 -77
- llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/connectors.py +0 -146
- llama_stack_api/conversations.py +0 -270
- llama_stack_api/datasetio.py +0 -55
- llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/datatypes.py +0 -373
- llama_stack_api/eval.py +0 -137
- llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/files/api.py +0 -51
- llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/files/models.py +0 -107
- llama_stack_api/inference.py +0 -1169
- llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/internal/kvstore.py +0 -28
- llama_stack_api/internal/sqlstore.py +0 -81
- llama_stack_api/llama_stack_api/__init__.py +0 -945
- llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/llama_stack_api/admin/api.py +0 -72
- llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/llama_stack_api/admin/models.py +0 -113
- llama_stack_api/llama_stack_api/agents.py +0 -173
- llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/llama_stack_api/batches/api.py +0 -53
- llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/llama_stack_api/batches/models.py +0 -78
- llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/llama_stack_api/common/errors.py +0 -95
- llama_stack_api/llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/llama_stack_api/common/responses.py +0 -77
- llama_stack_api/llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/llama_stack_api/connectors.py +0 -146
- llama_stack_api/llama_stack_api/conversations.py +0 -270
- llama_stack_api/llama_stack_api/datasetio.py +0 -55
- llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/llama_stack_api/datatypes.py +0 -373
- llama_stack_api/llama_stack_api/eval.py +0 -137
- llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/llama_stack_api/files/api.py +0 -51
- llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/llama_stack_api/files/models.py +0 -107
- llama_stack_api/llama_stack_api/inference.py +0 -1169
- llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
- llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
- llama_stack_api/llama_stack_api/models.py +0 -171
- llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/llama_stack_api/post_training.py +0 -370
- llama_stack_api/llama_stack_api/prompts.py +0 -203
- llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/llama_stack_api/providers/api.py +0 -16
- llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/llama_stack_api/providers/models.py +0 -24
- llama_stack_api/llama_stack_api/py.typed +0 -0
- llama_stack_api/llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/llama_stack_api/resource.py +0 -37
- llama_stack_api/llama_stack_api/router_utils.py +0 -160
- llama_stack_api/llama_stack_api/safety.py +0 -132
- llama_stack_api/llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/llama_stack_api/scoring.py +0 -93
- llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/llama_stack_api/shields.py +0 -93
- llama_stack_api/llama_stack_api/tools.py +0 -226
- llama_stack_api/llama_stack_api/vector_io.py +0 -941
- llama_stack_api/llama_stack_api/vector_stores.py +0 -53
- llama_stack_api/llama_stack_api/version.py +0 -9
- llama_stack_api/models.py +0 -171
- llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/post_training.py +0 -370
- llama_stack_api/prompts.py +0 -203
- llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/providers/api.py +0 -16
- llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/providers/models.py +0 -24
- llama_stack_api/py.typed +0 -0
- llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/resource.py +0 -37
- llama_stack_api/router_utils.py +0 -160
- llama_stack_api/safety.py +0 -132
- llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/scoring.py +0 -93
- llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/shields.py +0 -93
- llama_stack_api/tools.py +0 -226
- llama_stack_api/vector_io.py +0 -941
- llama_stack_api/vector_stores.py +0 -53
- llama_stack_api/version.py +0 -9
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
llama_stack/providers/remote/vector_io/pgvector/pgvector.py

```diff
@@ -37,7 +37,7 @@ from llama_stack_api import (
 )
 from llama_stack_api.internal.kvstore import KVStore
 
-from .config import PGVectorVectorIOConfig
+from .config import PGVectorIndexConfig, PGVectorIndexType, PGVectorVectorIOConfig
 
 log = get_logger(name=__name__, category="vector_io::pgvector")
 
@@ -81,6 +81,26 @@ def upsert_models(conn, keys_models: list[tuple[str, BaseModel]]):
     execute_values(cur, query, values, template="(%s, %s)")
 
 
+def remove_vector_store_metadata(conn: psycopg2.extensions.connection, vector_store_id: str) -> None:
+    """
+    Performs removal of vector store metadata from PGVector metadata_store table when vector store is unregistered
+
+    Args:
+        conn: active PostgreSQL connection
+        vector_store_id: identifier of VectorStore resource
+    """
+    try:
+        with conn.cursor() as cur:
+            cur.execute("DELETE FROM metadata_store WHERE key = %s", (vector_store_id,))
+            if cur.rowcount > 0:
+                log.info(f"Removed metadata for vector store '{vector_store_id}' from PGVector metadata_store table.")
+
+    except Exception as e:
+        raise RuntimeError(
+            f"Error removing metadata from PGVector metadata_store for vector_store: {vector_store_id}"
+        ) from e
+
+
 def load_models(cur, cls):
     cur.execute("SELECT key, data FROM metadata_store")
     rows = cur.fetchall()
@@ -89,22 +109,35 @@ def load_models(cur, cls):
 
 class PGVectorIndex(EmbeddingIndex):
     # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#querying
+    # Llama Stack supports only search functions that are applied for embeddings with vector type
     PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION: dict[str, str] = {
         "L2": "<->",
         "L1": "<+>",
         "COSINE": "<=>",
         "INNER_PRODUCT": "<#>",
-        "HAMMING": "<~>",
-        "JACCARD": "<%>",
     }
 
+    # reference: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw
+    # Llama Stack supports only index operator classes that are applied for embeddings with vector type
+    PGVECTOR_DISTANCE_METRIC_TO_INDEX_OPERATOR_CLASS: dict[str, str] = {
+        "L2": "vector_l2_ops",
+        "L1": "vector_l1_ops",
+        "COSINE": "vector_cosine_ops",
+        "INNER_PRODUCT": "vector_ip_ops",
+    }
+
+    # pgvector's maximum embedding dimension for HNSW/IVFFlat indexes on column with type vector
+    # references: https://github.com/pgvector/pgvector?tab=readme-ov-file#hnsw and https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat
+    MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX = 2000
+
     def __init__(
         self,
         vector_store: VectorStore,
         dimension: int,
        conn: psycopg2.extensions.connection,
+        distance_metric: str,
+        vector_index: PGVectorIndexConfig,
         kvstore: KVStore | None = None,
-        distance_metric: str = "COSINE",
     ):
         self.vector_store = vector_store
         self.dimension = dimension
@@ -112,6 +145,7 @@ class PGVectorIndex(EmbeddingIndex):
         self.kvstore = kvstore
         self.check_distance_metric_availability(distance_metric)
         self.distance_metric = distance_metric
+        self.vector_index = vector_index
         self.table_name = None
 
     async def initialize(self) -> None:
@@ -135,6 +169,28 @@ class PGVectorIndex(EmbeddingIndex):
                 """
             )
 
+            # pgvector's embedding dimensions requirement to create an index for Approximate Nearest Neighbor (ANN) search is up to 2,000 dimensions for column with type vector
+            if self.dimension <= self.MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX:
+                if self.vector_index.type == PGVectorIndexType.HNSW:
+                    await self.create_hnsw_vector_index(cur)
+
+                # Create the index only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+                elif (
+                    self.vector_index.type == PGVectorIndexType.IVFFlat
+                    and not await self.check_conflicting_vector_index_exists(cur)
+                ):
+                    log.info(
+                        f"Creation of {PGVectorIndexType.IVFFlat} vector index in vector_store: {self.vector_store.identifier} was deferred. It will be created when the table has some data."
+                    )
+
+            else:
+                log.info(
+                    f"Skip creation of {self.vector_index.type} vector index for embedding in PGVector for vector_store: {self.vector_store.identifier}"
+                )
+                log.info(
+                    "PGVector requires embedding dimensions are up to 2,000 to successfully create a vector index."
+                )
+
             # Create GIN index for full-text search performance
             cur.execute(
                 f"""
@@ -177,6 +233,13 @@ class PGVectorIndex(EmbeddingIndex):
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
             execute_values(cur, query, values, template="(%s, %s, %s::vector, %s, to_tsvector('english', %s))")
 
+            # Create the IVFFlat index only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+            if (
+                self.vector_index.type == PGVectorIndexType.IVFFlat
+                and self.dimension <= self.MAX_EMBEDDING_DIMENSION_FOR_HNSW_AND_IVFFLAT_INDEX
+            ):
+                await self.create_ivfflat_vector_index(cur)
+
     async def query_vector(self, embedding: NDArray, k: int, score_threshold: float) -> QueryChunksResponse:
         """
         Performs vector similarity search using PostgreSQL's search function. Default distance metric is COSINE.
@@ -192,6 +255,14 @@ class PGVectorIndex(EmbeddingIndex):
         pgvector_search_function = self.get_pgvector_search_function()
 
         with self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
+            # Specify the number of probes to allow PGVector to use Index Scan using IVFFlat index if it was configured (https://github.com/pgvector/pgvector?tab=readme-ov-file#query-options-1)
+            if self.vector_index.type == PGVectorIndexType.IVFFlat:
+                cur.execute(
+                    f"""
+                    SET ivfflat.probes = {self.vector_index.probes};
+                    """
+                )
+
             cur.execute(
                 f"""
                 SELECT document, embedding {pgvector_search_function} %s::vector AS distance
@@ -324,6 +395,14 @@ class PGVectorIndex(EmbeddingIndex):
         # Fix: Use proper tuple parameter binding with explicit array cast
         cur.execute(f"DELETE FROM {self.table_name} WHERE id = ANY(%s::text[])", (chunk_ids,))
 
+    def get_pgvector_index_operator_class(self) -> str:
+        """Get the pgvector index operator class for the current distance metric.
+
+        Returns:
+            The operator class name.
+        """
+        return self.PGVECTOR_DISTANCE_METRIC_TO_INDEX_OPERATOR_CLASS[self.distance_metric]
+
     def get_pgvector_search_function(self) -> str:
         return self.PGVECTOR_DISTANCE_METRIC_TO_SEARCH_FUNCTION[self.distance_metric]
 
@@ -343,6 +422,160 @@ class PGVectorIndex(EmbeddingIndex):
                 f"Supported metrics are: {', '.join(supported_metrics)}"
             )
 
+    async def create_hnsw_vector_index(self, cur: cursor) -> None:
+        """Create PGVector HNSW vector index for Approximate Nearest Neighbor (ANN) search
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Raises:
+            RuntimeError: If the error occurred when creating vector index in PGVector
+        """
+
+        # prevents from creating index for the table that already has conflicting index (HNSW or IVFFlat)
+        if await self.check_conflicting_vector_index_exists(cur):
+            return
+
+        try:
+            index_operator_class = self.get_pgvector_index_operator_class()
+
+            # Create HNSW (Hierarchical Navigable Small Worlds) index on embedding column to allow efficient and performant vector search in pgvector
+            # HNSW finds the approximate nearest neighbors by only calculating distance metric for vectors it visits during graph traversal instead of processing all vectors
+            cur.execute(
+                f"""
+                CREATE INDEX IF NOT EXISTS {self.table_name}_hnsw_idx
+                ON {self.table_name} USING hnsw(embedding {index_operator_class}) WITH (m = {self.vector_index.m}, ef_construction = {self.vector_index.ef_construction});
+                """
+            )
+            log.info(
+                f"{PGVectorIndexType.HNSW} vector index was created with parameters m = {self.vector_index.m}, ef_construction = {self.vector_index.ef_construction} for vector_store: {self.vector_store.identifier}."
+            )
+
+        except psycopg2.Error as e:
+            raise RuntimeError(
+                f"Failed to create {PGVectorIndexType.HNSW} vector index for vector_store: {self.vector_store.identifier}: {e}"
+            ) from e
+
+    async def create_ivfflat_vector_index(self, cur: cursor) -> None:
+        """Create PGVector IVFFlat vector index for Approximate Nearest Neighbor (ANN) search
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Raises:
+            RuntimeError: If the error occurred when creating vector index in PGVector
+        """
+
+        # prevents from creating index for the table that already has conflicting index (HNSW or IVFFlat)
+        if await self.check_conflicting_vector_index_exists(cur):
+            return
+
+        # don't create index too early as it decreases a performance (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+        # create IVFFLAT index only if vector store has rows >= lists * 1000
+        if await self.fetch_number_of_records(cur) < self.vector_index.lists * 1000:
+            log.info(
+                f"IVFFlat index wasn't created for vector_store {self.vector_store.identifier} because table doesn't have enough records."
+            )
+            return
+
+        try:
+            index_operator_class = self.get_pgvector_index_operator_class()
+
+            # Create Inverted File with Flat Compression (IVFFlat) index on embedding column to allow efficient and performant vector search in pgvector
+            # IVFFlat index divides vectors into lists, and then searches a subset of those lists that are closest to the query vector
+            # Index should be created only after the table has some data (https://github.com/pgvector/pgvector?tab=readme-ov-file#ivfflat)
+            cur.execute(
+                f"""
+                CREATE INDEX IF NOT EXISTS {self.table_name}_ivfflat_idx
+                ON {self.table_name} USING ivfflat(embedding {index_operator_class}) WITH (lists = {self.vector_index.lists});
+                """
+            )
+            log.info(
+                f"{PGVectorIndexType.IVFFlat} vector index was created with parameter lists = {self.vector_index.lists} for vector_store: {self.vector_store.identifier}."
+            )
+
+        except psycopg2.Error as e:
+            raise RuntimeError(
+                f"Failed to create {PGVectorIndexType.IVFFlat} vector index for vector_store: {self.vector_store.identifier}: {e}"
+            ) from e
+
+    async def check_conflicting_vector_index_exists(self, cur: cursor) -> bool:
+        """Check if vector index of any type has already been created for the table to prevent the conflict
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Returns:
+            True if exists, otherwise False
+
+        Raises:
+            RuntimeError: If the error occurred when checking vector index exists in PGVector
+        """
+        try:
+            log.info(
+                f"Checking vector_store: {self.vector_store.identifier} for conflicting vector index in PGVector..."
+            )
+            cur.execute(
+                """
+                SELECT indexname FROM pg_indexes
+                WHERE (indexname LIKE %s OR indexname LIKE %s) AND tablename = %s;
+                """,
+                (
+                    "%hnsw%",
+                    "%ivfflat%",
+                    self.table_name,
+                ),
+            )
+            result = cur.fetchone()
+
+            if result:
+                log.warning(
+                    f"Conflicting vector index {result[0]} already exists in vector_store: {self.vector_store.identifier}"
+                )
+                log.warning(
+                    f"vector_store: {self.vector_store.identifier} will continue to use vector index {result[0]} to preserve performance."
+                )
+                return True
+
+            log.info(f"vector_store: {self.vector_store.identifier} currently doesn't have conflicting vector index")
+            log.info(f"Proceeding with creation of vector index for {self.vector_store.identifier}")
+            return False
+
+        except psycopg2.Error as e:
+            raise RuntimeError(f"Failed to check if vector index exists in PGVector: {e}") from e
+
+    async def fetch_number_of_records(self, cur: cursor) -> int:
+        """Returns number of records in a vector store
+
+        Args:
+            cur: PostgreSQL cursor
+
+        Returns:
+            number of records in a vector store
+
+        Raises:
+            RuntimeError: If the error occurred when fetching a number of records in a vector store in PGVector
+        """
+        try:
+            log.info(f"Fetching number of records in vector_store: {self.vector_store.identifier}...")
+            cur.execute(
+                f"""
+                SELECT COUNT(DISTINCT id)
+                FROM {self.table_name};
+                """
+            )
+            result = cur.fetchone()
+
+            if result:
+                log.info(f"vector_store: {self.vector_store.identifier} has {result[0]} records.")
+                return result[0]
+
+            log.info(f"vector_store: {self.vector_store.identifier} currently doesn't have any records.")
+            return 0
+
+        except psycopg2.Error as e:
+            raise RuntimeError(f"Failed to check if vector store has records in PGVector: {e}") from e
+
 
 class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProtocolPrivate):
     def __init__(
@@ -401,6 +634,8 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
             dimension=vector_store.embedding_dimension,
             conn=self.conn,
             kvstore=self.kvstore,
+            distance_metric=self.config.distance_metric,
+            vector_index=self.config.vector_index,
         )
         await pgvector_index.initialize()
         index = VectorStoreWithIndex(vector_store, index=pgvector_index, inference_api=self.inference_api)
@@ -427,7 +662,12 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
 
         # Create and cache the PGVector index table for the vector DB
         pgvector_index = PGVectorIndex(
-            vector_store=vector_store,
+            vector_store=vector_store,
+            dimension=vector_store.embedding_dimension,
+            conn=self.conn,
+            kvstore=self.kvstore,
+            distance_metric=self.config.distance_metric,
+            vector_index=self.config.vector_index,
         )
         await pgvector_index.initialize()
         index = VectorStoreWithIndex(vector_store, index=pgvector_index, inference_api=self.inference_api)
@@ -444,6 +684,9 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
             raise RuntimeError("KVStore not initialized. Call initialize() before unregistering vector stores.")
         await self.kvstore.delete(key=f"{VECTOR_DBS_PREFIX}{vector_store_id}")
 
+        # Delete vector store metadata from PGVector metadata_store table
+        remove_vector_store_metadata(self.conn, vector_store_id)
+
     async def insert_chunks(
         self, vector_store_id: str, chunks: list[EmbeddedChunk], ttl_seconds: int | None = None
     ) -> None:
@@ -470,7 +713,13 @@ class PGVectorVectorIOAdapter(OpenAIVectorStoreMixin, VectorIO, VectorStoresProt
             raise VectorStoreNotFoundError(vector_store_id)
 
         vector_store = VectorStore.model_validate_json(vector_store_data)
-        index = PGVectorIndex(
+        index = PGVectorIndex(
+            vector_store,
+            vector_store.embedding_dimension,
+            self.conn,
+            distance_metric=self.config.distance_metric,
+            vector_index=self.config.vector_index,
+        )
         await index.initialize()
         self.cache[vector_store_id] = VectorStoreWithIndex(vector_store, index, self.inference_api)
         return self.cache[vector_store_id]
```
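The new index helpers interpolate the table name, operator class, and tuning parameters directly into the DDL they execute. As a rough preview of what they emit, here is a minimal sketch; the table name and the m, ef_construction, and lists values are illustrative assumptions, not defaults taken from this package's PGVectorIndexConfig:

```python
# Sketch of the DDL emitted by create_hnsw_vector_index / create_ivfflat_vector_index
# above, for the COSINE metric. Table name and tuning values are hypothetical.
table_name = "vector_store_chunks"
operator_class = "vector_cosine_ops"  # PGVECTOR_DISTANCE_METRIC_TO_INDEX_OPERATOR_CLASS["COSINE"]
m, ef_construction, lists = 16, 64, 100

hnsw_ddl = (
    f"CREATE INDEX IF NOT EXISTS {table_name}_hnsw_idx "
    f"ON {table_name} USING hnsw(embedding {operator_class}) "
    f"WITH (m = {m}, ef_construction = {ef_construction});"
)
ivfflat_ddl = (
    f"CREATE INDEX IF NOT EXISTS {table_name}_ivfflat_idx "
    f"ON {table_name} USING ivfflat(embedding {operator_class}) "
    f"WITH (lists = {lists});"
)

print(hnsw_ddl)
print(ivfflat_ddl)
# Per the diff, IVFFlat creation is deferred until the table holds at least
# lists * 1000 rows, so with lists = 100 it is first attempted at 100,000 records.
print(f"IVFFlat deferred until {lists * 1000} rows")
```

Both paths also skip index creation entirely when the embedding dimension exceeds 2,000, pgvector's limit for HNSW and IVFFlat indexes on vector columns.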
llama_stack/providers/remote/vector_io/qdrant/qdrant.py

```diff
@@ -16,6 +16,7 @@ from qdrant_client.models import PointStruct
 from llama_stack.core.storage.kvstore import kvstore_impl
 from llama_stack.log import get_logger
 from llama_stack.providers.inline.vector_io.qdrant import QdrantVectorIOConfig as InlineQdrantVectorIOConfig
+from llama_stack.providers.utils.inference.prompt_adapter import interleaved_content_as_str
 from llama_stack.providers.utils.memory.openai_vector_store_mixin import OpenAIVectorStoreMixin
 from llama_stack.providers.utils.memory.vector_store import ChunkForDeletion, EmbeddingIndex, VectorStoreWithIndex
 from llama_stack.providers.utils.vector_io.vector_utils import load_embedded_chunk_with_backward_compat
@@ -80,11 +81,16 @@ class QdrantIndex(EmbeddingIndex):
         points = []
         for chunk in chunks:
             chunk_id = chunk.chunk_id
+            content_text = interleaved_content_as_str(chunk.content)
             points.append(
                 PointStruct(
                     id=convert_id(chunk_id),
-                    vector=chunk.embedding,
-                    payload={
+                    vector=chunk.embedding,
+                    payload={
+                        "chunk_content": chunk.model_dump(),
+                        "content_text": content_text,
+                        CHUNK_ID_KEY: chunk_id,
+                    },
                 )
             )
 
@@ -144,32 +150,32 @@ class QdrantIndex(EmbeddingIndex):
             QueryChunksResponse with chunks and scores matching the keyword query
         """
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Use scroll for keyword-only search since query_points requires a query vector
+            # Scroll allows filtering without a query vector
+            query_words = query_string.lower().split()
+            if not query_words:
+                return QueryChunksResponse(chunks=[], scores=[])
+            scroll_result = await self.client.scroll(
+                collection_name=self.collection_name,
+                scroll_filter=models.Filter(
+                    should=[
+                        models.FieldCondition(key="content_text", match=models.MatchText(text=word))
+                        for word in query_words
+                    ]
+                ),
+                limit=k,
+                with_payload=True,
+                with_vectors=False,
+            )
+            results = scroll_result[0]
         except Exception as e:
             log.error(f"Error querying keyword search in Qdrant collection {self.collection_name}: {e}")
             raise
 
         chunks, scores = [], []
         for point in results:
-            if not isinstance(point, models.ScoredPoint):
-                raise RuntimeError(f"Expected ScoredPoint from Qdrant query, got {type(point).__name__}")
             if point.payload is None:
-                raise RuntimeError("Qdrant
+                raise RuntimeError("Qdrant scroll returned point with no payload")
 
             try:
                 chunk = load_embedded_chunk_with_backward_compat(point.payload["chunk_content"])
@@ -182,8 +188,13 @@ class QdrantIndex(EmbeddingIndex):
                 )
                 continue
 
+            # For keyword search, use a fixed score of 1.0 since we're not doing vector similarity
+            score = 1.0
+            if score < score_threshold:
+                continue
+
             chunks.append(chunk)
-            scores.append(
+            scores.append(score)
 
         return QueryChunksResponse(chunks=chunks, scores=scores)
 
@@ -214,22 +225,35 @@ class QdrantIndex(EmbeddingIndex):
             QueryChunksResponse with filtered vector search results
         """
         try:
-
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
+            query_words = query_string.lower().split()
+            if not query_words:
+                # If no words, just do vector search without keyword filter
+                results = (
+                    await self.client.query_points(
+                        collection_name=self.collection_name,
+                        query=embedding.tolist(),
+                        limit=k,
+                        with_payload=True,
+                        score_threshold=score_threshold,
+                    )
+                ).points
+            else:
+                # Use should to match any of the query words
+                results = (
+                    await self.client.query_points(
+                        collection_name=self.collection_name,
+                        query=embedding.tolist(),
+                        query_filter=models.Filter(
+                            should=[
+                                models.FieldCondition(key="content_text", match=models.MatchText(text=word))
+                                for word in query_words
+                            ]
+                        ),
+                        limit=k,
+                        with_payload=True,
+                        score_threshold=score_threshold,
+                    )
+                ).points
         except Exception as e:
             log.error(f"Error querying hybrid search in Qdrant collection {self.collection_name}: {e}")
             raise
```
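The keyword path replaces a vector query with a payload filter over the content_text field written at insert time. A minimal sketch of the filter construction using qdrant-client's public models (the query string is an arbitrary example):

```python
from qdrant_client import models

# Build the should-match-any-word filter used by both the keyword and hybrid paths.
query_string = "distributed vector search"
query_words = query_string.lower().split()

keyword_filter = models.Filter(
    should=[
        models.FieldCondition(key="content_text", match=models.MatchText(text=word))
        for word in query_words
    ]
)

# The keyword-only path passes this as scroll_filter= to client.scroll() (no query
# vector, fixed score of 1.0); the hybrid path passes it as query_filter= to
# client.query_points() alongside the embedding.
print(keyword_filter)
```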
@@ -49,9 +49,9 @@ def create_bedrock_client(config: BedrockBaseConfig, service_name: str = "bedroc
|
|
|
49
49
|
boto3_config = Config(**config_args)
|
|
50
50
|
|
|
51
51
|
session_args = {
|
|
52
|
-
"aws_access_key_id": config.aws_access_key_id,
|
|
53
|
-
"aws_secret_access_key": config.aws_secret_access_key,
|
|
54
|
-
"aws_session_token": config.aws_session_token,
|
|
52
|
+
"aws_access_key_id": config.aws_access_key_id.get_secret_value(),
|
|
53
|
+
"aws_secret_access_key": config.aws_secret_access_key.get_secret_value(),
|
|
54
|
+
"aws_session_token": config.aws_session_token.get_secret_value() if config.aws_session_token else None,
|
|
55
55
|
"region_name": config.region_name,
|
|
56
56
|
"profile_name": config.profile_name,
|
|
57
57
|
"session_ttl": config.session_ttl,
|
|
llama_stack/providers/utils/bedrock/config.py

```diff
@@ -6,23 +6,23 @@
 
 import os
 
-from pydantic import Field
+from pydantic import Field, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 
 
 class BedrockBaseConfig(RemoteInferenceProviderConfig):
     auth_credential: None = Field(default=None, exclude=True)
-    aws_access_key_id:
-        default_factory=lambda: os.getenv("AWS_ACCESS_KEY_ID"),
+    aws_access_key_id: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_ACCESS_KEY_ID")) else None,
         description="The AWS access key to use. Default use environment variable: AWS_ACCESS_KEY_ID",
     )
-    aws_secret_access_key:
-        default_factory=lambda: os.getenv("AWS_SECRET_ACCESS_KEY"),
+    aws_secret_access_key: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_SECRET_ACCESS_KEY")) else None,
         description="The AWS secret access key to use. Default use environment variable: AWS_SECRET_ACCESS_KEY",
     )
-    aws_session_token:
-        default_factory=lambda: os.getenv("AWS_SESSION_TOKEN"),
+    aws_session_token: SecretStr | None = Field(
+        default_factory=lambda: SecretStr(val) if (val := os.getenv("AWS_SESSION_TOKEN")) else None,
         description="The AWS session token to use. Default use environment variable: AWS_SESSION_TOKEN",
     )
     region_name: str | None = Field(
```
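Both Bedrock changes follow pydantic's SecretStr pattern: credentials are wrapped at config time so they are masked in repr() and log output, and unwrapped with get_secret_value() only when the boto3 session is built. A standalone sketch of that behavior (the credential value is a placeholder, not a real key):

```python
import os
from pydantic import SecretStr

os.environ["AWS_ACCESS_KEY_ID"] = "AKIAEXAMPLEPLACEHOLDER"  # illustrative only

# The new default_factory pattern: wrap the env var if set, otherwise None.
val = os.getenv("AWS_ACCESS_KEY_ID")
aws_access_key_id = SecretStr(val) if val else None

print(aws_access_key_id)  # prints ********** (masked)
if aws_access_key_id is not None:
    # Raw value, as create_bedrock_client now reads it when building session_args.
    print(aws_access_key_id.get_secret_value())
```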
llama_stack/providers/inline/post_training/torchtune/common/utils.py

```diff
@@ -5,30 +5,5 @@
 # the root directory of this source tree.
 
 from llama_stack.models.llama.sku_list import all_registered_models
-from llama_stack.models.llama.sku_types import *  # noqa: F403
-
-
-def is_supported_safety_model(model: Model) -> bool:
-    if model.quantization_format != CheckpointQuantizationFormat.bf16:
-        return False
-
-    model_id = model.core_model_id
-    return model_id in [
-        CoreModelId.llama_guard_3_8b,
-        CoreModelId.llama_guard_3_1b,
-        CoreModelId.llama_guard_3_11b_vision,
-    ]
-
-
-def supported_inference_models() -> list[Model]:
-    return [
-        m
-        for m in all_registered_models()
-        if (
-            m.model_family in {ModelFamily.llama3_1, ModelFamily.llama3_2, ModelFamily.llama3_3, ModelFamily.llama4}
-            or is_supported_safety_model(m)
-        )
-    ]
-
 
 ALL_HUGGINGFACE_REPOS_TO_MODEL_DESCRIPTOR = {m.huggingface_repo: m.descriptor() for m in all_registered_models()}
```
llama_stack/providers/utils/inference/embedding_mixin.py

```diff
@@ -23,6 +23,7 @@ from llama_stack_api import (
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
     OpenAIEmbeddingUsage,
+    validate_embeddings_input_is_text,
 )
 
 EMBEDDING_MODELS: dict[str, "SentenceTransformer"] = {}
@@ -41,6 +42,9 @@ class SentenceTransformerEmbeddingMixin:
         self,
         params: OpenAIEmbeddingsRequestWithExtraBody,
     ) -> OpenAIEmbeddingsResponse:
+        # Validate that input contains only text, not token arrays
+        validate_embeddings_input_is_text(params)
+
         # Convert input to list format if it's a single string
         input_list = [params.input] if isinstance(params.input, str) else params.input
         if not input_list:
```