llama-stack 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/__init__.py +0 -5
- llama_stack/cli/llama.py +3 -3
- llama_stack/cli/stack/_list_deps.py +12 -23
- llama_stack/cli/stack/list_stacks.py +37 -18
- llama_stack/cli/stack/run.py +121 -11
- llama_stack/cli/stack/utils.py +0 -127
- llama_stack/core/access_control/access_control.py +69 -28
- llama_stack/core/access_control/conditions.py +15 -5
- llama_stack/core/admin.py +267 -0
- llama_stack/core/build.py +6 -74
- llama_stack/core/client.py +1 -1
- llama_stack/core/configure.py +6 -6
- llama_stack/core/conversations/conversations.py +28 -25
- llama_stack/core/datatypes.py +271 -79
- llama_stack/core/distribution.py +15 -16
- llama_stack/core/external.py +3 -3
- llama_stack/core/inspect.py +98 -15
- llama_stack/core/library_client.py +73 -61
- llama_stack/core/prompts/prompts.py +12 -11
- llama_stack/core/providers.py +17 -11
- llama_stack/core/resolver.py +65 -56
- llama_stack/core/routers/__init__.py +8 -12
- llama_stack/core/routers/datasets.py +1 -4
- llama_stack/core/routers/eval_scoring.py +7 -4
- llama_stack/core/routers/inference.py +55 -271
- llama_stack/core/routers/safety.py +52 -24
- llama_stack/core/routers/tool_runtime.py +6 -48
- llama_stack/core/routers/vector_io.py +130 -51
- llama_stack/core/routing_tables/benchmarks.py +24 -20
- llama_stack/core/routing_tables/common.py +1 -4
- llama_stack/core/routing_tables/datasets.py +22 -22
- llama_stack/core/routing_tables/models.py +119 -6
- llama_stack/core/routing_tables/scoring_functions.py +7 -7
- llama_stack/core/routing_tables/shields.py +1 -2
- llama_stack/core/routing_tables/toolgroups.py +17 -7
- llama_stack/core/routing_tables/vector_stores.py +51 -16
- llama_stack/core/server/auth.py +5 -3
- llama_stack/core/server/auth_providers.py +36 -20
- llama_stack/core/server/fastapi_router_registry.py +84 -0
- llama_stack/core/server/quota.py +2 -2
- llama_stack/core/server/routes.py +79 -27
- llama_stack/core/server/server.py +102 -87
- llama_stack/core/stack.py +201 -58
- llama_stack/core/storage/datatypes.py +26 -3
- llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
- llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
- llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
- llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
- llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
- llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
- llama_stack/core/storage/sqlstore/__init__.py +17 -0
- llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/config.py +8 -2
- llama_stack/core/utils/config_resolution.py +32 -29
- llama_stack/core/utils/context.py +4 -10
- llama_stack/core/utils/exec.py +9 -0
- llama_stack/core/utils/type_inspection.py +45 -0
- llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/dell/dell.py +2 -2
- llama_stack/distributions/dell/run-with-safety.yaml +3 -2
- llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
- llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
- llama_stack/distributions/nvidia/nvidia.py +1 -1
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
- llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
- llama_stack/distributions/oci/config.yaml +134 -0
- llama_stack/distributions/oci/oci.py +108 -0
- llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
- llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
- llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/starter/starter.py +8 -5
- llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/template.py +13 -69
- llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/watsonx/watsonx.py +1 -1
- llama_stack/log.py +28 -11
- llama_stack/models/llama/checkpoint.py +6 -6
- llama_stack/models/llama/hadamard_utils.py +2 -0
- llama_stack/models/llama/llama3/generation.py +3 -1
- llama_stack/models/llama/llama3/interface.py +2 -5
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
- llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
- llama_stack/models/llama/llama3/tool_utils.py +2 -1
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
- llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
- llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
- llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
- llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
- llama_stack/providers/inline/batches/reference/__init__.py +2 -4
- llama_stack/providers/inline/batches/reference/batches.py +78 -60
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
- llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
- llama_stack/providers/inline/files/localfs/files.py +37 -28
- llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
- llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
- llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
- llama_stack/providers/inline/post_training/common/validator.py +1 -5
- llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
- llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
- llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
- llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
- llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
- llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/faiss.py +43 -28
- llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +40 -33
- llama_stack/providers/registry/agents.py +7 -3
- llama_stack/providers/registry/batches.py +1 -1
- llama_stack/providers/registry/datasetio.py +1 -1
- llama_stack/providers/registry/eval.py +1 -1
- llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
- llama_stack/providers/registry/files.py +11 -2
- llama_stack/providers/registry/inference.py +22 -3
- llama_stack/providers/registry/post_training.py +1 -1
- llama_stack/providers/registry/safety.py +1 -1
- llama_stack/providers/registry/scoring.py +1 -1
- llama_stack/providers/registry/tool_runtime.py +2 -2
- llama_stack/providers/registry/vector_io.py +7 -7
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
- llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
- llama_stack/providers/remote/files/openai/__init__.py +19 -0
- llama_stack/providers/remote/files/openai/config.py +28 -0
- llama_stack/providers/remote/files/openai/files.py +253 -0
- llama_stack/providers/remote/files/s3/files.py +52 -30
- llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
- llama_stack/providers/remote/inference/anthropic/config.py +1 -1
- llama_stack/providers/remote/inference/azure/azure.py +1 -3
- llama_stack/providers/remote/inference/azure/config.py +8 -7
- llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
- llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
- llama_stack/providers/remote/inference/bedrock/config.py +24 -3
- llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
- llama_stack/providers/remote/inference/cerebras/config.py +12 -5
- llama_stack/providers/remote/inference/databricks/config.py +13 -6
- llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
- llama_stack/providers/remote/inference/fireworks/config.py +5 -5
- llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
- llama_stack/providers/remote/inference/gemini/config.py +1 -1
- llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
- llama_stack/providers/remote/inference/groq/config.py +5 -5
- llama_stack/providers/remote/inference/groq/groq.py +1 -1
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
- llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
- llama_stack/providers/remote/inference/nvidia/config.py +21 -11
- llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
- llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
- llama_stack/providers/remote/inference/oci/__init__.py +17 -0
- llama_stack/providers/remote/inference/oci/auth.py +79 -0
- llama_stack/providers/remote/inference/oci/config.py +75 -0
- llama_stack/providers/remote/inference/oci/oci.py +162 -0
- llama_stack/providers/remote/inference/ollama/config.py +7 -5
- llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
- llama_stack/providers/remote/inference/openai/config.py +4 -4
- llama_stack/providers/remote/inference/openai/openai.py +1 -1
- llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
- llama_stack/providers/remote/inference/passthrough/config.py +5 -10
- llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
- llama_stack/providers/remote/inference/runpod/config.py +12 -5
- llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
- llama_stack/providers/remote/inference/sambanova/config.py +5 -5
- llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
- llama_stack/providers/remote/inference/tgi/config.py +7 -6
- llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
- llama_stack/providers/remote/inference/together/config.py +5 -5
- llama_stack/providers/remote/inference/together/together.py +15 -12
- llama_stack/providers/remote/inference/vertexai/config.py +1 -1
- llama_stack/providers/remote/inference/vllm/config.py +5 -5
- llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
- llama_stack/providers/remote/inference/watsonx/config.py +4 -4
- llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
- llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
- llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
- llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
- llama_stack/providers/remote/safety/bedrock/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
- llama_stack/providers/remote/safety/sambanova/config.py +1 -1
- llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
- llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/chroma/chroma.py +125 -20
- llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/milvus.py +27 -21
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +26 -18
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +141 -24
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +26 -21
- llama_stack/providers/utils/common/data_schema_validator.py +1 -5
- llama_stack/providers/utils/files/form_data.py +1 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
- llama_stack/providers/utils/inference/inference_store.py +12 -21
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
- llama_stack/providers/utils/inference/model_registry.py +1 -3
- llama_stack/providers/utils/inference/openai_compat.py +44 -1171
- llama_stack/providers/utils/inference/openai_mixin.py +68 -42
- llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
- llama_stack/providers/utils/inference/stream_utils.py +23 -0
- llama_stack/providers/utils/memory/__init__.py +2 -0
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
- llama_stack/providers/utils/memory/vector_store.py +39 -38
- llama_stack/providers/utils/pagination.py +1 -1
- llama_stack/providers/utils/responses/responses_store.py +15 -25
- llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
- llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
- llama_stack/providers/utils/tools/mcp.py +93 -11
- llama_stack/telemetry/constants.py +27 -0
- llama_stack/telemetry/helpers.py +43 -0
- llama_stack/testing/api_recorder.py +25 -16
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/METADATA +56 -131
- llama_stack-0.4.0.dist-info/RECORD +588 -0
- llama_stack-0.4.0.dist-info/top_level.txt +2 -0
- llama_stack_api/__init__.py +945 -0
- llama_stack_api/admin/__init__.py +45 -0
- llama_stack_api/admin/api.py +72 -0
- llama_stack_api/admin/fastapi_routes.py +117 -0
- llama_stack_api/admin/models.py +113 -0
- llama_stack_api/agents.py +173 -0
- llama_stack_api/batches/__init__.py +40 -0
- llama_stack_api/batches/api.py +53 -0
- llama_stack_api/batches/fastapi_routes.py +113 -0
- llama_stack_api/batches/models.py +78 -0
- llama_stack_api/benchmarks/__init__.py +43 -0
- llama_stack_api/benchmarks/api.py +39 -0
- llama_stack_api/benchmarks/fastapi_routes.py +109 -0
- llama_stack_api/benchmarks/models.py +109 -0
- {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
- {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
- {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
- llama_stack_api/common/responses.py +77 -0
- {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
- {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
- llama_stack_api/connectors.py +146 -0
- {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
- {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
- llama_stack_api/datasets/__init__.py +61 -0
- llama_stack_api/datasets/api.py +35 -0
- llama_stack_api/datasets/fastapi_routes.py +104 -0
- llama_stack_api/datasets/models.py +152 -0
- {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
- {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
- llama_stack_api/file_processors/__init__.py +27 -0
- llama_stack_api/file_processors/api.py +64 -0
- llama_stack_api/file_processors/fastapi_routes.py +78 -0
- llama_stack_api/file_processors/models.py +42 -0
- llama_stack_api/files/__init__.py +35 -0
- llama_stack_api/files/api.py +51 -0
- llama_stack_api/files/fastapi_routes.py +124 -0
- llama_stack_api/files/models.py +107 -0
- {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
- llama_stack_api/inspect_api/__init__.py +37 -0
- llama_stack_api/inspect_api/api.py +25 -0
- llama_stack_api/inspect_api/fastapi_routes.py +76 -0
- llama_stack_api/inspect_api/models.py +28 -0
- {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
- llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
- llama_stack_api/internal/sqlstore.py +79 -0
- {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
- {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
- {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
- {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
- llama_stack_api/providers/__init__.py +33 -0
- llama_stack_api/providers/api.py +16 -0
- llama_stack_api/providers/fastapi_routes.py +57 -0
- llama_stack_api/providers/models.py +24 -0
- {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
- {llama_stack/apis → llama_stack_api}/resource.py +1 -1
- llama_stack_api/router_utils.py +160 -0
- {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
- {llama_stack → llama_stack_api}/schema_utils.py +94 -4
- {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
- {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
- {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
- {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
- {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
- {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
- llama_stack/apis/agents/agents.py +0 -894
- llama_stack/apis/batches/__init__.py +0 -9
- llama_stack/apis/batches/batches.py +0 -100
- llama_stack/apis/benchmarks/__init__.py +0 -7
- llama_stack/apis/benchmarks/benchmarks.py +0 -108
- llama_stack/apis/common/responses.py +0 -36
- llama_stack/apis/conversations/__init__.py +0 -31
- llama_stack/apis/datasets/datasets.py +0 -251
- llama_stack/apis/datatypes.py +0 -160
- llama_stack/apis/eval/__init__.py +0 -7
- llama_stack/apis/files/__init__.py +0 -7
- llama_stack/apis/files/files.py +0 -199
- llama_stack/apis/inference/__init__.py +0 -7
- llama_stack/apis/inference/event_logger.py +0 -43
- llama_stack/apis/inspect/__init__.py +0 -7
- llama_stack/apis/inspect/inspect.py +0 -94
- llama_stack/apis/models/__init__.py +0 -7
- llama_stack/apis/post_training/__init__.py +0 -7
- llama_stack/apis/prompts/__init__.py +0 -9
- llama_stack/apis/providers/__init__.py +0 -7
- llama_stack/apis/providers/providers.py +0 -69
- llama_stack/apis/safety/__init__.py +0 -7
- llama_stack/apis/scoring/__init__.py +0 -7
- llama_stack/apis/scoring_functions/__init__.py +0 -7
- llama_stack/apis/shields/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
- llama_stack/apis/telemetry/__init__.py +0 -7
- llama_stack/apis/telemetry/telemetry.py +0 -423
- llama_stack/apis/tools/__init__.py +0 -8
- llama_stack/apis/vector_io/__init__.py +0 -7
- llama_stack/apis/vector_stores/__init__.py +0 -7
- llama_stack/core/server/tracing.py +0 -80
- llama_stack/core/ui/app.py +0 -55
- llama_stack/core/ui/modules/__init__.py +0 -5
- llama_stack/core/ui/modules/api.py +0 -32
- llama_stack/core/ui/modules/utils.py +0 -42
- llama_stack/core/ui/page/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/datasets.py +0 -18
- llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
- llama_stack/core/ui/page/distribution/models.py +0 -18
- llama_stack/core/ui/page/distribution/providers.py +0 -27
- llama_stack/core/ui/page/distribution/resources.py +0 -48
- llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
- llama_stack/core/ui/page/distribution/shields.py +0 -19
- llama_stack/core/ui/page/evaluations/__init__.py +0 -5
- llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
- llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
- llama_stack/core/ui/page/playground/__init__.py +0 -5
- llama_stack/core/ui/page/playground/chat.py +0 -130
- llama_stack/core/ui/page/playground/tools.py +0 -352
- llama_stack/distributions/dell/build.yaml +0 -33
- llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
- llama_stack/distributions/nvidia/build.yaml +0 -29
- llama_stack/distributions/open-benchmark/build.yaml +0 -36
- llama_stack/distributions/postgres-demo/__init__.py +0 -7
- llama_stack/distributions/postgres-demo/build.yaml +0 -23
- llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
- llama_stack/distributions/starter/build.yaml +0 -61
- llama_stack/distributions/starter-gpu/build.yaml +0 -61
- llama_stack/distributions/watsonx/build.yaml +0 -33
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
- llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
- llama_stack/providers/inline/telemetry/__init__.py +0 -5
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
- llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
- llama_stack/providers/remote/inference/bedrock/models.py +0 -29
- llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
- llama_stack/providers/utils/sqlstore/__init__.py +0 -5
- llama_stack/providers/utils/sqlstore/api.py +0 -128
- llama_stack/providers/utils/telemetry/__init__.py +0 -5
- llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
- llama_stack/providers/utils/telemetry/tracing.py +0 -384
- llama_stack/strong_typing/__init__.py +0 -19
- llama_stack/strong_typing/auxiliary.py +0 -228
- llama_stack/strong_typing/classdef.py +0 -440
- llama_stack/strong_typing/core.py +0 -46
- llama_stack/strong_typing/deserializer.py +0 -877
- llama_stack/strong_typing/docstring.py +0 -409
- llama_stack/strong_typing/exception.py +0 -23
- llama_stack/strong_typing/inspection.py +0 -1085
- llama_stack/strong_typing/mapping.py +0 -40
- llama_stack/strong_typing/name.py +0 -182
- llama_stack/strong_typing/schema.py +0 -792
- llama_stack/strong_typing/serialization.py +0 -97
- llama_stack/strong_typing/serializer.py +0 -500
- llama_stack/strong_typing/slots.py +0 -27
- llama_stack/strong_typing/topological.py +0 -89
- llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
- llama_stack-0.3.4.dist-info/RECORD +0 -625
- llama_stack-0.3.4.dist-info/top_level.txt +0 -1
- /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
- /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
- /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/WHEEL +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
- {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
- {llama_stack/apis → llama_stack_api}/version.py +0 -0
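The module moves above imply an import-path migration for downstream code: the `llama_stack.apis.*` packages are replaced by the new top-level `llama_stack_api` package, and the kvstore/sqlstore utilities move from `llama_stack.providers.utils` to `llama_stack.core.storage`. A minimal before/after sketch; only the package paths come from the file list, and the specific symbols are illustrative assumptions:

```python
# 0.3.4-style imports (old layout), shown commented out:
# from llama_stack.apis.inference import Inference
# from llama_stack.providers.utils.kvstore.kvstore import KVStore

# 0.4.0-style imports (new layout). `Inference` is imported from llama_stack_api
# in the router diff below; the KVStore symbol name is an assumption beyond
# what the renamed file paths themselves show.
from llama_stack_api import Inference
from llama_stack.core.storage.kvstore.kvstore import KVStore
```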
llama_stack/core/routers/inference.py

@@ -6,8 +6,7 @@
 
 import asyncio
 import time
-from collections.abc import
-from datetime import UTC, datetime
+from collections.abc import AsyncIterator
 from typing import Annotated, Any
 
 from fastapi import Body
@@ -15,23 +14,24 @@ from openai.types.chat import ChatCompletionToolChoiceOptionParam as OpenAIChatC
 from openai.types.chat import ChatCompletionToolParam as OpenAIChatCompletionToolParam
 from pydantic import TypeAdapter
 
-from llama_stack.
-
-
-from llama_stack.
-from llama_stack.
-
-
-
-    CompletionMessage,
-    CompletionResponse,
-    CompletionResponseStreamChunk,
+from llama_stack.core.access_control.access_control import is_action_allowed
+from llama_stack.core.datatypes import ModelWithOwner
+from llama_stack.core.request_headers import get_authenticated_user
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.inference_store import InferenceStore
+from llama_stack_api import (
+    HealthResponse,
+    HealthStatus,
     Inference,
     ListOpenAIChatCompletionResponse,
-
+    ModelNotFoundError,
+    ModelType,
+    ModelTypeError,
     OpenAIAssistantMessageParam,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionContentPartImageParam,
+    OpenAIChatCompletionContentPartTextParam,
     OpenAIChatCompletionRequestWithExtraBody,
     OpenAIChatCompletionToolCall,
     OpenAIChatCompletionToolCallFunction,
@@ -43,25 +43,12 @@ from llama_stack.apis.inference import (
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
     OpenAIMessageParam,
+    OpenAITokenLogProb,
+    OpenAITopLogProb,
     Order,
-
-    ToolPromptFormat,
-)
-from llama_stack.apis.models import Model, ModelType
-from llama_stack.apis.telemetry import MetricEvent, MetricInResponse, Telemetry
-from llama_stack.core.access_control.access_control import is_action_allowed
-from llama_stack.core.datatypes import ModelWithOwner
-from llama_stack.core.request_headers import get_authenticated_user
-from llama_stack.log import get_logger
-from llama_stack.models.llama.llama3.chat_format import ChatFormat
-from llama_stack.models.llama.llama3.tokenizer import Tokenizer
-from llama_stack.providers.datatypes import (
-    HealthResponse,
-    HealthStatus,
+    RerankResponse,
     RoutingTable,
 )
-from llama_stack.providers.utils.inference.inference_store import InferenceStore
-from llama_stack.providers.utils.telemetry.tracing import enqueue_event, get_current_span
 
 logger = get_logger(name=__name__, category="core::routers")
 
@@ -72,16 +59,11 @@ class InferenceRouter(Inference):
     def __init__(
         self,
         routing_table: RoutingTable,
-        telemetry: Telemetry | None = None,
         store: InferenceStore | None = None,
     ) -> None:
         logger.debug("Initializing InferenceRouter")
         self.routing_table = routing_table
-        self.telemetry = telemetry
         self.store = store
-        if self.telemetry:
-            self.tokenizer = Tokenizer.get_instance()
-            self.formatter = ChatFormat(self.tokenizer)
 
     async def initialize(self) -> None:
         logger.debug("InferenceRouter.initialize")
@@ -107,83 +89,6 @@ class InferenceRouter(Inference):
             )
         await self.routing_table.register_model(model_id, provider_model_id, provider_id, metadata, model_type)
 
-    def _construct_metrics(
-        self,
-        prompt_tokens: int,
-        completion_tokens: int,
-        total_tokens: int,
-        fully_qualified_model_id: str,
-        provider_id: str,
-    ) -> list[MetricEvent]:
-        """Constructs a list of MetricEvent objects containing token usage metrics.
-
-        Args:
-            prompt_tokens: Number of tokens in the prompt
-            completion_tokens: Number of tokens in the completion
-            total_tokens: Total number of tokens used
-            fully_qualified_model_id:
-            provider_id: The provider identifier
-
-        Returns:
-            List of MetricEvent objects with token usage metrics
-        """
-        span = get_current_span()
-        if span is None:
-            logger.warning("No span found for token usage metrics")
-            return []
-
-        metrics = [
-            ("prompt_tokens", prompt_tokens),
-            ("completion_tokens", completion_tokens),
-            ("total_tokens", total_tokens),
-        ]
-        metric_events = []
-        for metric_name, value in metrics:
-            metric_events.append(
-                MetricEvent(
-                    trace_id=span.trace_id,
-                    span_id=span.span_id,
-                    metric=metric_name,
-                    value=value,
-                    timestamp=datetime.now(UTC),
-                    unit="tokens",
-                    attributes={
-                        "model_id": fully_qualified_model_id,
-                        "provider_id": provider_id,
-                    },
-                )
-            )
-        return metric_events
-
-    async def _compute_and_log_token_usage(
-        self,
-        prompt_tokens: int,
-        completion_tokens: int,
-        total_tokens: int,
-        model: Model,
-    ) -> list[MetricInResponse]:
-        metrics = self._construct_metrics(
-            prompt_tokens, completion_tokens, total_tokens, model.model_id, model.provider_id
-        )
-        if self.telemetry:
-            for metric in metrics:
-                enqueue_event(metric)
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
-    async def _count_tokens(
-        self,
-        messages: list[Message] | InterleavedContent,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> int | None:
-        if not hasattr(self, "formatter") or self.formatter is None:
-            return None
-
-        if isinstance(messages, list):
-            encoded = self.formatter.encode_dialog_prompt(messages, tool_prompt_format)
-        else:
-            encoded = self.formatter.encode_content(messages)
-        return len(encoded.tokens) if encoded and encoded.tokens else 0
-
     async def _get_model_provider(self, model_id: str, expected_model_type: str) -> tuple[Inference, str]:
         model = await self.routing_table.get_object_by_identifier("model", model_id)
         if model:
@@ -230,6 +135,17 @@ class InferenceRouter(Inference):
 
         return self.routing_table.impls_by_provider_id[provider_id], provider_resource_id
 
+    async def rerank(
+        self,
+        model: str,
+        query: str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam,
+        items: list[str | OpenAIChatCompletionContentPartTextParam | OpenAIChatCompletionContentPartImageParam],
+        max_num_results: int | None = None,
+    ) -> RerankResponse:
+        logger.debug(f"InferenceRouter.rerank: {model}")
+        provider, provider_resource_id = await self._get_model_provider(model, ModelType.rerank)
+        return await provider.rerank(provider_resource_id, query, items, max_num_results)
+
     async def openai_completion(
         self,
         params: Annotated[OpenAICompletionRequestWithExtraBody, Body(...)],
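The hunk above adds a router-level `rerank` entry point that resolves the model to a provider and forwards the call. A minimal usage sketch against that signature; the router instance, model identifier, and documents are placeholders and assume a model registered with `ModelType.rerank`:

```python
# Hypothetical caller; `router` is an already-initialized InferenceRouter from a
# running stack, and "my-reranker" is a placeholder model identifier.
async def rank_snippets(router) -> None:
    response = await router.rerank(
        model="my-reranker",
        query="How do I rotate my API keys?",
        items=[
            "Key rotation is performed from the admin console.",
            "The office is closed on public holidays.",
        ],
        max_num_results=1,
    )
    # RerankResponse is defined in llama_stack_api; its exact fields are not shown in this diff.
    print(response)
```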
@@ -243,26 +159,9 @@
 
         if params.stream:
             return await provider.openai_completion(params)
-        # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
-        # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
 
         response = await provider.openai_completion(params)
         response.model = request_model_id
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                fully_qualified_model_id=request_model_id,
-                provider_id=provider.__provider_id__,
-            )
-            for metric in metrics:
-                enqueue_event(metric)
-
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
         return response
 
     async def openai_chat_completion(
@@ -311,20 +210,6 @@
         if self.store:
             asyncio.create_task(self.store.store_chat_completion(response, params.messages))
 
-        if self.telemetry:
-            metrics = self._construct_metrics(
-                prompt_tokens=response.usage.prompt_tokens,
-                completion_tokens=response.usage.completion_tokens,
-                total_tokens=response.usage.total_tokens,
-                fully_qualified_model_id=request_model_id,
-                provider_id=provider.__provider_id__,
-            )
-            for metric in metrics:
-                enqueue_event(metric)
-            # these metrics will show up in the client response.
-            response.metrics = (
-                metrics if not hasattr(response, "metrics") or response.metrics is None else response.metrics + metrics
-            )
         return response
 
     async def openai_embeddings(
@@ -392,121 +277,6 @@
             )
         return health_statuses
 
-    async def stream_tokens_and_compute_metrics(
-        self,
-        response,
-        prompt_tokens,
-        fully_qualified_model_id: str,
-        provider_id: str,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ) -> AsyncGenerator[ChatCompletionResponseStreamChunk, None] | AsyncGenerator[CompletionResponseStreamChunk, None]:
-        completion_text = ""
-        async for chunk in response:
-            complete = False
-            if hasattr(chunk, "event"):  # only ChatCompletions have .event
-                if chunk.event.event_type == ChatCompletionResponseEventType.progress:
-                    if chunk.event.delta.type == "text":
-                        completion_text += chunk.event.delta.text
-                if chunk.event.event_type == ChatCompletionResponseEventType.complete:
-                    complete = True
-                    completion_tokens = await self._count_tokens(
-                        [
-                            CompletionMessage(
-                                content=completion_text,
-                                stop_reason=StopReason.end_of_turn,
-                            )
-                        ],
-                        tool_prompt_format=tool_prompt_format,
-                    )
-            else:
-                if hasattr(chunk, "delta"):
-                    completion_text += chunk.delta
-                if hasattr(chunk, "stop_reason") and chunk.stop_reason and self.telemetry:
-                    complete = True
-                    completion_tokens = await self._count_tokens(completion_text)
-            # if we are done receiving tokens
-            if complete:
-                total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-                # Create a separate span for streaming completion metrics
-                if self.telemetry:
-                    # Log metrics in the new span context
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=completion_tokens,
-                        total_tokens=total_tokens,
-                        fully_qualified_model_id=fully_qualified_model_id,
-                        provider_id=provider_id,
-                    )
-                    for metric in completion_metrics:
-                        if metric.metric in [
-                            "completion_tokens",
-                            "total_tokens",
-                        ]:  # Only log completion and total tokens
-                            enqueue_event(metric)
-
-                    # Return metrics in response
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-                else:
-                    # Fallback if no telemetry
-                    completion_metrics = self._construct_metrics(
-                        prompt_tokens or 0,
-                        completion_tokens or 0,
-                        total_tokens,
-                        fully_qualified_model_id=fully_qualified_model_id,
-                        provider_id=provider_id,
-                    )
-                    async_metrics = [
-                        MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics
-                    ]
-                    chunk.metrics = async_metrics if chunk.metrics is None else chunk.metrics + async_metrics
-            yield chunk
-
-    async def count_tokens_and_compute_metrics(
-        self,
-        response: ChatCompletionResponse | CompletionResponse,
-        prompt_tokens,
-        fully_qualified_model_id: str,
-        provider_id: str,
-        tool_prompt_format: ToolPromptFormat | None = None,
-    ):
-        if isinstance(response, ChatCompletionResponse):
-            content = [response.completion_message]
-        else:
-            content = response.content
-        completion_tokens = await self._count_tokens(messages=content, tool_prompt_format=tool_prompt_format)
-        total_tokens = (prompt_tokens or 0) + (completion_tokens or 0)
-
-        # Create a separate span for completion metrics
-        if self.telemetry:
-            # Log metrics in the new span context
-            completion_metrics = self._construct_metrics(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                fully_qualified_model_id=fully_qualified_model_id,
-                provider_id=provider_id,
-            )
-            for metric in completion_metrics:
-                if metric.metric in ["completion_tokens", "total_tokens"]:  # Only log completion and total tokens
-                    enqueue_event(metric)
-
-            # Return metrics in response
-            return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in completion_metrics]
-
-        # Fallback if no telemetry
-        metrics = self._construct_metrics(
-            prompt_tokens or 0,
-            completion_tokens or 0,
-            total_tokens,
-            fully_qualified_model_id=fully_qualified_model_id,
-            provider_id=provider_id,
-        )
-        return [MetricInResponse(metric=metric.metric, value=metric.value) for metric in metrics]
-
     async def stream_tokens_and_compute_metrics_openai_chat(
         self,
         response: AsyncIterator[OpenAIChatCompletionChunk],
@@ -574,8 +344,34 @@
                        )
                    if choice_delta.finish_reason:
                        current_choice_data["finish_reason"] = choice_delta.finish_reason
+
+                    # Convert logprobs from chat completion format to responses format
+                    # Chat completion returns list of ChatCompletionTokenLogprob, but
+                    # expecting list of OpenAITokenLogProb in OpenAIChoice
                    if choice_delta.logprobs and choice_delta.logprobs.content:
-
+                        converted_logprobs = []
+                        for token_logprob in choice_delta.logprobs.content:
+                            top_logprobs = None
+                            if token_logprob.top_logprobs:
+                                top_logprobs = [
+                                    OpenAITopLogProb(
+                                        token=tlp.token,
+                                        bytes=tlp.bytes,
+                                        logprob=tlp.logprob,
+                                    )
+                                    for tlp in token_logprob.top_logprobs
+                                ]
+                            converted_logprobs.append(
+                                OpenAITokenLogProb(
+                                    token=token_logprob.token,
+                                    bytes=token_logprob.bytes,
+                                    logprob=token_logprob.logprob,
+                                    top_logprobs=top_logprobs,
+                                )
+                            )
+                        # Update choice delta with the newly formatted logprobs object
+                        choice_delta.logprobs.content = converted_logprobs
+                        current_choice_data["logprobs_content_parts"].extend(converted_logprobs)
 
                # Compute metrics on final chunk
                if chunk.choices and chunk.choices[0].finish_reason:
@@ -583,18 +379,6 @@
                    for choice_data in choices_data.values():
                        completion_text += "".join(choice_data["content_parts"])
 
-                    # Add metrics to the chunk
-                    if self.telemetry and hasattr(chunk, "usage") and chunk.usage:
-                        metrics = self._construct_metrics(
-                            prompt_tokens=chunk.usage.prompt_tokens,
-                            completion_tokens=chunk.usage.completion_tokens,
-                            total_tokens=chunk.usage.total_tokens,
-                            model_id=fully_qualified_model_id,
-                            provider_id=provider_id,
-                        )
-                        for metric in metrics:
-                            enqueue_event(metric)
-
                yield chunk
        finally:
            # Store the final assembled completion
llama_stack/core/routers/safety.py

@@ -6,23 +6,26 @@
 
 from typing import Any
 
-from
-
-from llama_stack.
-from llama_stack.apis.shields import Shield
+from opentelemetry import trace
+
+from llama_stack.core.datatypes import SafetyConfig
 from llama_stack.log import get_logger
-from llama_stack.
+from llama_stack.telemetry.helpers import safety_request_span_attributes, safety_span_name
+from llama_stack_api import ModerationObject, OpenAIMessageParam, RoutingTable, RunShieldResponse, Safety, Shield
 
 logger = get_logger(name=__name__, category="core::routers")
+tracer = trace.get_tracer(__name__)
 
 
 class SafetyRouter(Safety):
     def __init__(
         self,
         routing_table: RoutingTable,
+        safety_config: SafetyConfig | None = None,
     ) -> None:
         logger.debug("Initializing SafetyRouter")
         self.routing_table = routing_table
+        self.safety_config = safety_config
 
     async def initialize(self) -> None:
         logger.debug("SafetyRouter.initialize")
@@ -49,37 +52,62 @@ class SafetyRouter(Safety):
     async def run_shield(
         self,
         shield_id: str,
-        messages: list[
+        messages: list[OpenAIMessageParam],
         params: dict[str, Any] = None,
     ) -> RunShieldResponse:
-
-
-
-
-
-
-
+        with tracer.start_as_current_span(name=safety_span_name(shield_id)):
+            logger.debug(f"SafetyRouter.run_shield: {shield_id}")
+            provider = await self.routing_table.get_provider_impl(shield_id)
+            response = await provider.run_shield(
+                shield_id=shield_id,
+                messages=messages,
+                params=params,
+            )
+
+            safety_request_span_attributes(shield_id, messages, response)
+            return response
 
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
-
-
-        list_shields_response = await self.routing_table.list_shields()
+    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
+        list_shields_response = await self.routing_table.list_shields()
+        shields = list_shields_response.data
 
-
+        selected_shield: Shield | None = None
+        provider_model: str | None = model
 
+        if model:
+            matches: list[Shield] = [s for s in shields if model == s.provider_resource_id]
             if not matches:
-                raise ValueError(
+                raise ValueError(
+                    f"No shield associated with provider_resource id {model}: choose from {[s.provider_resource_id for s in shields]}"
+                )
             if len(matches) > 1:
-                raise ValueError(
-
-
-
+                raise ValueError(
+                    f"Multiple shields associated with provider_resource id {model}: matched shields {[s.identifier for s in matches]}"
+                )
+            selected_shield = matches[0]
+        else:
+            default_shield_id = self.safety_config.default_shield_id if self.safety_config else None
+            if not default_shield_id:
+                raise ValueError(
+                    "No moderation model specified and no default_shield_id configured in safety config: select model "
+                    f"from {[s.provider_resource_id or s.identifier for s in shields]}"
+                )
+
+            selected_shield = next((s for s in shields if s.identifier == default_shield_id), None)
+            if selected_shield is None:
+                raise ValueError(
+                    f"Default moderation model not found. Choose from {[s.provider_resource_id or s.identifier for s in shields]}."
+                )
+
+            provider_model = selected_shield.provider_resource_id
+
+        shield_id = selected_shield.identifier
         logger.debug(f"SafetyRouter.run_moderation: {shield_id}")
         provider = await self.routing_table.get_provider_impl(shield_id)
 
         response = await provider.run_moderation(
             input=input,
-            model=
+            model=provider_model,
         )
 
         return response
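With the change above, `run_moderation` no longer requires a `model`: when it is omitted, the router falls back to `default_shield_id` from the safety config and forwards that shield's `provider_resource_id` to the provider. A hedged sketch of both call styles; the router instance, shield identifiers, and input text are placeholders:

```python
# Hypothetical caller; `safety_router` is a SafetyRouter constructed with
# SafetyConfig(default_shield_id="llama-guard") -- identifiers are illustrative.
async def moderate(safety_router) -> None:
    # Explicit model: must match a registered shield's provider_resource_id.
    explicit = await safety_router.run_moderation(
        input="some user supplied text",
        model="llama-guard-provider-model",
    )
    # No model: resolved via default_shield_id; a ValueError is raised when no
    # default is configured or the default shield is not registered.
    implicit = await safety_router.run_moderation(input="some user supplied text")
    print(explicit, implicit)
```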
llama_stack/core/routers/tool_runtime.py

@@ -6,19 +6,12 @@
 
 from typing import Any
 
-from llama_stack.
+from llama_stack.log import get_logger
+from llama_stack_api import (
     URL,
-    InterleavedContent,
-)
-from llama_stack.apis.tools import (
     ListToolDefsResponse,
-    RAGDocument,
-    RAGQueryConfig,
-    RAGQueryResult,
-    RAGToolRuntime,
     ToolRuntime,
 )
-from llama_stack.log import get_logger
 
 from ..routing_tables.toolgroups import ToolGroupsRoutingTable
 
@@ -26,36 +19,6 @@ logger = get_logger(name=__name__, category="core::routers")
 
 
 class ToolRuntimeRouter(ToolRuntime):
-    class RagToolImpl(RAGToolRuntime):
-        def __init__(
-            self,
-            routing_table: ToolGroupsRoutingTable,
-        ) -> None:
-            logger.debug("Initializing ToolRuntimeRouter.RagToolImpl")
-            self.routing_table = routing_table
-
-        async def query(
-            self,
-            content: InterleavedContent,
-            vector_store_ids: list[str],
-            query_config: RAGQueryConfig | None = None,
-        ) -> RAGQueryResult:
-            logger.debug(f"ToolRuntimeRouter.RagToolImpl.query: {vector_store_ids}")
-            provider = await self.routing_table.get_provider_impl("knowledge_search")
-            return await provider.query(content, vector_store_ids, query_config)
-
-        async def insert(
-            self,
-            documents: list[RAGDocument],
-            vector_store_id: str,
-            chunk_size_in_tokens: int = 512,
-        ) -> None:
-            logger.debug(
-                f"ToolRuntimeRouter.RagToolImpl.insert: {vector_store_id}, {len(documents)} documents, chunk_size={chunk_size_in_tokens}"
-            )
-            provider = await self.routing_table.get_provider_impl("insert_into_memory")
-            return await provider.insert(documents, vector_store_id, chunk_size_in_tokens)
-
     def __init__(
         self,
         routing_table: ToolGroupsRoutingTable,
@@ -63,11 +26,6 @@ class ToolRuntimeRouter(ToolRuntime):
         logger.debug("Initializing ToolRuntimeRouter")
         self.routing_table = routing_table
 
-        # HACK ALERT this should be in sync with "get_all_api_endpoints()"
-        self.rag_tool = self.RagToolImpl(routing_table)
-        for method in ("query", "insert"):
-            setattr(self, f"rag_tool.{method}", getattr(self.rag_tool, method))
-
     async def initialize(self) -> None:
         logger.debug("ToolRuntimeRouter.initialize")
         pass
@@ -76,16 +34,16 @@
         logger.debug("ToolRuntimeRouter.shutdown")
         pass
 
-    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any]) -> Any:
+    async def invoke_tool(self, tool_name: str, kwargs: dict[str, Any], authorization: str | None = None) -> Any:
         logger.debug(f"ToolRuntimeRouter.invoke_tool: {tool_name}")
         provider = await self.routing_table.get_provider_impl(tool_name)
         return await provider.invoke_tool(
             tool_name=tool_name,
             kwargs=kwargs,
+            authorization=authorization,
         )
 
     async def list_runtime_tools(
-        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None
+        self, tool_group_id: str | None = None, mcp_endpoint: URL | None = None, authorization: str | None = None
     ) -> ListToolDefsResponse:
-
-        return await self.routing_table.list_tools(tool_group_id)
+        return await self.routing_table.list_tools(tool_group_id, authorization=authorization)