llama-stack 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/__init__.py +0 -5
- llama_stack/cli/llama.py +3 -3
- llama_stack/cli/stack/_list_deps.py +12 -23
- llama_stack/cli/stack/list_stacks.py +37 -18
- llama_stack/cli/stack/run.py +121 -11
- llama_stack/cli/stack/utils.py +0 -127
- llama_stack/core/access_control/access_control.py +69 -28
- llama_stack/core/access_control/conditions.py +15 -5
- llama_stack/core/admin.py +267 -0
- llama_stack/core/build.py +6 -74
- llama_stack/core/client.py +1 -1
- llama_stack/core/configure.py +6 -6
- llama_stack/core/conversations/conversations.py +28 -25
- llama_stack/core/datatypes.py +271 -79
- llama_stack/core/distribution.py +15 -16
- llama_stack/core/external.py +3 -3
- llama_stack/core/inspect.py +98 -15
- llama_stack/core/library_client.py +73 -61
- llama_stack/core/prompts/prompts.py +12 -11
- llama_stack/core/providers.py +17 -11
- llama_stack/core/resolver.py +65 -56
- llama_stack/core/routers/__init__.py +8 -12
- llama_stack/core/routers/datasets.py +1 -4
- llama_stack/core/routers/eval_scoring.py +7 -4
- llama_stack/core/routers/inference.py +55 -271
- llama_stack/core/routers/safety.py +52 -24
- llama_stack/core/routers/tool_runtime.py +6 -48
- llama_stack/core/routers/vector_io.py +130 -51
- llama_stack/core/routing_tables/benchmarks.py +24 -20
- llama_stack/core/routing_tables/common.py +1 -4
- llama_stack/core/routing_tables/datasets.py +22 -22
- llama_stack/core/routing_tables/models.py +119 -6
- llama_stack/core/routing_tables/scoring_functions.py +7 -7
- llama_stack/core/routing_tables/shields.py +1 -2
- llama_stack/core/routing_tables/toolgroups.py +17 -7
- llama_stack/core/routing_tables/vector_stores.py +51 -16
- llama_stack/core/server/auth.py +5 -3
- llama_stack/core/server/auth_providers.py +36 -20
- llama_stack/core/server/fastapi_router_registry.py +84 -0
- llama_stack/core/server/quota.py +2 -2
- llama_stack/core/server/routes.py +79 -27
- llama_stack/core/server/server.py +102 -87
- llama_stack/core/stack.py +201 -58
- llama_stack/core/storage/datatypes.py +26 -3
- llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
- llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
- llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
- llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
- llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
- llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
- llama_stack/core/storage/sqlstore/__init__.py +17 -0
- llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/config.py +8 -2
- llama_stack/core/utils/config_resolution.py +32 -29
- llama_stack/core/utils/context.py +4 -10
- llama_stack/core/utils/exec.py +9 -0
- llama_stack/core/utils/type_inspection.py +45 -0
- llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/dell/dell.py +2 -2
- llama_stack/distributions/dell/run-with-safety.yaml +3 -2
- llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
- llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
- llama_stack/distributions/nvidia/nvidia.py +1 -1
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
- llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
- llama_stack/distributions/oci/config.yaml +134 -0
- llama_stack/distributions/oci/oci.py +108 -0
- llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
- llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
- llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/starter/starter.py +8 -5
- llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/template.py +13 -69
- llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/watsonx/watsonx.py +1 -1
- llama_stack/log.py +28 -11
- llama_stack/models/llama/checkpoint.py +6 -6
- llama_stack/models/llama/hadamard_utils.py +2 -0
- llama_stack/models/llama/llama3/generation.py +3 -1
- llama_stack/models/llama/llama3/interface.py +2 -5
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
- llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
- llama_stack/models/llama/llama3/tool_utils.py +2 -1
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
- llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
- llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
- llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
- llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
- llama_stack/providers/inline/batches/reference/__init__.py +2 -4
- llama_stack/providers/inline/batches/reference/batches.py +78 -60
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
- llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
- llama_stack/providers/inline/files/localfs/files.py +37 -28
- llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
- llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
- llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
- llama_stack/providers/inline/post_training/common/validator.py +1 -5
- llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
- llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
- llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
- llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
- llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
- llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/faiss.py +43 -28
- llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +40 -33
- llama_stack/providers/registry/agents.py +7 -3
- llama_stack/providers/registry/batches.py +1 -1
- llama_stack/providers/registry/datasetio.py +1 -1
- llama_stack/providers/registry/eval.py +1 -1
- llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
- llama_stack/providers/registry/files.py +11 -2
- llama_stack/providers/registry/inference.py +22 -3
- llama_stack/providers/registry/post_training.py +1 -1
- llama_stack/providers/registry/safety.py +1 -1
- llama_stack/providers/registry/scoring.py +1 -1
- llama_stack/providers/registry/tool_runtime.py +2 -2
- llama_stack/providers/registry/vector_io.py +7 -7
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
- llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
- llama_stack/providers/remote/files/openai/__init__.py +19 -0
- llama_stack/providers/remote/files/openai/config.py +28 -0
- llama_stack/providers/remote/files/openai/files.py +253 -0
- llama_stack/providers/remote/files/s3/files.py +52 -30
- llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
- llama_stack/providers/remote/inference/anthropic/config.py +1 -1
- llama_stack/providers/remote/inference/azure/azure.py +1 -3
- llama_stack/providers/remote/inference/azure/config.py +8 -7
- llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
- llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
- llama_stack/providers/remote/inference/bedrock/config.py +24 -3
- llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
- llama_stack/providers/remote/inference/cerebras/config.py +12 -5
- llama_stack/providers/remote/inference/databricks/config.py +13 -6
- llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
- llama_stack/providers/remote/inference/fireworks/config.py +5 -5
- llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
- llama_stack/providers/remote/inference/gemini/config.py +1 -1
- llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
- llama_stack/providers/remote/inference/groq/config.py +5 -5
- llama_stack/providers/remote/inference/groq/groq.py +1 -1
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
- llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
- llama_stack/providers/remote/inference/nvidia/config.py +21 -11
- llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
- llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
- llama_stack/providers/remote/inference/oci/__init__.py +17 -0
- llama_stack/providers/remote/inference/oci/auth.py +79 -0
- llama_stack/providers/remote/inference/oci/config.py +75 -0
- llama_stack/providers/remote/inference/oci/oci.py +162 -0
- llama_stack/providers/remote/inference/ollama/config.py +7 -5
- llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
- llama_stack/providers/remote/inference/openai/config.py +4 -4
- llama_stack/providers/remote/inference/openai/openai.py +1 -1
- llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
- llama_stack/providers/remote/inference/passthrough/config.py +5 -10
- llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
- llama_stack/providers/remote/inference/runpod/config.py +12 -5
- llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
- llama_stack/providers/remote/inference/sambanova/config.py +5 -5
- llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
- llama_stack/providers/remote/inference/tgi/config.py +7 -6
- llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
- llama_stack/providers/remote/inference/together/config.py +5 -5
- llama_stack/providers/remote/inference/together/together.py +15 -12
- llama_stack/providers/remote/inference/vertexai/config.py +1 -1
- llama_stack/providers/remote/inference/vllm/config.py +5 -5
- llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
- llama_stack/providers/remote/inference/watsonx/config.py +4 -4
- llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
- llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
- llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
- llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
- llama_stack/providers/remote/safety/bedrock/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
- llama_stack/providers/remote/safety/sambanova/config.py +1 -1
- llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
- llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/chroma/chroma.py +125 -20
- llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/milvus.py +27 -21
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +26 -18
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +141 -24
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +26 -21
- llama_stack/providers/utils/common/data_schema_validator.py +1 -5
- llama_stack/providers/utils/files/form_data.py +1 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
- llama_stack/providers/utils/inference/inference_store.py +7 -8
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
- llama_stack/providers/utils/inference/model_registry.py +1 -3
- llama_stack/providers/utils/inference/openai_compat.py +44 -1171
- llama_stack/providers/utils/inference/openai_mixin.py +68 -42
- llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
- llama_stack/providers/utils/inference/stream_utils.py +23 -0
- llama_stack/providers/utils/memory/__init__.py +2 -0
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
- llama_stack/providers/utils/memory/vector_store.py +39 -38
- llama_stack/providers/utils/pagination.py +1 -1
- llama_stack/providers/utils/responses/responses_store.py +15 -25
- llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
- llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
- llama_stack/providers/utils/tools/mcp.py +93 -11
- llama_stack/telemetry/constants.py +27 -0
- llama_stack/telemetry/helpers.py +43 -0
- llama_stack/testing/api_recorder.py +25 -16
- {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/METADATA +56 -54
- llama_stack-0.4.0.dist-info/RECORD +588 -0
- llama_stack-0.4.0.dist-info/top_level.txt +2 -0
- llama_stack_api/__init__.py +945 -0
- llama_stack_api/admin/__init__.py +45 -0
- llama_stack_api/admin/api.py +72 -0
- llama_stack_api/admin/fastapi_routes.py +117 -0
- llama_stack_api/admin/models.py +113 -0
- llama_stack_api/agents.py +173 -0
- llama_stack_api/batches/__init__.py +40 -0
- llama_stack_api/batches/api.py +53 -0
- llama_stack_api/batches/fastapi_routes.py +113 -0
- llama_stack_api/batches/models.py +78 -0
- llama_stack_api/benchmarks/__init__.py +43 -0
- llama_stack_api/benchmarks/api.py +39 -0
- llama_stack_api/benchmarks/fastapi_routes.py +109 -0
- llama_stack_api/benchmarks/models.py +109 -0
- {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
- {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
- {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
- llama_stack_api/common/responses.py +77 -0
- {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
- {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
- llama_stack_api/connectors.py +146 -0
- {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
- {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
- llama_stack_api/datasets/__init__.py +61 -0
- llama_stack_api/datasets/api.py +35 -0
- llama_stack_api/datasets/fastapi_routes.py +104 -0
- llama_stack_api/datasets/models.py +152 -0
- {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
- {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
- llama_stack_api/file_processors/__init__.py +27 -0
- llama_stack_api/file_processors/api.py +64 -0
- llama_stack_api/file_processors/fastapi_routes.py +78 -0
- llama_stack_api/file_processors/models.py +42 -0
- llama_stack_api/files/__init__.py +35 -0
- llama_stack_api/files/api.py +51 -0
- llama_stack_api/files/fastapi_routes.py +124 -0
- llama_stack_api/files/models.py +107 -0
- {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
- llama_stack_api/inspect_api/__init__.py +37 -0
- llama_stack_api/inspect_api/api.py +25 -0
- llama_stack_api/inspect_api/fastapi_routes.py +76 -0
- llama_stack_api/inspect_api/models.py +28 -0
- {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
- llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
- llama_stack_api/internal/sqlstore.py +79 -0
- {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
- {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
- {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
- {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
- llama_stack_api/providers/__init__.py +33 -0
- llama_stack_api/providers/api.py +16 -0
- llama_stack_api/providers/fastapi_routes.py +57 -0
- llama_stack_api/providers/models.py +24 -0
- {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
- {llama_stack/apis → llama_stack_api}/resource.py +1 -1
- llama_stack_api/router_utils.py +160 -0
- {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
- {llama_stack → llama_stack_api}/schema_utils.py +94 -4
- {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
- {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
- {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
- {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
- {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
- {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
- llama_stack/apis/agents/agents.py +0 -894
- llama_stack/apis/batches/__init__.py +0 -9
- llama_stack/apis/batches/batches.py +0 -100
- llama_stack/apis/benchmarks/__init__.py +0 -7
- llama_stack/apis/benchmarks/benchmarks.py +0 -108
- llama_stack/apis/common/responses.py +0 -36
- llama_stack/apis/conversations/__init__.py +0 -31
- llama_stack/apis/datasets/datasets.py +0 -251
- llama_stack/apis/datatypes.py +0 -160
- llama_stack/apis/eval/__init__.py +0 -7
- llama_stack/apis/files/__init__.py +0 -7
- llama_stack/apis/files/files.py +0 -199
- llama_stack/apis/inference/__init__.py +0 -7
- llama_stack/apis/inference/event_logger.py +0 -43
- llama_stack/apis/inspect/__init__.py +0 -7
- llama_stack/apis/inspect/inspect.py +0 -94
- llama_stack/apis/models/__init__.py +0 -7
- llama_stack/apis/post_training/__init__.py +0 -7
- llama_stack/apis/prompts/__init__.py +0 -9
- llama_stack/apis/providers/__init__.py +0 -7
- llama_stack/apis/providers/providers.py +0 -69
- llama_stack/apis/safety/__init__.py +0 -7
- llama_stack/apis/scoring/__init__.py +0 -7
- llama_stack/apis/scoring_functions/__init__.py +0 -7
- llama_stack/apis/shields/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
- llama_stack/apis/telemetry/__init__.py +0 -7
- llama_stack/apis/telemetry/telemetry.py +0 -423
- llama_stack/apis/tools/__init__.py +0 -8
- llama_stack/apis/vector_io/__init__.py +0 -7
- llama_stack/apis/vector_stores/__init__.py +0 -7
- llama_stack/core/server/tracing.py +0 -80
- llama_stack/core/ui/app.py +0 -55
- llama_stack/core/ui/modules/__init__.py +0 -5
- llama_stack/core/ui/modules/api.py +0 -32
- llama_stack/core/ui/modules/utils.py +0 -42
- llama_stack/core/ui/page/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/datasets.py +0 -18
- llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
- llama_stack/core/ui/page/distribution/models.py +0 -18
- llama_stack/core/ui/page/distribution/providers.py +0 -27
- llama_stack/core/ui/page/distribution/resources.py +0 -48
- llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
- llama_stack/core/ui/page/distribution/shields.py +0 -19
- llama_stack/core/ui/page/evaluations/__init__.py +0 -5
- llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
- llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
- llama_stack/core/ui/page/playground/__init__.py +0 -5
- llama_stack/core/ui/page/playground/chat.py +0 -130
- llama_stack/core/ui/page/playground/tools.py +0 -352
- llama_stack/distributions/dell/build.yaml +0 -33
- llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
- llama_stack/distributions/nvidia/build.yaml +0 -29
- llama_stack/distributions/open-benchmark/build.yaml +0 -36
- llama_stack/distributions/postgres-demo/__init__.py +0 -7
- llama_stack/distributions/postgres-demo/build.yaml +0 -23
- llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
- llama_stack/distributions/starter/build.yaml +0 -61
- llama_stack/distributions/starter-gpu/build.yaml +0 -61
- llama_stack/distributions/watsonx/build.yaml +0 -33
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
- llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
- llama_stack/providers/inline/telemetry/__init__.py +0 -5
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
- llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
- llama_stack/providers/remote/inference/bedrock/models.py +0 -29
- llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
- llama_stack/providers/utils/sqlstore/__init__.py +0 -5
- llama_stack/providers/utils/sqlstore/api.py +0 -128
- llama_stack/providers/utils/telemetry/__init__.py +0 -5
- llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
- llama_stack/providers/utils/telemetry/tracing.py +0 -384
- llama_stack/strong_typing/__init__.py +0 -19
- llama_stack/strong_typing/auxiliary.py +0 -228
- llama_stack/strong_typing/classdef.py +0 -440
- llama_stack/strong_typing/core.py +0 -46
- llama_stack/strong_typing/deserializer.py +0 -877
- llama_stack/strong_typing/docstring.py +0 -409
- llama_stack/strong_typing/exception.py +0 -23
- llama_stack/strong_typing/inspection.py +0 -1085
- llama_stack/strong_typing/mapping.py +0 -40
- llama_stack/strong_typing/name.py +0 -182
- llama_stack/strong_typing/schema.py +0 -792
- llama_stack/strong_typing/serialization.py +0 -97
- llama_stack/strong_typing/serializer.py +0 -500
- llama_stack/strong_typing/slots.py +0 -27
- llama_stack/strong_typing/topological.py +0 -89
- llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
- llama_stack-0.3.5.dist-info/RECORD +0 -625
- llama_stack-0.3.5.dist-info/top_level.txt +0 -1
- /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
- /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
- /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
- {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/WHEEL +0 -0
- {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.3.5.dist-info → llama_stack-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
- {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
- {llama_stack/apis → llama_stack_api}/version.py +0 -0
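The listing above shows the API surface moving out of `llama_stack.apis.*` (and `llama_stack.providers.datatypes`) into a new top-level `llama_stack_api` package (note the second entry in `top_level.txt`). As an illustrative sketch of what that means for downstream imports, with symbol names taken from the diffs below (verify against the installed 0.4.0 wheel):

```python
# Sketch of the import-path migration implied by the renames above; illustrative only.

# llama-stack 0.3.5 style:
#   from llama_stack.apis.inference import InferenceProvider, OpenAIChatCompletionRequestWithExtraBody
#   from llama_stack.apis.models import Model, ModelType
#   from llama_stack.providers.datatypes import ModelsProtocolPrivate

# llama-stack 0.4.0 style: the same names are exported from the new llama_stack_api package.
from llama_stack_api import (
    InferenceProvider,
    Model,
    ModelsProtocolPrivate,
    ModelType,
    OpenAIChatCompletionRequestWithExtraBody,
)
```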
llama_stack/providers/inline/inference/meta_reference/generators.py

@@ -5,29 +5,26 @@
 # the root directory of this source tree.
 
 import math
-from collections.abc import Generator
 from typing import Optional
 
 import torch
 from lmformatenforcer import JsonSchemaParser, TokenEnforcer, TokenEnforcerTokenizerData
 
-from llama_stack.
-    GreedySamplingStrategy,
-    JsonSchemaResponseFormat,
-    ResponseFormat,
-    SamplingParams,
-    TopPSamplingStrategy,
-)
-from llama_stack.models.llama.datatypes import QuantizationMode
+from llama_stack.models.llama.datatypes import QuantizationMode, ToolPromptFormat
 from llama_stack.models.llama.llama3.generation import Llama3
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.generation import Llama4
 from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
 from llama_stack.models.llama.sku_types import Model, ModelFamily
-from
-
-
-
+from llama_stack_api import (
+    GreedySamplingStrategy,
+    JsonSchemaResponseFormat,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIResponseFormatJSONSchema,
+    ResponseFormat,
+    ResponseFormatType,
+    SamplingParams,
+    TopPSamplingStrategy,
 )
 
 from .common import model_checkpoint_dir

@@ -106,14 +103,6 @@ def _infer_sampling_params(sampling_params: SamplingParams):
     return temperature, top_p
 
 
-def _infer_tool_prompt_format(request: ChatCompletionRequestWithRawContent):
-    tool_config = request.tool_config
-    if tool_config is not None and tool_config.tool_prompt_format is not None:
-        return tool_config.tool_prompt_format
-    else:
-        return get_default_tool_prompt_format(request.model)
-
-
 class LlamaGenerator:
     def __init__(
         self,

@@ -157,55 +146,56 @@ class LlamaGenerator:
         self.args = self.inner_generator.args
         self.formatter = self.inner_generator.formatter
 
-    def completion(
-        self,
-        request_batch: list[CompletionRequestWithRawContent],
-    ) -> Generator:
-        first_request = request_batch[0]
-        sampling_params = first_request.sampling_params or SamplingParams()
-        max_gen_len = sampling_params.max_tokens
-        if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
-            max_gen_len = self.args.max_seq_len - 1
-
-        temperature, top_p = _infer_sampling_params(sampling_params)
-        yield from self.inner_generator.generate(
-            llm_inputs=[self.formatter.encode_content(request.content) for request in request_batch],
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-            logprobs=bool(first_request.logprobs),
-            echo=False,
-            logits_processor=get_logits_processor(
-                self.tokenizer,
-                self.args.vocab_size,
-                first_request.response_format,
-            ),
-        )
-
     def chat_completion(
         self,
-
-
-
-
+        request: OpenAIChatCompletionRequestWithExtraBody,
+        raw_messages: list,
+    ):
+        """Generate chat completion using OpenAI request format.
+
+        Args:
+            request: OpenAI chat completion request
+            raw_messages: Pre-converted list of RawMessage objects
+        """
+
+        # Determine tool prompt format
+        tool_prompt_format = ToolPromptFormat.json if request.tools else ToolPromptFormat.json
+
+        # Prepare sampling params
+        sampling_params = SamplingParams()
+        if request.temperature is not None or request.top_p is not None:
+            sampling_params.strategy = TopPSamplingStrategy(
+                temperature=request.temperature if request.temperature is not None else 1.0,
+                top_p=request.top_p if request.top_p is not None else 1.0,
+            )
+        if request.max_tokens:
+            sampling_params.max_tokens = request.max_tokens
+
         max_gen_len = sampling_params.max_tokens
         if max_gen_len is None or max_gen_len == 0 or max_gen_len >= self.args.max_seq_len:
             max_gen_len = self.args.max_seq_len - 1
 
         temperature, top_p = _infer_sampling_params(sampling_params)
+
+        # Get logits processor for response format
+        logits_processor = None
+        if request.response_format:
+            if isinstance(request.response_format, OpenAIResponseFormatJSONSchema):
+                # Extract the actual schema from OpenAIJSONSchema TypedDict
+                schema_dict = request.response_format.json_schema.get("schema") or {}
+                json_schema_format = JsonSchemaResponseFormat(
+                    type=ResponseFormatType.json_schema,
+                    json_schema=schema_dict,
+                )
+                logits_processor = get_logits_processor(self.tokenizer, self.args.vocab_size, json_schema_format)
+
+        # Generate
         yield from self.inner_generator.generate(
-            llm_inputs=[
-                self.formatter.encode_dialog_prompt(request.messages, _infer_tool_prompt_format(request))
-                for request in request_batch
-            ],
+            llm_inputs=[self.formatter.encode_dialog_prompt(raw_messages, tool_prompt_format)],
             max_gen_len=max_gen_len,
             temperature=temperature,
             top_p=top_p,
-            logprobs=
+            logprobs=False,
             echo=False,
-            logits_processor=
-                self.tokenizer,
-                self.args.vocab_size,
-                first_request.response_format,
-            ),
+            logits_processor=logits_processor,
         )
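The rewritten `LlamaGenerator.chat_completion` above accepts an OpenAI-style request plus pre-converted raw messages, maps `temperature`/`top_p` onto a `TopPSamplingStrategy` and `max_tokens` onto `SamplingParams`, and routes a `json_schema` response format through the lmformatenforcer-based logits processor. A hedged caller-side sketch of such a request follows; the model id and message are illustrative and the exact constructor signatures should be checked against `llama_stack_api` 0.4.0.

```python
# Illustrative request construction for the new chat_completion path; not taken verbatim
# from the package. Types come from llama_stack_api as imported in the diff above.
from llama_stack_api import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam

request = OpenAIChatCompletionRequestWithExtraBody(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model id
    messages=[OpenAIUserMessageParam(role="user", content="Summarize the release in one line.")],
    temperature=0.7,  # becomes TopPSamplingStrategy(temperature=0.7, top_p=1.0)
    max_tokens=128,   # becomes SamplingParams.max_tokens
    # A response_format of type "json_schema" would additionally activate the
    # JsonSchemaResponseFormat -> get_logits_processor constrained-decoding path.
)
```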
llama_stack/providers/inline/inference/meta_reference/inference.py

@@ -5,27 +5,25 @@
 # the root directory of this source tree.
 
 import asyncio
+import time
+import uuid
 from collections.abc import AsyncIterator
 
-from llama_stack.apis.inference import (
-    InferenceProvider,
-    OpenAIChatCompletionRequestWithExtraBody,
-    OpenAICompletionRequestWithExtraBody,
-)
-from llama_stack.apis.inference.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAICompletion,
-)
-from llama_stack.apis.models import Model, ModelType
 from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import RawMessage, RawTextItem, ToolDefinition
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama3.prompt_templates import (
+    JsonCustomToolGenerator,
+    SystemDefaultGenerator,
+)
 from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
+from llama_stack.models.llama.llama4.prompt_templates.system_prompts import (
+    PythonListCustomToolGenerator as PythonListCustomToolGeneratorLlama4,
+)
 from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
 from llama_stack.models.llama.sku_list import resolve_model
-from llama_stack.models.llama.sku_types import ModelFamily
-from llama_stack.providers.datatypes import ModelsProtocolPrivate
+from llama_stack.models.llama.sku_types import ModelFamily, is_multimodal
 from llama_stack.providers.utils.inference.embedding_mixin import (
     SentenceTransformerEmbeddingMixin,
 )

@@ -33,6 +31,22 @@ from llama_stack.providers.utils.inference.model_registry import (
     ModelRegistryHelper,
     build_hf_repo_model_entry,
 )
+from llama_stack_api import (
+    InferenceProvider,
+    Model,
+    ModelsProtocolPrivate,
+    ModelType,
+    OpenAIAssistantMessageParam,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionUsage,
+    OpenAIChoice,
+    OpenAICompletion,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAIUserMessageParam,
+    ToolChoice,
+)
 
 from .config import MetaReferenceInferenceConfig
 from .generators import LlamaGenerator

@@ -44,6 +58,170 @@ log = get_logger(__name__, category="inference")
 SEMAPHORE = asyncio.Semaphore(1)
 
 
+def _convert_openai_tool_to_tool_definition(tool) -> ToolDefinition:
+    """Convert OpenAI tool format to ToolDefinition format."""
+    # OpenAI tools have function.name and function.parameters
+    return ToolDefinition(
+        tool_name=tool.function.name,
+        description=tool.function.description or "",
+        parameters=tool.function.parameters or {},
+    )
+
+
+def _get_tool_choice_prompt(tool_choice, tools) -> str:
+    """Generate prompt text for tool_choice behavior."""
+    if not tool_choice or tool_choice == ToolChoice.auto or tool_choice == "auto":
+        return ""
+    elif tool_choice == ToolChoice.required or tool_choice == "required":
+        return "You MUST use one of the provided functions/tools to answer the user query."
+    elif tool_choice == ToolChoice.none or tool_choice == "none":
+        return ""
+    else:
+        # Specific tool specified
+        return f"You MUST use the tool `{tool_choice}` to answer the user query."
+
+
+def _raw_content_as_str(content) -> str:
+    """Convert RawContent to string for system messages."""
+    if isinstance(content, str):
+        return content
+    elif isinstance(content, RawTextItem):
+        return content.text
+    elif isinstance(content, list):
+        return "\n".join(_raw_content_as_str(c) for c in content)
+    else:
+        return "<media>"
+
+
+def _augment_raw_messages_for_tools_llama_3_1(
+    raw_messages: list[RawMessage],
+    tools: list,
+    tool_choice,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions for Llama 3.1 style models."""
+    messages = raw_messages.copy()
+    existing_system_message = None
+    if messages and messages[0].role == "system":
+        existing_system_message = messages.pop(0)
+
+    sys_content = ""
+
+    # Add tool definitions first (if present)
+    if tools:
+        # Convert OpenAI tools to ToolDefinitions
+        tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
+
+        # For OpenAI format, all tools are custom (have string names)
+        tool_gen = JsonCustomToolGenerator()
+        tool_template = tool_gen.gen(tool_definitions)
+        sys_content += tool_template.render()
+        sys_content += "\n"
+
+    # Add default system prompt
+    default_gen = SystemDefaultGenerator()
+    default_template = default_gen.gen()
+    sys_content += default_template.render()
+
+    # Add existing system message if present
+    if existing_system_message:
+        sys_content += "\n" + _raw_content_as_str(existing_system_message.content)
+
+    # Add tool choice prompt if needed
+    if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
+        sys_content += "\n" + tool_choice_prompt
+
+    # Create new system message
+    new_system_message = RawMessage(
+        role="system",
+        content=[RawTextItem(text=sys_content.strip())],
+    )
+
+    return [new_system_message] + messages
+
+
+def _augment_raw_messages_for_tools_llama_4(
+    raw_messages: list[RawMessage],
+    tools: list,
+    tool_choice,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions for Llama 4/3.2/3.3 style models."""
+    messages = raw_messages.copy()
+    existing_system_message = None
+    if messages and messages[0].role == "system":
+        existing_system_message = messages.pop(0)
+
+    sys_content = ""
+
+    # Add tool definitions if present
+    if tools:
+        # Convert OpenAI tools to ToolDefinitions
+        tool_definitions = [_convert_openai_tool_to_tool_definition(t) for t in tools]
+
+        # Use python_list format for Llama 4
+        tool_gen = PythonListCustomToolGeneratorLlama4()
+        system_prompt = None
+        if existing_system_message:
+            system_prompt = _raw_content_as_str(existing_system_message.content)
+
+        tool_template = tool_gen.gen(tool_definitions, system_prompt)
+        sys_content = tool_template.render()
+    elif existing_system_message:
+        # No tools, just use existing system message
+        sys_content = _raw_content_as_str(existing_system_message.content)
+
+    # Add tool choice prompt if needed
+    if tool_choice_prompt := _get_tool_choice_prompt(tool_choice, tools):
+        sys_content += "\n" + tool_choice_prompt
+
+    if sys_content:
+        new_system_message = RawMessage(
+            role="system",
+            content=[RawTextItem(text=sys_content.strip())],
+        )
+        return [new_system_message] + messages
+
+    return messages
+
+
+def augment_raw_messages_for_tools(
+    raw_messages: list[RawMessage],
+    params: OpenAIChatCompletionRequestWithExtraBody,
+    llama_model,
+) -> list[RawMessage]:
+    """Augment raw messages with tool definitions based on model family."""
+    if not params.tools:
+        return raw_messages
+
+    # Determine augmentation strategy based on model family
+    if llama_model.model_family == ModelFamily.llama3_1 or (
+        llama_model.model_family == ModelFamily.llama3_2 and is_multimodal(llama_model.core_model_id)
+    ):
+        # Llama 3.1 and Llama 3.2 multimodal use JSON format
+        return _augment_raw_messages_for_tools_llama_3_1(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+    elif llama_model.model_family in (
+        ModelFamily.llama3_2,
+        ModelFamily.llama3_3,
+        ModelFamily.llama4,
+    ):
+        # Llama 3.2/3.3/4 use python_list format
+        return _augment_raw_messages_for_tools_llama_4(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+    else:
+        # Default to Llama 3.1 style
+        return _augment_raw_messages_for_tools_llama_3_1(
+            raw_messages,
+            params.tools,
+            params.tool_choice,
+        )
+
+
 def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
     return LlamaGenerator(config, model_id, llama_model)
 
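The new helper functions above turn OpenAI-format tool specs into `ToolDefinition`s and fold them into a synthesized system message, picking the JSON template for Llama 3.1 (and multimodal 3.2) models and the python_list template for Llama 3.2/3.3/4. Below is a small hedged illustration of the conversion step; the tool object is a stand-in exposing only the `function.name/description/parameters` attributes the converter reads, not a type from the package.

```python
# Stand-in illustration of _convert_openai_tool_to_tool_definition(); runnable with the
# standard library only. The real request carries proper OpenAI tool param objects.
from types import SimpleNamespace

openai_tool = SimpleNamespace(
    function=SimpleNamespace(
        name="get_weather",
        description="Look up the current weather for a city.",
        parameters={
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    )
)

# _convert_openai_tool_to_tool_definition(openai_tool) would then yield roughly:
#   ToolDefinition(tool_name="get_weather",
#                  description="Look up the current weather for a city.",
#                  parameters={"type": "object", ...})
# and augment_raw_messages_for_tools() renders such definitions into a system RawMessage
# that is prepended to the conversation before prompt encoding.
```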
@@ -68,7 +246,7 @@ class MetaReferenceInferenceImpl(
     async def openai_completion(
         self,
         params: OpenAICompletionRequestWithExtraBody,
-    ) -> OpenAICompletion:
+    ) -> OpenAICompletion | AsyncIterator[OpenAICompletion]:
         raise NotImplementedError("OpenAI completion not supported by meta reference provider")
 
     async def should_refresh_models(self) -> bool:

@@ -136,17 +314,20 @@ class MetaReferenceInferenceImpl(
         self.llama_model = llama_model
 
         log.info("Warming up...")
+
         await self.openai_chat_completion(
-
-
-
+            params=OpenAIChatCompletionRequestWithExtraBody(
+                model=model_id,
+                messages=[OpenAIUserMessageParam(role="user", content="Hi how are you?")],
+                max_tokens=20,
+            )
         )
         log.info("Warmed up!")
 
     def check_model(self, request) -> None:
         if self.model_id is None or self.llama_model is None:
             raise RuntimeError(
-                "No
+                "No available model yet, please register your requested model or add your model in the resources first"
             )
         elif request.model != self.model_id:
             raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")

@@ -155,4 +336,207 @@ class MetaReferenceInferenceImpl(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-
+        self.check_model(params)
+
+        # Convert OpenAI messages to RawMessages
+        from llama_stack.models.llama.datatypes import StopReason
+        from llama_stack.providers.utils.inference.prompt_adapter import (
+            convert_openai_message_to_raw_message,
+            decode_assistant_message,
+        )
+
+        raw_messages = [await convert_openai_message_to_raw_message(msg) for msg in params.messages]
+
+        # Augment messages with tool definitions if tools are present
+        raw_messages = augment_raw_messages_for_tools(raw_messages, params, self.llama_model)
+
+        # Call generator's chat_completion method (works for both single-GPU and model-parallel)
+        if isinstance(self.generator, LlamaGenerator):
+            generator = self.generator.chat_completion(params, raw_messages)
+        else:
+            # Model parallel: submit task to process group
+            generator = self.generator.group.run_inference(("chat_completion", [params, raw_messages]))
+
+        # Check if streaming is requested
+        if params.stream:
+            return self._stream_chat_completion(generator, params)
+
+        # Non-streaming: collect all generated text
+        generated_text = ""
+        for result_batch in generator:
+            for result in result_batch:
+                if not result.ignore_token and result.source == "output":
+                    generated_text += result.text
+
+        # Decode assistant message to extract tool calls and determine stop_reason
+        # Default to end_of_turn if generation completed normally
+        decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
+
+        # Convert tool calls to OpenAI format
+        openai_tool_calls = None
+        if decoded_message.tool_calls:
+            from llama_stack_api import (
+                OpenAIChatCompletionToolCall,
+                OpenAIChatCompletionToolCallFunction,
+            )
+
+            openai_tool_calls = [
+                OpenAIChatCompletionToolCall(
+                    # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+                    id=f"call_{uuid.uuid4().hex[:24]}",
+                    type="function",
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=str(tc.tool_name),
+                        arguments=tc.arguments,
+                    ),
+                )
+                for tc in decoded_message.tool_calls
+            ]
+
+        # Determine finish_reason based on whether tool calls are present
+        finish_reason = "tool_calls" if openai_tool_calls else "stop"
+
+        # Extract content from decoded message
+        content = ""
+        if isinstance(decoded_message.content, str):
+            content = decoded_message.content
+        elif isinstance(decoded_message.content, list):
+            for item in decoded_message.content:
+                if isinstance(item, RawTextItem):
+                    content += item.text
+
+        # Create OpenAI response
+        # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+        response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+        created = int(time.time())
+
+        return OpenAIChatCompletion(
+            id=response_id,
+            object="chat.completion",
+            created=created,
+            model=params.model,
+            choices=[
+                OpenAIChoice(
+                    index=0,
+                    message=OpenAIAssistantMessageParam(
+                        role="assistant",
+                        content=content,
+                        tool_calls=openai_tool_calls,
+                    ),
+                    finish_reason=finish_reason,
+                    logprobs=None,
+                )
+            ],
+            usage=OpenAIChatCompletionUsage(
+                prompt_tokens=0,  # TODO: calculate properly
+                completion_tokens=0,  # TODO: calculate properly
+                total_tokens=0,  # TODO: calculate properly
+            ),
+        )
+
+    async def _stream_chat_completion(
+        self,
+        generator,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
+        """Stream chat completion chunks as they're generated."""
+        from llama_stack.models.llama.datatypes import StopReason
+        from llama_stack.providers.utils.inference.prompt_adapter import decode_assistant_message
+        from llama_stack_api import (
+            OpenAIChatCompletionChunk,
+            OpenAIChatCompletionToolCall,
+            OpenAIChatCompletionToolCallFunction,
+            OpenAIChoiceDelta,
+            OpenAIChunkChoice,
+        )
+
+        response_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
+        created = int(time.time())
+        generated_text = ""
+
+        # Yield chunks as tokens are generated
+        for result_batch in generator:
+            for result in result_batch:
+                if result.ignore_token or result.source != "output":
+                    continue
+
+                generated_text += result.text
+
+                # Yield delta chunk with the new text
+                chunk = OpenAIChatCompletionChunk(
+                    id=response_id,
+                    object="chat.completion.chunk",
+                    created=created,
+                    model=params.model,
+                    choices=[
+                        OpenAIChunkChoice(
+                            index=0,
+                            delta=OpenAIChoiceDelta(
+                                role="assistant",
+                                content=result.text,
+                            ),
+                            finish_reason="",
+                            logprobs=None,
+                        )
+                    ],
+                )
+                yield chunk
+
+        # After generation completes, decode the full message to extract tool calls
+        decoded_message = decode_assistant_message(generated_text, StopReason.end_of_turn)
+
+        # If tool calls are present, yield a final chunk with tool_calls
+        if decoded_message.tool_calls:
+            openai_tool_calls = [
+                OpenAIChatCompletionToolCall(
+                    # generate a uuid for the call id. This is the only inline provider that does this, so need to get creative.
+                    id=f"call_{uuid.uuid4().hex[:24]}",
+                    type="function",
+                    function=OpenAIChatCompletionToolCallFunction(
+                        name=str(tc.tool_name),
+                        arguments=tc.arguments,
+                    ),
+                )
+                for tc in decoded_message.tool_calls
+            ]
+
+            # Yield chunk with tool_calls
+            chunk = OpenAIChatCompletionChunk(
+                id=response_id,
+                object="chat.completion.chunk",
+                created=created,
+                model=params.model,
+                choices=[
+                    OpenAIChunkChoice(
+                        index=0,
+                        delta=OpenAIChoiceDelta(
+                            role="assistant",
+                            tool_calls=openai_tool_calls,
+                        ),
+                        finish_reason="",
+                        logprobs=None,
+                    )
+                ],
+            )
+            yield chunk
+
+            finish_reason = "tool_calls"
+        else:
+            finish_reason = "stop"
+
+        # Yield final chunk with finish_reason
+        final_chunk = OpenAIChatCompletionChunk(
+            id=response_id,
+            object="chat.completion.chunk",
+            created=created,
+            model=params.model,
+            choices=[
+                OpenAIChunkChoice(
+                    index=0,
+                    delta=OpenAIChoiceDelta(),
+                    finish_reason=finish_reason,
+                    logprobs=None,
+                )
+            ],
+        )
+        yield final_chunk
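With these changes, the meta-reference provider speaks the OpenAI chat-completions shape end to end: non-streaming calls return an `OpenAIChatCompletion`, and `stream=True` yields `chat.completion.chunk` objects whose last chunk carries the finish reason. A hedged consumer-side sketch follows, assuming an already-initialized provider instance (`impl`); the model id is illustrative.

```python
# Hypothetical consumer of the new streaming path; `impl` stands in for an initialized
# MetaReferenceInferenceImpl and is not constructed here.
from llama_stack_api import OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam


async def stream_reply(impl, model_id: str) -> str:
    params = OpenAIChatCompletionRequestWithExtraBody(
        model=model_id,
        messages=[OpenAIUserMessageParam(role="user", content="Hi how are you?")],
        max_tokens=64,
        stream=True,
    )
    text = ""
    # When stream=True, openai_chat_completion returns an async iterator of chunks.
    async for chunk in await impl.openai_chat_completion(params=params):
        for choice in chunk.choices:
            if choice.delta.content:
                text += choice.delta.content
    return text
```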
llama_stack/providers/inline/inference/meta_reference/model_parallel.py

@@ -4,17 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from collections.abc import Callable
-from copy import deepcopy
+from collections.abc import Callable
 from functools import partial
 from typing import Any
 
 from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
 from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
-from llama_stack.providers.utils.inference.prompt_adapter import (
-    ChatCompletionRequestWithRawContent,
-    CompletionRequestWithRawContent,
-)
 
 from .parallel_utils import ModelParallelProcessGroup
 

@@ -23,12 +18,14 @@ class ModelRunner:
     def __init__(self, llama):
         self.llama = llama
 
-    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
     def __call__(self, task: Any):
-
-
+        task_type = task[0]
+        if task_type == "chat_completion":
+            # task[1] is [params, raw_messages]
+            params, raw_messages = task[1]
+            return self.llama.chat_completion(params, raw_messages)
         else:
-            raise ValueError(f"Unexpected task type {
+            raise ValueError(f"Unexpected task type {task_type}")
 
 
 def init_model_cb(

@@ -78,19 +75,3 @@ class LlamaModelParallelGenerator:
 
     def __exit__(self, exc_type, exc_value, exc_traceback):
         self.group.stop()
-
-    def completion(
-        self,
-        request_batch: list[CompletionRequestWithRawContent],
-    ) -> Generator:
-        req_obj = deepcopy(request_batch)
-        gen = self.group.run_inference(("completion", req_obj))
-        yield from gen
-
-    def chat_completion(
-        self,
-        request_batch: list[ChatCompletionRequestWithRawContent],
-    ) -> Generator:
-        req_obj = deepcopy(request_batch)
-        gen = self.group.run_inference(("chat_completion", req_obj))
-        yield from gen