llama-stack 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- llama_stack/__init__.py +0 -5
- llama_stack/cli/llama.py +3 -3
- llama_stack/cli/stack/_list_deps.py +12 -23
- llama_stack/cli/stack/list_stacks.py +37 -18
- llama_stack/cli/stack/run.py +121 -11
- llama_stack/cli/stack/utils.py +0 -127
- llama_stack/core/access_control/access_control.py +69 -28
- llama_stack/core/access_control/conditions.py +15 -5
- llama_stack/core/admin.py +267 -0
- llama_stack/core/build.py +6 -74
- llama_stack/core/client.py +1 -1
- llama_stack/core/configure.py +6 -6
- llama_stack/core/conversations/conversations.py +28 -25
- llama_stack/core/datatypes.py +271 -79
- llama_stack/core/distribution.py +15 -16
- llama_stack/core/external.py +3 -3
- llama_stack/core/inspect.py +98 -15
- llama_stack/core/library_client.py +73 -61
- llama_stack/core/prompts/prompts.py +12 -11
- llama_stack/core/providers.py +17 -11
- llama_stack/core/resolver.py +65 -56
- llama_stack/core/routers/__init__.py +8 -12
- llama_stack/core/routers/datasets.py +1 -4
- llama_stack/core/routers/eval_scoring.py +7 -4
- llama_stack/core/routers/inference.py +55 -271
- llama_stack/core/routers/safety.py +52 -24
- llama_stack/core/routers/tool_runtime.py +6 -48
- llama_stack/core/routers/vector_io.py +130 -51
- llama_stack/core/routing_tables/benchmarks.py +24 -20
- llama_stack/core/routing_tables/common.py +1 -4
- llama_stack/core/routing_tables/datasets.py +22 -22
- llama_stack/core/routing_tables/models.py +119 -6
- llama_stack/core/routing_tables/scoring_functions.py +7 -7
- llama_stack/core/routing_tables/shields.py +1 -2
- llama_stack/core/routing_tables/toolgroups.py +17 -7
- llama_stack/core/routing_tables/vector_stores.py +51 -16
- llama_stack/core/server/auth.py +5 -3
- llama_stack/core/server/auth_providers.py +36 -20
- llama_stack/core/server/fastapi_router_registry.py +84 -0
- llama_stack/core/server/quota.py +2 -2
- llama_stack/core/server/routes.py +79 -27
- llama_stack/core/server/server.py +102 -87
- llama_stack/core/stack.py +201 -58
- llama_stack/core/storage/datatypes.py +26 -3
- llama_stack/{providers/utils → core/storage}/kvstore/__init__.py +2 -0
- llama_stack/{providers/utils → core/storage}/kvstore/kvstore.py +55 -24
- llama_stack/{providers/utils → core/storage}/kvstore/mongodb/mongodb.py +13 -10
- llama_stack/{providers/utils → core/storage}/kvstore/postgres/postgres.py +28 -17
- llama_stack/{providers/utils → core/storage}/kvstore/redis/redis.py +41 -16
- llama_stack/{providers/utils → core/storage}/kvstore/sqlite/sqlite.py +1 -1
- llama_stack/core/storage/sqlstore/__init__.py +17 -0
- llama_stack/{providers/utils → core/storage}/sqlstore/authorized_sqlstore.py +69 -49
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlalchemy_sqlstore.py +47 -17
- llama_stack/{providers/utils → core/storage}/sqlstore/sqlstore.py +25 -8
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/config.py +8 -2
- llama_stack/core/utils/config_resolution.py +32 -29
- llama_stack/core/utils/context.py +4 -10
- llama_stack/core/utils/exec.py +9 -0
- llama_stack/core/utils/type_inspection.py +45 -0
- llama_stack/distributions/dell/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/dell/dell.py +2 -2
- llama_stack/distributions/dell/run-with-safety.yaml +3 -2
- llama_stack/distributions/meta-reference-gpu/{run.yaml → config.yaml} +3 -2
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +2 -2
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +3 -2
- llama_stack/distributions/nvidia/{run.yaml → config.yaml} +4 -4
- llama_stack/distributions/nvidia/nvidia.py +1 -1
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -4
- llama_stack/{apis/datasetio → distributions/oci}/__init__.py +1 -1
- llama_stack/distributions/oci/config.yaml +134 -0
- llama_stack/distributions/oci/oci.py +108 -0
- llama_stack/distributions/open-benchmark/{run.yaml → config.yaml} +5 -4
- llama_stack/distributions/open-benchmark/open_benchmark.py +2 -3
- llama_stack/distributions/postgres-demo/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/starter/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/starter/starter.py +8 -5
- llama_stack/distributions/starter-gpu/{run.yaml → config.yaml} +64 -13
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +64 -13
- llama_stack/distributions/template.py +13 -69
- llama_stack/distributions/watsonx/{run.yaml → config.yaml} +4 -3
- llama_stack/distributions/watsonx/watsonx.py +1 -1
- llama_stack/log.py +28 -11
- llama_stack/models/llama/checkpoint.py +6 -6
- llama_stack/models/llama/hadamard_utils.py +2 -0
- llama_stack/models/llama/llama3/generation.py +3 -1
- llama_stack/models/llama/llama3/interface.py +2 -5
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +3 -3
- llama_stack/models/llama/llama3/multimodal/image_transform.py +6 -6
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +1 -1
- llama_stack/models/llama/llama3/tool_utils.py +2 -1
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +1 -1
- llama_stack/providers/inline/agents/meta_reference/__init__.py +3 -3
- llama_stack/providers/inline/agents/meta_reference/agents.py +44 -261
- llama_stack/providers/inline/agents/meta_reference/config.py +6 -1
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +207 -57
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +308 -47
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +162 -96
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +23 -8
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +201 -33
- llama_stack/providers/inline/agents/meta_reference/safety.py +8 -13
- llama_stack/providers/inline/batches/reference/__init__.py +2 -4
- llama_stack/providers/inline/batches/reference/batches.py +78 -60
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +2 -5
- llama_stack/providers/inline/eval/meta_reference/eval.py +16 -61
- llama_stack/providers/inline/files/localfs/files.py +37 -28
- llama_stack/providers/inline/inference/meta_reference/config.py +2 -2
- llama_stack/providers/inline/inference/meta_reference/generators.py +50 -60
- llama_stack/providers/inline/inference/meta_reference/inference.py +403 -19
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +7 -26
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +2 -12
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +10 -15
- llama_stack/providers/inline/post_training/common/validator.py +1 -5
- llama_stack/providers/inline/post_training/huggingface/post_training.py +8 -8
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +18 -10
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +12 -9
- llama_stack/providers/inline/post_training/huggingface/utils.py +27 -6
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +1 -1
- llama_stack/providers/inline/post_training/torchtune/post_training.py +8 -8
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +16 -16
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +13 -9
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +18 -15
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +9 -9
- llama_stack/providers/inline/scoring/basic/scoring.py +6 -13
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +2 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +1 -2
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +12 -15
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +2 -2
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +7 -14
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +2 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +1 -2
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +1 -3
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +1 -1
- llama_stack/providers/inline/tool_runtime/rag/config.py +8 -1
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +7 -6
- llama_stack/providers/inline/tool_runtime/rag/memory.py +64 -48
- llama_stack/providers/inline/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/chroma/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/config.py +1 -1
- llama_stack/providers/inline/vector_io/faiss/faiss.py +43 -28
- llama_stack/providers/inline/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/milvus/config.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +1 -1
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +40 -33
- llama_stack/providers/registry/agents.py +7 -3
- llama_stack/providers/registry/batches.py +1 -1
- llama_stack/providers/registry/datasetio.py +1 -1
- llama_stack/providers/registry/eval.py +1 -1
- llama_stack/{apis/datasets/__init__.py → providers/registry/file_processors.py} +5 -1
- llama_stack/providers/registry/files.py +11 -2
- llama_stack/providers/registry/inference.py +22 -3
- llama_stack/providers/registry/post_training.py +1 -1
- llama_stack/providers/registry/safety.py +1 -1
- llama_stack/providers/registry/scoring.py +1 -1
- llama_stack/providers/registry/tool_runtime.py +2 -2
- llama_stack/providers/registry/vector_io.py +7 -7
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +2 -5
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +1 -4
- llama_stack/providers/remote/eval/nvidia/eval.py +15 -9
- llama_stack/providers/remote/files/openai/__init__.py +19 -0
- llama_stack/providers/remote/files/openai/config.py +28 -0
- llama_stack/providers/remote/files/openai/files.py +253 -0
- llama_stack/providers/remote/files/s3/files.py +52 -30
- llama_stack/providers/remote/inference/anthropic/anthropic.py +2 -1
- llama_stack/providers/remote/inference/anthropic/config.py +1 -1
- llama_stack/providers/remote/inference/azure/azure.py +1 -3
- llama_stack/providers/remote/inference/azure/config.py +8 -7
- llama_stack/providers/remote/inference/bedrock/__init__.py +1 -1
- llama_stack/providers/remote/inference/bedrock/bedrock.py +82 -105
- llama_stack/providers/remote/inference/bedrock/config.py +24 -3
- llama_stack/providers/remote/inference/cerebras/cerebras.py +5 -5
- llama_stack/providers/remote/inference/cerebras/config.py +12 -5
- llama_stack/providers/remote/inference/databricks/config.py +13 -6
- llama_stack/providers/remote/inference/databricks/databricks.py +16 -6
- llama_stack/providers/remote/inference/fireworks/config.py +5 -5
- llama_stack/providers/remote/inference/fireworks/fireworks.py +1 -1
- llama_stack/providers/remote/inference/gemini/config.py +1 -1
- llama_stack/providers/remote/inference/gemini/gemini.py +13 -14
- llama_stack/providers/remote/inference/groq/config.py +5 -5
- llama_stack/providers/remote/inference/groq/groq.py +1 -1
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +5 -5
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +8 -6
- llama_stack/providers/remote/inference/nvidia/__init__.py +1 -1
- llama_stack/providers/remote/inference/nvidia/config.py +21 -11
- llama_stack/providers/remote/inference/nvidia/nvidia.py +115 -3
- llama_stack/providers/remote/inference/nvidia/utils.py +1 -1
- llama_stack/providers/remote/inference/oci/__init__.py +17 -0
- llama_stack/providers/remote/inference/oci/auth.py +79 -0
- llama_stack/providers/remote/inference/oci/config.py +75 -0
- llama_stack/providers/remote/inference/oci/oci.py +162 -0
- llama_stack/providers/remote/inference/ollama/config.py +7 -5
- llama_stack/providers/remote/inference/ollama/ollama.py +17 -8
- llama_stack/providers/remote/inference/openai/config.py +4 -4
- llama_stack/providers/remote/inference/openai/openai.py +1 -1
- llama_stack/providers/remote/inference/passthrough/__init__.py +2 -2
- llama_stack/providers/remote/inference/passthrough/config.py +5 -10
- llama_stack/providers/remote/inference/passthrough/passthrough.py +97 -75
- llama_stack/providers/remote/inference/runpod/config.py +12 -5
- llama_stack/providers/remote/inference/runpod/runpod.py +2 -20
- llama_stack/providers/remote/inference/sambanova/config.py +5 -5
- llama_stack/providers/remote/inference/sambanova/sambanova.py +1 -1
- llama_stack/providers/remote/inference/tgi/config.py +7 -6
- llama_stack/providers/remote/inference/tgi/tgi.py +19 -11
- llama_stack/providers/remote/inference/together/config.py +5 -5
- llama_stack/providers/remote/inference/together/together.py +15 -12
- llama_stack/providers/remote/inference/vertexai/config.py +1 -1
- llama_stack/providers/remote/inference/vllm/config.py +5 -5
- llama_stack/providers/remote/inference/vllm/vllm.py +13 -14
- llama_stack/providers/remote/inference/watsonx/config.py +4 -4
- llama_stack/providers/remote/inference/watsonx/watsonx.py +21 -94
- llama_stack/providers/remote/post_training/nvidia/post_training.py +4 -4
- llama_stack/providers/remote/post_training/nvidia/utils.py +1 -1
- llama_stack/providers/remote/safety/bedrock/bedrock.py +6 -6
- llama_stack/providers/remote/safety/bedrock/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/config.py +1 -1
- llama_stack/providers/remote/safety/nvidia/nvidia.py +11 -5
- llama_stack/providers/remote/safety/sambanova/config.py +1 -1
- llama_stack/providers/remote/safety/sambanova/sambanova.py +6 -6
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +12 -7
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +8 -2
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +57 -15
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +11 -6
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +11 -6
- llama_stack/providers/remote/vector_io/chroma/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/chroma/chroma.py +125 -20
- llama_stack/providers/remote/vector_io/chroma/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/config.py +1 -1
- llama_stack/providers/remote/vector_io/milvus/milvus.py +27 -21
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/config.py +1 -1
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +26 -18
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/config.py +1 -1
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +141 -24
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/config.py +1 -1
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +26 -21
- llama_stack/providers/utils/common/data_schema_validator.py +1 -5
- llama_stack/providers/utils/files/form_data.py +1 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +1 -1
- llama_stack/providers/utils/inference/inference_store.py +12 -21
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +79 -79
- llama_stack/providers/utils/inference/model_registry.py +1 -3
- llama_stack/providers/utils/inference/openai_compat.py +44 -1171
- llama_stack/providers/utils/inference/openai_mixin.py +68 -42
- llama_stack/providers/utils/inference/prompt_adapter.py +50 -265
- llama_stack/providers/utils/inference/stream_utils.py +23 -0
- llama_stack/providers/utils/memory/__init__.py +2 -0
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +181 -84
- llama_stack/providers/utils/memory/vector_store.py +39 -38
- llama_stack/providers/utils/pagination.py +1 -1
- llama_stack/providers/utils/responses/responses_store.py +15 -25
- llama_stack/providers/utils/scoring/aggregation_utils.py +1 -2
- llama_stack/providers/utils/scoring/base_scoring_fn.py +1 -2
- llama_stack/providers/utils/tools/mcp.py +93 -11
- llama_stack/telemetry/constants.py +27 -0
- llama_stack/telemetry/helpers.py +43 -0
- llama_stack/testing/api_recorder.py +25 -16
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/METADATA +56 -131
- llama_stack-0.4.0.dist-info/RECORD +588 -0
- llama_stack-0.4.0.dist-info/top_level.txt +2 -0
- llama_stack_api/__init__.py +945 -0
- llama_stack_api/admin/__init__.py +45 -0
- llama_stack_api/admin/api.py +72 -0
- llama_stack_api/admin/fastapi_routes.py +117 -0
- llama_stack_api/admin/models.py +113 -0
- llama_stack_api/agents.py +173 -0
- llama_stack_api/batches/__init__.py +40 -0
- llama_stack_api/batches/api.py +53 -0
- llama_stack_api/batches/fastapi_routes.py +113 -0
- llama_stack_api/batches/models.py +78 -0
- llama_stack_api/benchmarks/__init__.py +43 -0
- llama_stack_api/benchmarks/api.py +39 -0
- llama_stack_api/benchmarks/fastapi_routes.py +109 -0
- llama_stack_api/benchmarks/models.py +109 -0
- {llama_stack/apis → llama_stack_api}/common/content_types.py +1 -43
- {llama_stack/apis → llama_stack_api}/common/errors.py +0 -8
- {llama_stack/apis → llama_stack_api}/common/job_types.py +1 -1
- llama_stack_api/common/responses.py +77 -0
- {llama_stack/apis → llama_stack_api}/common/training_types.py +1 -1
- {llama_stack/apis → llama_stack_api}/common/type_system.py +2 -14
- llama_stack_api/connectors.py +146 -0
- {llama_stack/apis/conversations → llama_stack_api}/conversations.py +23 -39
- {llama_stack/apis/datasetio → llama_stack_api}/datasetio.py +4 -8
- llama_stack_api/datasets/__init__.py +61 -0
- llama_stack_api/datasets/api.py +35 -0
- llama_stack_api/datasets/fastapi_routes.py +104 -0
- llama_stack_api/datasets/models.py +152 -0
- {llama_stack/providers → llama_stack_api}/datatypes.py +166 -10
- {llama_stack/apis/eval → llama_stack_api}/eval.py +8 -40
- llama_stack_api/file_processors/__init__.py +27 -0
- llama_stack_api/file_processors/api.py +64 -0
- llama_stack_api/file_processors/fastapi_routes.py +78 -0
- llama_stack_api/file_processors/models.py +42 -0
- llama_stack_api/files/__init__.py +35 -0
- llama_stack_api/files/api.py +51 -0
- llama_stack_api/files/fastapi_routes.py +124 -0
- llama_stack_api/files/models.py +107 -0
- {llama_stack/apis/inference → llama_stack_api}/inference.py +90 -194
- llama_stack_api/inspect_api/__init__.py +37 -0
- llama_stack_api/inspect_api/api.py +25 -0
- llama_stack_api/inspect_api/fastapi_routes.py +76 -0
- llama_stack_api/inspect_api/models.py +28 -0
- {llama_stack/apis/agents → llama_stack_api/internal}/__init__.py +3 -1
- llama_stack/providers/utils/kvstore/api.py → llama_stack_api/internal/kvstore.py +5 -0
- llama_stack_api/internal/sqlstore.py +79 -0
- {llama_stack/apis/models → llama_stack_api}/models.py +11 -9
- {llama_stack/apis/agents → llama_stack_api}/openai_responses.py +184 -27
- {llama_stack/apis/post_training → llama_stack_api}/post_training.py +7 -11
- {llama_stack/apis/prompts → llama_stack_api}/prompts.py +3 -4
- llama_stack_api/providers/__init__.py +33 -0
- llama_stack_api/providers/api.py +16 -0
- llama_stack_api/providers/fastapi_routes.py +57 -0
- llama_stack_api/providers/models.py +24 -0
- {llama_stack/apis/tools → llama_stack_api}/rag_tool.py +2 -52
- {llama_stack/apis → llama_stack_api}/resource.py +1 -1
- llama_stack_api/router_utils.py +160 -0
- {llama_stack/apis/safety → llama_stack_api}/safety.py +6 -9
- {llama_stack → llama_stack_api}/schema_utils.py +94 -4
- {llama_stack/apis/scoring → llama_stack_api}/scoring.py +3 -3
- {llama_stack/apis/scoring_functions → llama_stack_api}/scoring_functions.py +9 -6
- {llama_stack/apis/shields → llama_stack_api}/shields.py +6 -7
- {llama_stack/apis/tools → llama_stack_api}/tools.py +26 -21
- {llama_stack/apis/vector_io → llama_stack_api}/vector_io.py +133 -152
- {llama_stack/apis/vector_stores → llama_stack_api}/vector_stores.py +1 -1
- llama_stack/apis/agents/agents.py +0 -894
- llama_stack/apis/batches/__init__.py +0 -9
- llama_stack/apis/batches/batches.py +0 -100
- llama_stack/apis/benchmarks/__init__.py +0 -7
- llama_stack/apis/benchmarks/benchmarks.py +0 -108
- llama_stack/apis/common/responses.py +0 -36
- llama_stack/apis/conversations/__init__.py +0 -31
- llama_stack/apis/datasets/datasets.py +0 -251
- llama_stack/apis/datatypes.py +0 -160
- llama_stack/apis/eval/__init__.py +0 -7
- llama_stack/apis/files/__init__.py +0 -7
- llama_stack/apis/files/files.py +0 -199
- llama_stack/apis/inference/__init__.py +0 -7
- llama_stack/apis/inference/event_logger.py +0 -43
- llama_stack/apis/inspect/__init__.py +0 -7
- llama_stack/apis/inspect/inspect.py +0 -94
- llama_stack/apis/models/__init__.py +0 -7
- llama_stack/apis/post_training/__init__.py +0 -7
- llama_stack/apis/prompts/__init__.py +0 -9
- llama_stack/apis/providers/__init__.py +0 -7
- llama_stack/apis/providers/providers.py +0 -69
- llama_stack/apis/safety/__init__.py +0 -7
- llama_stack/apis/scoring/__init__.py +0 -7
- llama_stack/apis/scoring_functions/__init__.py +0 -7
- llama_stack/apis/shields/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/__init__.py +0 -7
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +0 -77
- llama_stack/apis/telemetry/__init__.py +0 -7
- llama_stack/apis/telemetry/telemetry.py +0 -423
- llama_stack/apis/tools/__init__.py +0 -8
- llama_stack/apis/vector_io/__init__.py +0 -7
- llama_stack/apis/vector_stores/__init__.py +0 -7
- llama_stack/core/server/tracing.py +0 -80
- llama_stack/core/ui/app.py +0 -55
- llama_stack/core/ui/modules/__init__.py +0 -5
- llama_stack/core/ui/modules/api.py +0 -32
- llama_stack/core/ui/modules/utils.py +0 -42
- llama_stack/core/ui/page/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/__init__.py +0 -5
- llama_stack/core/ui/page/distribution/datasets.py +0 -18
- llama_stack/core/ui/page/distribution/eval_tasks.py +0 -20
- llama_stack/core/ui/page/distribution/models.py +0 -18
- llama_stack/core/ui/page/distribution/providers.py +0 -27
- llama_stack/core/ui/page/distribution/resources.py +0 -48
- llama_stack/core/ui/page/distribution/scoring_functions.py +0 -18
- llama_stack/core/ui/page/distribution/shields.py +0 -19
- llama_stack/core/ui/page/evaluations/__init__.py +0 -5
- llama_stack/core/ui/page/evaluations/app_eval.py +0 -143
- llama_stack/core/ui/page/evaluations/native_eval.py +0 -253
- llama_stack/core/ui/page/playground/__init__.py +0 -5
- llama_stack/core/ui/page/playground/chat.py +0 -130
- llama_stack/core/ui/page/playground/tools.py +0 -352
- llama_stack/distributions/dell/build.yaml +0 -33
- llama_stack/distributions/meta-reference-gpu/build.yaml +0 -32
- llama_stack/distributions/nvidia/build.yaml +0 -29
- llama_stack/distributions/open-benchmark/build.yaml +0 -36
- llama_stack/distributions/postgres-demo/__init__.py +0 -7
- llama_stack/distributions/postgres-demo/build.yaml +0 -23
- llama_stack/distributions/postgres-demo/postgres_demo.py +0 -125
- llama_stack/distributions/starter/build.yaml +0 -61
- llama_stack/distributions/starter-gpu/build.yaml +0 -61
- llama_stack/distributions/watsonx/build.yaml +0 -33
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +0 -1024
- llama_stack/providers/inline/agents/meta_reference/persistence.py +0 -228
- llama_stack/providers/inline/telemetry/__init__.py +0 -5
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +0 -21
- llama_stack/providers/inline/telemetry/meta_reference/config.py +0 -47
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +0 -252
- llama_stack/providers/remote/inference/bedrock/models.py +0 -29
- llama_stack/providers/utils/kvstore/sqlite/config.py +0 -20
- llama_stack/providers/utils/sqlstore/__init__.py +0 -5
- llama_stack/providers/utils/sqlstore/api.py +0 -128
- llama_stack/providers/utils/telemetry/__init__.py +0 -5
- llama_stack/providers/utils/telemetry/trace_protocol.py +0 -142
- llama_stack/providers/utils/telemetry/tracing.py +0 -384
- llama_stack/strong_typing/__init__.py +0 -19
- llama_stack/strong_typing/auxiliary.py +0 -228
- llama_stack/strong_typing/classdef.py +0 -440
- llama_stack/strong_typing/core.py +0 -46
- llama_stack/strong_typing/deserializer.py +0 -877
- llama_stack/strong_typing/docstring.py +0 -409
- llama_stack/strong_typing/exception.py +0 -23
- llama_stack/strong_typing/inspection.py +0 -1085
- llama_stack/strong_typing/mapping.py +0 -40
- llama_stack/strong_typing/name.py +0 -182
- llama_stack/strong_typing/schema.py +0 -792
- llama_stack/strong_typing/serialization.py +0 -97
- llama_stack/strong_typing/serializer.py +0 -500
- llama_stack/strong_typing/slots.py +0 -27
- llama_stack/strong_typing/topological.py +0 -89
- llama_stack/ui/node_modules/flatted/python/flatted.py +0 -149
- llama_stack-0.3.4.dist-info/RECORD +0 -625
- llama_stack-0.3.4.dist-info/top_level.txt +0 -1
- /llama_stack/{providers/utils → core/storage}/kvstore/config.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/mongodb/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/postgres/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/redis/__init__.py +0 -0
- /llama_stack/{providers/utils → core/storage}/kvstore/sqlite/__init__.py +0 -0
- /llama_stack/{apis → providers/inline/file_processor}/__init__.py +0 -0
- /llama_stack/{apis/common → telemetry}/__init__.py +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/WHEEL +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.3.4.dist-info → llama_stack-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {llama_stack/core/ui → llama_stack_api/common}/__init__.py +0 -0
- {llama_stack/strong_typing → llama_stack_api}/py.typed +0 -0
- {llama_stack/apis → llama_stack_api}/version.py +0 -0
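Two mechanical migrations account for most of the churn above: the API surface moved out of `llama_stack.apis.*` (and `llama_stack.schema_utils`) into the new top-level `llama_stack_api` package (hence the second entry in `top_level.txt`), and distribution run configurations were renamed from `run.yaml` to `config.yaml`. Downstream code that imported API types directly will likely need updates along these lines; a sketch inferred from the file moves listed above, not an official migration guide:

```python
# 0.3.4-era imports (modules deleted in 0.4.0):
# from llama_stack.apis.inference import OpenAIEmbeddingsResponse
# from llama_stack.apis.models import Model
# from llama_stack.schema_utils import json_schema_type

# 0.4.0 equivalents, following the renames in this diff:
from llama_stack_api import Model, OpenAIEmbeddingsResponse, json_schema_type
```

Representative hunks from the inference, post-training, and safety providers follow.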
--- a/llama_stack/providers/remote/inference/tgi/config.py
+++ b/llama_stack/providers/remote/inference/tgi/config.py
@@ -5,28 +5,29 @@
 # the root directory of this source tree.
 
 
-from pydantic import BaseModel, Field, SecretStr
+from pydantic import BaseModel, Field, HttpUrl, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
-from llama_stack.schema_utils import json_schema_type
+from llama_stack_api import json_schema_type
 
 
 @json_schema_type
 class TGIImplConfig(RemoteInferenceProviderConfig):
     auth_credential: SecretStr | None = Field(default=None, exclude=True)
 
-    url: str = Field(
-        description="The URL for the TGI serving endpoint",
+    base_url: HttpUrl | None = Field(
+        default=None,
+        description="The URL for the TGI serving endpoint (should include /v1 path)",
     )
 
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.TGI_URL:=}",
+        base_url: str = "${env.TGI_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
         }
 
 
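The `url: str` field becomes `base_url: HttpUrl | None` here (and in the other provider configs below), so URLs are now validated when the config is loaded rather than failing later at request time. A minimal sketch of the behavioral difference, using a stand-in model rather than the actual `TGIImplConfig`:

```python
from pydantic import BaseModel, Field, HttpUrl, ValidationError

class ConfigSketch(BaseModel):  # stand-in, not the real TGIImplConfig
    base_url: HttpUrl | None = Field(default=None)

cfg = ConfigSketch(base_url="http://localhost:8080/v1")  # plain strings are coerced
print(str(cfg.base_url))  # the parsed value is a Url object, hence the str(...) calls below

try:
    ConfigSketch(base_url="not-a-url")  # rejected at load time
except ValidationError as e:
    print(e.error_count(), "validation error")
```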
--- a/llama_stack/providers/remote/inference/tgi/tgi.py
+++ b/llama_stack/providers/remote/inference/tgi/tgi.py
@@ -8,14 +8,14 @@
 from collections.abc import Iterable
 
 from huggingface_hub import AsyncInferenceClient, HfApi
-from pydantic import SecretStr
+from pydantic import HttpUrl, SecretStr
 
-from llama_stack.apis.inference import (
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+from llama_stack_api import (
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
 )
-from llama_stack.log import get_logger
-from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from .config import InferenceAPIImplConfig, InferenceEndpointImplConfig, TGIImplConfig
 
@@ -23,7 +23,7 @@ log = get_logger(name=__name__, category="inference::tgi")
 
 
 class _HfAdapter(OpenAIMixin):
-    url: str
+    base_url: HttpUrl
     api_key: SecretStr
 
     hf_client: AsyncInferenceClient
@@ -36,7 +36,7 @@ class _HfAdapter(OpenAIMixin):
         return "NO KEY REQUIRED"
 
     def get_base_url(self):
-        return self.url
+        return self.base_url
 
     async def list_provider_model_ids(self) -> Iterable[str]:
         return [self.model_id]
@@ -50,14 +50,22 @@ class _HfAdapter(OpenAIMixin):
 
 class TGIAdapter(_HfAdapter):
     async def initialize(self, config: TGIImplConfig) -> None:
-        if not config.url:
-            raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
-        log.info(f"Initializing TGI client with url={config.url}")
-        self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
+        if not config.base_url:
+            raise ValueError(
+                "You must provide a URL in config.yaml (or via the TGI_URL environment variable) to use TGI."
+            )
+        log.info(f"Initializing TGI client with url={config.base_url}")
+        # Extract base URL without /v1 for HF client initialization
+        base_url_str = str(config.base_url).rstrip("/")
+        if base_url_str.endswith("/v1"):
+            base_url_for_client = base_url_str[:-3]
+        else:
+            base_url_for_client = base_url_str
+        self.hf_client = AsyncInferenceClient(model=base_url_for_client, provider="hf-inference")
         endpoint_info = await self.hf_client.get_endpoint_info()
         self.max_tokens = endpoint_info["max_total_tokens"]
         self.model_id = endpoint_info["model_id"]
-        self.url = config.url
+        self.base_url = config.base_url
         self.api_key = SecretStr("NO_KEY")
 
 
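`initialize` now derives two URLs from one setting: the OpenAI-compatible client keeps the `/v1` suffix, while the HuggingFace `AsyncInferenceClient` is handed the server root. The suffix handling above, extracted here for illustration with a hypothetical helper name:

```python
def hf_client_base(url: str) -> str:
    # Mirror of the /v1 stripping in TGIAdapter.initialize above
    s = url.rstrip("/")
    return s[:-3] if s.endswith("/v1") else s

assert hf_client_base("http://localhost:8080/v1") == "http://localhost:8080"
assert hf_client_base("http://localhost:8080/") == "http://localhost:8080"
```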
--- a/llama_stack/providers/remote/inference/together/config.py
+++ b/llama_stack/providers/remote/inference/together/config.py
@@ -6,22 +6,22 @@
 
 from typing import Any
 
-from pydantic import Field
+from pydantic import Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
-from llama_stack.schema_utils import json_schema_type
+from llama_stack_api import json_schema_type
 
 
 @json_schema_type
 class TogetherImplConfig(RemoteInferenceProviderConfig):
-    url: str = Field(
-        default="https://api.together.xyz/v1",
+    base_url: HttpUrl | None = Field(
+        default=HttpUrl("https://api.together.xyz/v1"),
         description="The URL for the Together AI server",
     )
 
     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "https://api.together.xyz/v1",
+            "base_url": "https://api.together.xyz/v1",
             "api_key": "${env.TOGETHER_API_KEY:=}",
         }
--- a/llama_stack/providers/remote/inference/together/together.py
+++ b/llama_stack/providers/remote/inference/together/together.py
@@ -6,19 +6,19 @@
 
 
 from collections.abc import Iterable
+from typing import Any, cast
 
-from together import AsyncTogether
-from together.constants import BASE_URL
+from together import AsyncTogether  # type: ignore[import-untyped]
 
-from llama_stack.apis.inference import (
-    OpenAIEmbeddingsRequestWithExtraBody,
-    OpenAIEmbeddingsResponse,
-)
-from llama_stack.apis.inference.inference import OpenAIEmbeddingUsage
-from llama_stack.apis.models import Model
 from llama_stack.core.request_headers import NeedsRequestProviderData
 from llama_stack.log import get_logger
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+from llama_stack_api import (
+    Model,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
+)
 
 from .config import TogetherImplConfig
 
@@ -41,7 +41,7 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
     provider_data_api_key_field: str = "together_api_key"
 
     def get_base_url(self):
-        return BASE_URL
+        return str(self.config.base_url)
 
     def _get_client(self) -> AsyncTogether:
         together_api_key = None
@@ -81,10 +81,11 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
         if params.dimensions is not None:
             raise ValueError("Together's embeddings endpoint does not support dimensions param.")
 
+        # Cast encoding_format to match OpenAI SDK's expected Literal type
         response = await self.client.embeddings.create(
             model=await self._get_provider_model_id(params.model),
             input=params.input,
-            encoding_format=params.encoding_format,
+            encoding_format=cast(Any, params.encoding_format),
         )
 
         response.model = (
@@ -97,6 +98,8 @@ class TogetherInferenceAdapter(OpenAIMixin, NeedsRequestProviderData):
             logger.warning(
                 f"Together's embedding endpoint for {params.model} did not return usage information, substituting -1s."
             )
-            response.usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
+            # Cast to allow monkey-patching the response object
+            response.usage = cast(Any, OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1))
 
-        return response
+        # Together's CreateEmbeddingResponse is compatible with OpenAIEmbeddingsResponse after monkey-patching
+        return cast(OpenAIEmbeddingsResponse, response)
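The new `cast(...)` calls are typing-only: the OpenAI SDK's stubs declare `encoding_format` as a `Literal["float", "base64"]` where the request model carries a plain string, and the Together response object is monkey-patched before being returned. `cast` changes nothing at runtime, as a quick check shows:

```python
from typing import Any, cast

value = "float"
assert cast(Any, value) is value  # identity: cast is erased at runtime
```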
--- a/llama_stack/providers/remote/inference/vertexai/config.py
+++ b/llama_stack/providers/remote/inference/vertexai/config.py
@@ -9,7 +9,7 @@ from typing import Any
 from pydantic import BaseModel, Field, SecretStr
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
-from llama_stack.schema_utils import json_schema_type
+from llama_stack_api import json_schema_type
 
 
 class VertexAIProviderDataValidator(BaseModel):
--- a/llama_stack/providers/remote/inference/vllm/config.py
+++ b/llama_stack/providers/remote/inference/vllm/config.py
@@ -6,15 +6,15 @@
 
 from pathlib import Path
 
-from pydantic import Field, SecretStr, field_validator
+from pydantic import Field, HttpUrl, SecretStr, field_validator
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
-from llama_stack.schema_utils import json_schema_type
+from llama_stack_api import json_schema_type
 
 
 @json_schema_type
 class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default=None,
         description="The URL for the vLLM model serving endpoint",
     )
@@ -48,11 +48,11 @@ class VLLMInferenceAdapterConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(
         cls,
-        url: str = "${env.VLLM_URL:=}",
+        base_url: str = "${env.VLLM_URL:=}",
         **kwargs,
     ):
         return {
-            "url": url,
+            "base_url": base_url,
             "max_tokens": "${env.VLLM_MAX_TOKENS:=4096}",
             "api_token": "${env.VLLM_API_TOKEN:=fake}",
             "tls_verify": "${env.VLLM_TLS_VERIFY:=true}",
--- a/llama_stack/providers/remote/inference/vllm/vllm.py
+++ b/llama_stack/providers/remote/inference/vllm/vllm.py
@@ -7,22 +7,18 @@ from collections.abc import AsyncIterator
 from urllib.parse import urljoin
 
 import httpx
-from openai.types.chat.chat_completion_chunk import (
-    ChatCompletionChunk as OpenAIChatCompletionChunk,
-)
 from pydantic import ConfigDict
 
-from llama_stack.apis.inference import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionRequestWithExtraBody,
-    ToolChoice,
-)
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import (
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+from llama_stack_api import (
     HealthResponse,
     HealthStatus,
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    ToolChoice,
 )
-from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
 
 from .config import VLLMInferenceAdapterConfig
 
@@ -34,6 +30,9 @@ class VLLMInferenceAdapter(OpenAIMixin):
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
+    # vLLM does not support the stream_options parameter
+    supports_stream_options: bool = False
+
     provider_data_api_key_field: str = "vllm_api_token"
 
     def get_api_key(self) -> str | None:
@@ -43,14 +42,14 @@ class VLLMInferenceAdapter(OpenAIMixin):
 
     def get_base_url(self) -> str:
         """Get the base URL from config."""
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError("No base URL configured")
-        return self.config.url
+        return str(self.config.base_url)
 
     async def initialize(self) -> None:
-        if not self.config.url:
+        if not self.config.base_url:
             raise ValueError(
-                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
+                "You must provide a URL in config.yaml (or via the VLLM_URL environment variable) to use vLLM."
             )
 
     async def health(self) -> HealthResponse:
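The new `supports_stream_options = False` class attribute is a capability flag: since vLLM does not accept the `stream_options` parameter (per the comment in the diff), the adapter presumably opts out declaratively and lets `OpenAIMixin` skip it when assembling streaming requests. A sketch of that pattern under that assumption, not the actual mixin source:

```python
class MixinSketch:
    supports_stream_options: bool = True  # default: ask servers for streaming usage stats

    def stream_kwargs(self) -> dict:
        kwargs: dict = {"stream": True}
        if self.supports_stream_options:
            kwargs["stream_options"] = {"include_usage": True}
        return kwargs

class VLLMSketch(MixinSketch):
    supports_stream_options = False  # opt out without overriding the request path

print(VLLMSketch().stream_kwargs())  # {'stream': True}
```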
--- a/llama_stack/providers/remote/inference/watsonx/config.py
+++ b/llama_stack/providers/remote/inference/watsonx/config.py
@@ -7,10 +7,10 @@
 import os
 from typing import Any
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, HttpUrl
 
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
-from llama_stack.schema_utils import json_schema_type
+from llama_stack_api import json_schema_type
 
 
 class WatsonXProviderDataValidator(BaseModel):
@@ -23,7 +23,7 @@ class WatsonXProviderDataValidator(BaseModel):
 
 @json_schema_type
 class WatsonXConfig(RemoteInferenceProviderConfig):
-    url: str | None = Field(
+    base_url: HttpUrl | None = Field(
         default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
         description="A base url for accessing the watsonx.ai",
     )
@@ -39,7 +39,7 @@ class WatsonXConfig(RemoteInferenceProviderConfig):
     @classmethod
     def sample_run_config(cls, **kwargs) -> dict[str, Any]:
         return {
-            "url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
+            "base_url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
            "api_key": "${env.WATSONX_API_KEY:=}",
             "project_id": "${env.WATSONX_PROJECT_ID:=}",
         }
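Unlike the other providers, watsonx resolves its default from the environment via `default_factory`, so `WATSONX_BASE_URL` is read each time a config object is constructed rather than once at import. A runnable illustration with a stand-in model, not the real `WatsonXConfig`:

```python
import os
from pydantic import BaseModel, Field

class WatsonXSketch(BaseModel):  # stand-in for illustration
    base_url: str | None = Field(
        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com")
    )

os.environ.pop("WATSONX_BASE_URL", None)
print(WatsonXSketch().base_url)  # https://us-south.ml.cloud.ibm.com (fallback)
os.environ["WATSONX_BASE_URL"] = "https://eu-de.ml.cloud.ibm.com"
print(WatsonXSketch().base_url)  # https://eu-de.ml.cloud.ibm.com (env wins)
```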
--- a/llama_stack/providers/remote/inference/watsonx/watsonx.py
+++ b/llama_stack/providers/remote/inference/watsonx/watsonx.py
@@ -10,23 +10,20 @@ from typing import Any
 import litellm
 import requests
 
-from llama_stack.apis.inference import (
+from llama_stack.log import get_logger
+from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack_api import (
+    Model,
+    ModelType,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
     OpenAIChatCompletionRequestWithExtraBody,
     OpenAIChatCompletionUsage,
-    OpenAICompletion,
     OpenAICompletionRequestWithExtraBody,
     OpenAIEmbeddingsRequestWithExtraBody,
     OpenAIEmbeddingsResponse,
 )
-from llama_stack.apis.models import Model
-from llama_stack.apis.models.models import ModelType
-from llama_stack.log import get_logger
-from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
-from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
-from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
-from llama_stack.providers.utils.telemetry.tracing import get_current_span
 
 logger = get_logger(name=__name__, category="providers::remote::watsonx")
 
@@ -48,57 +45,25 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
             openai_compat_api_base=self.get_base_url(),
         )
 
+    def _litellm_extra_request_params(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody | OpenAICompletionRequestWithExtraBody,
+    ) -> dict[str, Any]:
+        # These are watsonx-specific parameters used by LiteLLM.
+        return {"timeout": self.config.timeout, "project_id": self.config.project_id}
+
     async def openai_chat_completion(
         self,
         params: OpenAIChatCompletionRequestWithExtraBody,
     ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
         """
-        Override parent method to add watsonx-specific parameters.
+        Override parent method to inject usage object when missing.
+
         This works around a LiteLLM defect where usage block is sometimes dropped.
+        Note: request parameter construction (including telemetry-driven stream_options injection)
+        is handled by LiteLLMOpenAIMixin via _litellm_extra_request_params().
         """
-
-        # Add usage tracking for streaming when telemetry is active
-        stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
-            if stream_options is None:
-                stream_options = {"include_usage": True}
-            elif "include_usage" not in stream_options:
-                stream_options = {**stream_options, "include_usage": True}
-
-        model_obj = await self.model_store.get_model(params.model)
-
-        request_params = await prepare_openai_completion_params(
-            model=self.get_litellm_model_name(model_obj.provider_resource_id),
-            messages=params.messages,
-            frequency_penalty=params.frequency_penalty,
-            function_call=params.function_call,
-            functions=params.functions,
-            logit_bias=params.logit_bias,
-            logprobs=params.logprobs,
-            max_completion_tokens=params.max_completion_tokens,
-            max_tokens=params.max_tokens,
-            n=params.n,
-            parallel_tool_calls=params.parallel_tool_calls,
-            presence_penalty=params.presence_penalty,
-            response_format=params.response_format,
-            seed=params.seed,
-            stop=params.stop,
-            stream=params.stream,
-            stream_options=stream_options,
-            temperature=params.temperature,
-            tool_choice=params.tool_choice,
-            tools=params.tools,
-            top_logprobs=params.top_logprobs,
-            top_p=params.top_p,
-            user=params.user,
-            api_key=self.get_api_key(),
-            api_base=self.api_base,
-            # These are watsonx-specific parameters
-            timeout=self.config.timeout,
-            project_id=self.config.project_id,
-        )
-
-        result = await litellm.acompletion(**request_params)
+        result = await super().openai_chat_completion(params)
 
         # If not streaming, check and inject usage if missing
         if not params.stream:
@@ -175,44 +140,6 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
             logger.error(f"Error normalizing stream: {e}", exc_info=True)
             raise
 
-    async def openai_completion(
-        self,
-        params: OpenAICompletionRequestWithExtraBody,
-    ) -> OpenAICompletion:
-        """
-        Override parent method to add watsonx-specific parameters.
-        """
-        from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
-
-        model_obj = await self.model_store.get_model(params.model)
-
-        request_params = await prepare_openai_completion_params(
-            model=self.get_litellm_model_name(model_obj.provider_resource_id),
-            prompt=params.prompt,
-            best_of=params.best_of,
-            echo=params.echo,
-            frequency_penalty=params.frequency_penalty,
-            logit_bias=params.logit_bias,
-            logprobs=params.logprobs,
-            max_tokens=params.max_tokens,
-            n=params.n,
-            presence_penalty=params.presence_penalty,
-            seed=params.seed,
-            stop=params.stop,
-            stream=params.stream,
-            stream_options=params.stream_options,
-            temperature=params.temperature,
-            top_p=params.top_p,
-            user=params.user,
-            suffix=params.suffix,
-            api_key=self.get_api_key(),
-            api_base=self.api_base,
-            # These are watsonx-specific parameters
-            timeout=self.config.timeout,
-            project_id=self.config.project_id,
-        )
-        return await litellm.atext_completion(**request_params)
-
     async def openai_embeddings(
         self,
         params: OpenAIEmbeddingsRequestWithExtraBody,
@@ -238,8 +165,8 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         )
 
         # Convert response to OpenAI format
-        from llama_stack.apis.inference import OpenAIEmbeddingUsage
         from llama_stack.providers.utils.inference.litellm_openai_mixin import b64_encode_openai_embeddings_response
+        from llama_stack_api import OpenAIEmbeddingUsage
 
         data = b64_encode_openai_embeddings_response(response.data, params.encoding_format)
 
@@ -255,7 +182,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         )
 
     def get_base_url(self) -> str:
-        return self.config.url
+        return str(self.config.base_url)
 
     # Copied from OpenAIMixin
     async def check_model_availability(self, model: str) -> bool:
@@ -316,7 +243,7 @@ class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
         """
         Retrieves foundation model specifications from the watsonx.ai API.
         """
-        url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25"
+        url = f"{str(self.config.base_url)}/ml/v1/foundation_model_specs?version=2023-10-25"
         headers = {
             # Note that there is no authorization header. Listing models does not require authentication.
             "Content-Type": "application/json",
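The hand-built parameter lists in `openai_chat_completion` and `openai_completion` are gone: the adapter now defers to the parent implementation and contributes only its provider-specific extras through the new `_litellm_extra_request_params()` hook. A minimal sketch of that template-method shape, with assumed `LiteLLMOpenAIMixin` behavior and hypothetical config values, not code copied from the package:

```python
import asyncio
from typing import Any

class LiteLLMMixinSketch:
    async def openai_chat_completion(self, params: dict[str, Any]) -> dict[str, Any]:
        request = dict(params)                                       # common param construction
        request.update(self._litellm_extra_request_params(params))   # provider hook
        return request                                               # stand-in for litellm.acompletion(**request)

    def _litellm_extra_request_params(self, params: dict[str, Any]) -> dict[str, Any]:
        return {}  # default: no extras

class WatsonXSketch(LiteLLMMixinSketch):
    timeout, project_id = 60, "my-project"  # hypothetical config values

    def _litellm_extra_request_params(self, params: dict[str, Any]) -> dict[str, Any]:
        return {"timeout": self.timeout, "project_id": self.project_id}

print(asyncio.run(WatsonXSketch().openai_chat_completion({"model": "watsonx/granite"})))
```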
--- a/llama_stack/providers/remote/post_training/nvidia/post_training.py
+++ b/llama_stack/providers/remote/post_training/nvidia/post_training.py
@@ -10,7 +10,10 @@ from typing import Any, Literal
 import aiohttp
 from pydantic import BaseModel, ConfigDict
 
-from llama_stack.apis.post_training import (
+from llama_stack.providers.remote.post_training.nvidia.config import NvidiaPostTrainingConfig
+from llama_stack.providers.remote.post_training.nvidia.utils import warn_unsupported_params
+from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
+from llama_stack_api import (
     AlgorithmConfig,
     DPOAlignmentConfig,
     JobStatus,
@@ -19,9 +22,6 @@ from llama_stack.apis.post_training import (
     PostTrainingJobStatusResponse,
     TrainingConfig,
 )
-from llama_stack.providers.remote.post_training.nvidia.config import NvidiaPostTrainingConfig
-from llama_stack.providers.remote.post_training.nvidia.utils import warn_unsupported_params
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
 
 from .models import _MODEL_ENTRIES
 
--- a/llama_stack/providers/remote/post_training/nvidia/utils.py
+++ b/llama_stack/providers/remote/post_training/nvidia/utils.py
@@ -9,9 +9,9 @@ from typing import Any
 
 from pydantic import BaseModel
 
-from llama_stack.apis.post_training import TrainingConfig
 from llama_stack.log import get_logger
 from llama_stack.providers.remote.post_training.nvidia.config import SFTLoRADefaultConfig
+from llama_stack_api import TrainingConfig
 
 from .config import NvidiaPostTrainingConfig
 
--- a/llama_stack/providers/remote/safety/bedrock/bedrock.py
+++ b/llama_stack/providers/remote/safety/bedrock/bedrock.py
@@ -7,17 +7,17 @@
 import json
 from typing import Any
 
-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.safety import (
+from llama_stack.log import get_logger
+from llama_stack.providers.utils.bedrock.client import create_bedrock_client
+from llama_stack_api import (
+    OpenAIMessageParam,
     RunShieldResponse,
     Safety,
     SafetyViolation,
+    Shield,
+    ShieldsProtocolPrivate,
     ViolationLevel,
 )
-from llama_stack.apis.shields import Shield
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import ShieldsProtocolPrivate
-from llama_stack.providers.utils.bedrock.client import create_bedrock_client
 
 from .config import BedrockSafetyConfig
 
--- a/llama_stack/providers/remote/safety/nvidia/nvidia.py
+++ b/llama_stack/providers/remote/safety/nvidia/nvidia.py
@@ -8,11 +8,17 @@ from typing import Any
 
 import requests
 
-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.safety import ModerationObject, RunShieldResponse, Safety, SafetyViolation, ViolationLevel
-from llama_stack.apis.shields import Shield
 from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import ShieldsProtocolPrivate
+from llama_stack_api import (
+    ModerationObject,
+    OpenAIMessageParam,
+    RunShieldResponse,
+    Safety,
+    SafetyViolation,
+    Shield,
+    ShieldsProtocolPrivate,
+    ViolationLevel,
+)
 
 from .config import NVIDIASafetyConfig
 
@@ -66,7 +72,7 @@ class NVIDIASafetyAdapter(Safety, ShieldsProtocolPrivate):
         self.shield = NeMoGuardrails(self.config, shield.shield_id)
         return await self.shield.run(messages)
 
-    async def run_moderation(self, input: str | list[str], model: str) -> ModerationObject:
+    async def run_moderation(self, input: str | list[str], model: str | None = None) -> ModerationObject:
         raise NotImplementedError("NVIDIA safety provider currently does not implement run_moderation")
 
 
--- a/llama_stack/providers/remote/safety/sambanova/sambanova.py
+++ b/llama_stack/providers/remote/safety/sambanova/sambanova.py
@@ -9,17 +9,17 @@ from typing import Any
 import litellm
 import requests
 
-from llama_stack.apis.inference import OpenAIMessageParam
-from llama_stack.apis.safety import (
+from llama_stack.core.request_headers import NeedsRequestProviderData
+from llama_stack.log import get_logger
+from llama_stack_api import (
+    OpenAIMessageParam,
     RunShieldResponse,
     Safety,
     SafetyViolation,
+    Shield,
+    ShieldsProtocolPrivate,
     ViolationLevel,
 )
-from llama_stack.apis.shields import Shield
-from llama_stack.core.request_headers import NeedsRequestProviderData
-from llama_stack.log import get_logger
-from llama_stack.providers.datatypes import ShieldsProtocolPrivate
 
 from .config import SambaNovaSafetyConfig
 