llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/__init__.py +5 -0
- llama_stack/apis/agents/__init__.py +1 -1
- llama_stack/apis/agents/agents.py +700 -281
- llama_stack/apis/agents/openai_responses.py +1311 -0
- llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
- llama_stack/apis/batches/batches.py +100 -0
- llama_stack/apis/benchmarks/__init__.py +7 -0
- llama_stack/apis/benchmarks/benchmarks.py +108 -0
- llama_stack/apis/common/content_types.py +143 -0
- llama_stack/apis/common/errors.py +103 -0
- llama_stack/apis/common/job_types.py +38 -0
- llama_stack/apis/common/responses.py +36 -0
- llama_stack/apis/common/training_types.py +36 -5
- llama_stack/apis/common/type_system.py +158 -0
- llama_stack/apis/conversations/__init__.py +31 -0
- llama_stack/apis/conversations/conversations.py +286 -0
- llama_stack/apis/datasetio/__init__.py +7 -0
- llama_stack/apis/datasetio/datasetio.py +59 -0
- llama_stack/apis/datasets/__init__.py +7 -0
- llama_stack/apis/datasets/datasets.py +251 -0
- llama_stack/apis/datatypes.py +160 -0
- llama_stack/apis/eval/__init__.py +7 -0
- llama_stack/apis/eval/eval.py +169 -0
- llama_stack/apis/files/__init__.py +7 -0
- llama_stack/apis/files/files.py +199 -0
- llama_stack/apis/inference/__init__.py +1 -1
- llama_stack/apis/inference/inference.py +1169 -113
- llama_stack/apis/inspect/__init__.py +1 -1
- llama_stack/apis/inspect/inspect.py +69 -16
- llama_stack/apis/models/__init__.py +1 -1
- llama_stack/apis/models/models.py +148 -21
- llama_stack/apis/post_training/__init__.py +1 -1
- llama_stack/apis/post_training/post_training.py +265 -120
- llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
- llama_stack/apis/prompts/prompts.py +204 -0
- llama_stack/apis/providers/__init__.py +7 -0
- llama_stack/apis/providers/providers.py +69 -0
- llama_stack/apis/resource.py +37 -0
- llama_stack/apis/safety/__init__.py +1 -1
- llama_stack/apis/safety/safety.py +95 -12
- llama_stack/apis/scoring/__init__.py +7 -0
- llama_stack/apis/scoring/scoring.py +93 -0
- llama_stack/apis/scoring_functions/__init__.py +7 -0
- llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
- llama_stack/apis/shields/__init__.py +1 -1
- llama_stack/apis/shields/shields.py +76 -33
- llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
- llama_stack/apis/telemetry/__init__.py +1 -1
- llama_stack/apis/telemetry/telemetry.py +322 -31
- llama_stack/apis/{dataset → tools}/__init__.py +2 -1
- llama_stack/apis/tools/rag_tool.py +218 -0
- llama_stack/apis/tools/tools.py +221 -0
- llama_stack/apis/vector_io/__init__.py +7 -0
- llama_stack/apis/vector_io/vector_io.py +960 -0
- llama_stack/apis/vector_stores/__init__.py +7 -0
- llama_stack/apis/vector_stores/vector_stores.py +51 -0
- llama_stack/apis/version.py +9 -0
- llama_stack/cli/llama.py +13 -5
- llama_stack/cli/stack/_list_deps.py +182 -0
- llama_stack/cli/stack/list_apis.py +1 -1
- llama_stack/cli/stack/list_deps.py +55 -0
- llama_stack/cli/stack/list_providers.py +24 -10
- llama_stack/cli/stack/list_stacks.py +56 -0
- llama_stack/cli/stack/remove.py +115 -0
- llama_stack/cli/stack/run.py +169 -56
- llama_stack/cli/stack/stack.py +18 -4
- llama_stack/cli/stack/utils.py +151 -0
- llama_stack/cli/table.py +23 -61
- llama_stack/cli/utils.py +29 -0
- llama_stack/core/access_control/access_control.py +131 -0
- llama_stack/core/access_control/conditions.py +129 -0
- llama_stack/core/access_control/datatypes.py +107 -0
- llama_stack/core/build.py +164 -0
- llama_stack/core/client.py +205 -0
- llama_stack/core/common.sh +37 -0
- llama_stack/{distribution → core}/configure.py +74 -55
- llama_stack/core/conversations/conversations.py +309 -0
- llama_stack/core/datatypes.py +625 -0
- llama_stack/core/distribution.py +276 -0
- llama_stack/core/external.py +54 -0
- llama_stack/core/id_generation.py +42 -0
- llama_stack/core/inspect.py +86 -0
- llama_stack/core/library_client.py +539 -0
- llama_stack/core/prompts/prompts.py +234 -0
- llama_stack/core/providers.py +137 -0
- llama_stack/core/request_headers.py +115 -0
- llama_stack/core/resolver.py +506 -0
- llama_stack/core/routers/__init__.py +101 -0
- llama_stack/core/routers/datasets.py +73 -0
- llama_stack/core/routers/eval_scoring.py +155 -0
- llama_stack/core/routers/inference.py +645 -0
- llama_stack/core/routers/safety.py +85 -0
- llama_stack/core/routers/tool_runtime.py +91 -0
- llama_stack/core/routers/vector_io.py +442 -0
- llama_stack/core/routing_tables/benchmarks.py +62 -0
- llama_stack/core/routing_tables/common.py +254 -0
- llama_stack/core/routing_tables/datasets.py +91 -0
- llama_stack/core/routing_tables/models.py +163 -0
- llama_stack/core/routing_tables/scoring_functions.py +66 -0
- llama_stack/core/routing_tables/shields.py +61 -0
- llama_stack/core/routing_tables/toolgroups.py +129 -0
- llama_stack/core/routing_tables/vector_stores.py +292 -0
- llama_stack/core/server/auth.py +187 -0
- llama_stack/core/server/auth_providers.py +494 -0
- llama_stack/core/server/quota.py +110 -0
- llama_stack/core/server/routes.py +141 -0
- llama_stack/core/server/server.py +542 -0
- llama_stack/core/server/tracing.py +80 -0
- llama_stack/core/stack.py +546 -0
- llama_stack/core/start_stack.sh +117 -0
- llama_stack/core/storage/datatypes.py +283 -0
- llama_stack/{cli/model → core/store}/__init__.py +1 -1
- llama_stack/core/store/registry.py +199 -0
- llama_stack/core/testing_context.py +49 -0
- llama_stack/core/ui/app.py +55 -0
- llama_stack/core/ui/modules/api.py +32 -0
- llama_stack/core/ui/modules/utils.py +42 -0
- llama_stack/core/ui/page/distribution/datasets.py +18 -0
- llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
- llama_stack/core/ui/page/distribution/models.py +18 -0
- llama_stack/core/ui/page/distribution/providers.py +27 -0
- llama_stack/core/ui/page/distribution/resources.py +48 -0
- llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
- llama_stack/core/ui/page/distribution/shields.py +19 -0
- llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
- llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
- llama_stack/core/ui/page/playground/chat.py +130 -0
- llama_stack/core/ui/page/playground/tools.py +352 -0
- llama_stack/core/utils/config.py +30 -0
- llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
- llama_stack/core/utils/config_resolution.py +125 -0
- llama_stack/core/utils/context.py +84 -0
- llama_stack/core/utils/exec.py +96 -0
- llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
- llama_stack/{distribution → core}/utils/model_utils.py +2 -2
- llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
- llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
- llama_stack/distributions/dell/build.yaml +33 -0
- llama_stack/distributions/dell/dell.py +158 -0
- llama_stack/distributions/dell/run-with-safety.yaml +141 -0
- llama_stack/distributions/dell/run.yaml +132 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
- llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
- llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
- llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
- llama_stack/distributions/nvidia/build.yaml +29 -0
- llama_stack/distributions/nvidia/nvidia.py +154 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
- llama_stack/distributions/nvidia/run.yaml +116 -0
- llama_stack/distributions/open-benchmark/__init__.py +7 -0
- llama_stack/distributions/open-benchmark/build.yaml +36 -0
- llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
- llama_stack/distributions/open-benchmark/run.yaml +252 -0
- llama_stack/distributions/postgres-demo/__init__.py +7 -0
- llama_stack/distributions/postgres-demo/build.yaml +23 -0
- llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
- llama_stack/distributions/postgres-demo/run.yaml +115 -0
- llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
- llama_stack/distributions/starter/build.yaml +61 -0
- llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
- llama_stack/distributions/starter/run.yaml +276 -0
- llama_stack/distributions/starter/starter.py +345 -0
- llama_stack/distributions/starter-gpu/__init__.py +7 -0
- llama_stack/distributions/starter-gpu/build.yaml +61 -0
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
- llama_stack/distributions/starter-gpu/run.yaml +279 -0
- llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
- llama_stack/distributions/template.py +456 -0
- llama_stack/distributions/watsonx/__init__.py +7 -0
- llama_stack/distributions/watsonx/build.yaml +33 -0
- llama_stack/distributions/watsonx/run.yaml +133 -0
- llama_stack/distributions/watsonx/watsonx.py +95 -0
- llama_stack/env.py +24 -0
- llama_stack/log.py +314 -0
- llama_stack/models/llama/checkpoint.py +164 -0
- llama_stack/models/llama/datatypes.py +164 -0
- llama_stack/models/llama/hadamard_utils.py +86 -0
- llama_stack/models/llama/llama3/args.py +74 -0
- llama_stack/models/llama/llama3/chat_format.py +286 -0
- llama_stack/models/llama/llama3/generation.py +376 -0
- llama_stack/models/llama/llama3/interface.py +255 -0
- llama_stack/models/llama/llama3/model.py +304 -0
- llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
- llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
- llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
- llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
- llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
- llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
- llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
- llama_stack/models/llama/llama3/quantization/loader.py +316 -0
- llama_stack/models/llama/llama3/template_data.py +116 -0
- llama_stack/models/llama/llama3/tokenizer.model +128000 -0
- llama_stack/models/llama/llama3/tokenizer.py +198 -0
- llama_stack/models/llama/llama3/tool_utils.py +266 -0
- llama_stack/models/llama/llama3_1/__init__.py +12 -0
- llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
- llama_stack/models/llama/llama3_1/prompts.py +258 -0
- llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
- llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
- llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
- llama_stack/models/llama/llama3_3/prompts.py +259 -0
- llama_stack/models/llama/llama4/args.py +107 -0
- llama_stack/models/llama/llama4/chat_format.py +317 -0
- llama_stack/models/llama/llama4/datatypes.py +56 -0
- llama_stack/models/llama/llama4/ffn.py +58 -0
- llama_stack/models/llama/llama4/generation.py +313 -0
- llama_stack/models/llama/llama4/model.py +437 -0
- llama_stack/models/llama/llama4/moe.py +214 -0
- llama_stack/models/llama/llama4/preprocess.py +435 -0
- llama_stack/models/llama/llama4/prompt_format.md +304 -0
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
- llama_stack/models/llama/llama4/prompts.py +279 -0
- llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
- llama_stack/models/llama/llama4/quantization/loader.py +226 -0
- llama_stack/models/llama/llama4/tokenizer.model +200000 -0
- llama_stack/models/llama/llama4/tokenizer.py +263 -0
- llama_stack/models/llama/llama4/vision/__init__.py +5 -0
- llama_stack/models/llama/llama4/vision/embedding.py +210 -0
- llama_stack/models/llama/llama4/vision/encoder.py +412 -0
- llama_stack/models/llama/prompt_format.py +191 -0
- llama_stack/models/llama/quantize_impls.py +316 -0
- llama_stack/models/llama/sku_list.py +1029 -0
- llama_stack/models/llama/sku_types.py +233 -0
- llama_stack/models/llama/tokenizer_utils.py +40 -0
- llama_stack/providers/datatypes.py +136 -107
- llama_stack/providers/inline/__init__.py +5 -0
- llama_stack/providers/inline/agents/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
- llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
- llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
- llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
- llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
- llama_stack/providers/inline/batches/__init__.py +5 -0
- llama_stack/providers/inline/batches/reference/__init__.py +36 -0
- llama_stack/providers/inline/batches/reference/batches.py +679 -0
- llama_stack/providers/inline/batches/reference/config.py +40 -0
- llama_stack/providers/inline/datasetio/__init__.py +5 -0
- llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
- llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
- llama_stack/providers/inline/eval/__init__.py +5 -0
- llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
- llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
- llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
- llama_stack/providers/inline/files/localfs/__init__.py +20 -0
- llama_stack/providers/inline/files/localfs/config.py +31 -0
- llama_stack/providers/inline/files/localfs/files.py +219 -0
- llama_stack/providers/inline/inference/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
- llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
- llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
- llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
- llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
- llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
- llama_stack/providers/inline/post_training/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/utils.py +35 -0
- llama_stack/providers/inline/post_training/common/validator.py +36 -0
- llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
- llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
- llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
- llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
- llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
- llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
- llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
- llama_stack/providers/inline/safety/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
- llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
- llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
- llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
- llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
- llama_stack/providers/inline/scoring/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
- llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
- llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
- llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
- llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
- llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
- llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
- llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
- llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
- llama_stack/providers/inline/telemetry/__init__.py +5 -0
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
- llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
- llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
- llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
- llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
- llama_stack/providers/inline/vector_io/__init__.py +5 -0
- llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
- llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
- llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
- llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
- llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
- llama_stack/providers/registry/agents.py +16 -18
- llama_stack/providers/registry/batches.py +26 -0
- llama_stack/providers/registry/datasetio.py +49 -0
- llama_stack/providers/registry/eval.py +46 -0
- llama_stack/providers/registry/files.py +31 -0
- llama_stack/providers/registry/inference.py +273 -118
- llama_stack/providers/registry/post_training.py +69 -0
- llama_stack/providers/registry/safety.py +46 -41
- llama_stack/providers/registry/scoring.py +51 -0
- llama_stack/providers/registry/tool_runtime.py +87 -0
- llama_stack/providers/registry/vector_io.py +828 -0
- llama_stack/providers/remote/__init__.py +5 -0
- llama_stack/providers/remote/agents/__init__.py +5 -0
- llama_stack/providers/remote/datasetio/__init__.py +5 -0
- llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
- llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
- llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
- llama_stack/providers/remote/eval/__init__.py +5 -0
- llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
- llama_stack/providers/remote/eval/nvidia/config.py +29 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
- llama_stack/providers/remote/files/s3/__init__.py +19 -0
- llama_stack/providers/remote/files/s3/config.py +42 -0
- llama_stack/providers/remote/files/s3/files.py +313 -0
- llama_stack/providers/remote/inference/__init__.py +5 -0
- llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
- llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
- llama_stack/providers/remote/inference/anthropic/config.py +28 -0
- llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
- llama_stack/providers/remote/inference/azure/azure.py +25 -0
- llama_stack/providers/remote/inference/azure/config.py +61 -0
- llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
- llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
- llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
- llama_stack/providers/remote/inference/bedrock/models.py +29 -0
- llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
- llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
- llama_stack/providers/remote/inference/cerebras/config.py +30 -0
- llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
- llama_stack/providers/remote/inference/databricks/config.py +37 -0
- llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
- llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
- llama_stack/providers/remote/inference/fireworks/config.py +27 -0
- llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
- llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
- llama_stack/providers/remote/inference/gemini/config.py +28 -0
- llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
- llama_stack/providers/remote/inference/groq/__init__.py +15 -0
- llama_stack/providers/remote/inference/groq/config.py +34 -0
- llama_stack/providers/remote/inference/groq/groq.py +18 -0
- llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
- llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/inference/nvidia/config.py +64 -0
- llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
- llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
- llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
- llama_stack/providers/remote/inference/ollama/config.py +25 -0
- llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
- llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
- llama_stack/providers/remote/inference/openai/config.py +39 -0
- llama_stack/providers/remote/inference/openai/openai.py +38 -0
- llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
- llama_stack/providers/remote/inference/passthrough/config.py +34 -0
- llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
- llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
- llama_stack/providers/remote/inference/runpod/config.py +32 -0
- llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
- llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
- llama_stack/providers/remote/inference/sambanova/config.py +34 -0
- llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
- llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
- llama_stack/providers/remote/inference/tgi/config.py +76 -0
- llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
- llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
- llama_stack/providers/remote/inference/together/config.py +27 -0
- llama_stack/providers/remote/inference/together/together.py +102 -0
- llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
- llama_stack/providers/remote/inference/vertexai/config.py +48 -0
- llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
- llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
- llama_stack/providers/remote/inference/vllm/config.py +59 -0
- llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
- llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
- llama_stack/providers/remote/inference/watsonx/config.py +45 -0
- llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
- llama_stack/providers/remote/post_training/__init__.py +5 -0
- llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
- llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
- llama_stack/providers/remote/safety/__init__.py +5 -0
- llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
- llama_stack/providers/remote/safety/bedrock/config.py +14 -0
- llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
- llama_stack/providers/remote/safety/nvidia/config.py +40 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
- llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
- llama_stack/providers/remote/safety/sambanova/config.py +37 -0
- llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
- llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
- llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
- llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
- llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
- llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
- llama_stack/providers/remote/vector_io/__init__.py +5 -0
- llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
- llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
- llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
- llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
- llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
- llama_stack/providers/utils/bedrock/__init__.py +5 -0
- llama_stack/providers/utils/bedrock/client.py +74 -0
- llama_stack/providers/utils/bedrock/config.py +64 -0
- llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
- llama_stack/providers/utils/common/__init__.py +5 -0
- llama_stack/providers/utils/common/data_schema_validator.py +103 -0
- llama_stack/providers/utils/datasetio/__init__.py +5 -0
- llama_stack/providers/utils/datasetio/url_utils.py +47 -0
- llama_stack/providers/utils/files/__init__.py +5 -0
- llama_stack/providers/utils/files/form_data.py +69 -0
- llama_stack/providers/utils/inference/__init__.py +8 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
- llama_stack/providers/utils/inference/inference_store.py +264 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
- llama_stack/providers/utils/inference/model_registry.py +173 -23
- llama_stack/providers/utils/inference/openai_compat.py +1261 -49
- llama_stack/providers/utils/inference/openai_mixin.py +506 -0
- llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
- llama_stack/providers/utils/kvstore/api.py +6 -6
- llama_stack/providers/utils/kvstore/config.py +28 -48
- llama_stack/providers/utils/kvstore/kvstore.py +61 -15
- llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
- llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
- llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
- llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
- llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
- llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
- llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
- llama_stack/providers/utils/memory/vector_store.py +220 -82
- llama_stack/providers/utils/pagination.py +43 -0
- llama_stack/providers/utils/responses/__init__.py +5 -0
- llama_stack/providers/utils/responses/responses_store.py +292 -0
- llama_stack/providers/utils/scheduler.py +270 -0
- llama_stack/providers/utils/scoring/__init__.py +5 -0
- llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
- llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
- llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
- llama_stack/providers/utils/sqlstore/__init__.py +5 -0
- llama_stack/providers/utils/sqlstore/api.py +128 -0
- llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
- llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
- llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
- llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
- llama_stack/providers/utils/telemetry/tracing.py +192 -53
- llama_stack/providers/utils/tools/__init__.py +5 -0
- llama_stack/providers/utils/tools/mcp.py +148 -0
- llama_stack/providers/utils/tools/ttl_dict.py +70 -0
- llama_stack/providers/utils/vector_io/__init__.py +5 -0
- llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
- llama_stack/schema_utils.py +118 -0
- llama_stack/strong_typing/__init__.py +19 -0
- llama_stack/strong_typing/auxiliary.py +228 -0
- llama_stack/strong_typing/classdef.py +440 -0
- llama_stack/strong_typing/core.py +46 -0
- llama_stack/strong_typing/deserializer.py +877 -0
- llama_stack/strong_typing/docstring.py +409 -0
- llama_stack/strong_typing/exception.py +23 -0
- llama_stack/strong_typing/inspection.py +1085 -0
- llama_stack/strong_typing/mapping.py +40 -0
- llama_stack/strong_typing/name.py +182 -0
- llama_stack/strong_typing/py.typed +0 -0
- llama_stack/strong_typing/schema.py +792 -0
- llama_stack/strong_typing/serialization.py +97 -0
- llama_stack/strong_typing/serializer.py +500 -0
- llama_stack/strong_typing/slots.py +27 -0
- llama_stack/strong_typing/topological.py +89 -0
- llama_stack/testing/__init__.py +5 -0
- llama_stack/testing/api_recorder.py +956 -0
- llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
- llama_stack-0.3.4.dist-info/METADATA +261 -0
- llama_stack-0.3.4.dist-info/RECORD +625 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
- llama_stack/apis/agents/client.py +0 -292
- llama_stack/apis/agents/event_logger.py +0 -184
- llama_stack/apis/batch_inference/batch_inference.py +0 -72
- llama_stack/apis/common/deployment_types.py +0 -31
- llama_stack/apis/dataset/dataset.py +0 -63
- llama_stack/apis/evals/evals.py +0 -122
- llama_stack/apis/inference/client.py +0 -197
- llama_stack/apis/inspect/client.py +0 -82
- llama_stack/apis/memory/client.py +0 -155
- llama_stack/apis/memory/memory.py +0 -65
- llama_stack/apis/memory_banks/__init__.py +0 -7
- llama_stack/apis/memory_banks/client.py +0 -101
- llama_stack/apis/memory_banks/memory_banks.py +0 -78
- llama_stack/apis/models/client.py +0 -83
- llama_stack/apis/reward_scoring/__init__.py +0 -7
- llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
- llama_stack/apis/safety/client.py +0 -105
- llama_stack/apis/shields/client.py +0 -79
- llama_stack/cli/download.py +0 -340
- llama_stack/cli/model/describe.py +0 -82
- llama_stack/cli/model/download.py +0 -24
- llama_stack/cli/model/list.py +0 -62
- llama_stack/cli/model/model.py +0 -34
- llama_stack/cli/model/prompt_format.py +0 -112
- llama_stack/cli/model/safety_models.py +0 -52
- llama_stack/cli/stack/build.py +0 -299
- llama_stack/cli/stack/configure.py +0 -178
- llama_stack/distribution/build.py +0 -123
- llama_stack/distribution/build_conda_env.sh +0 -136
- llama_stack/distribution/build_container.sh +0 -142
- llama_stack/distribution/common.sh +0 -40
- llama_stack/distribution/configure_container.sh +0 -47
- llama_stack/distribution/datatypes.py +0 -139
- llama_stack/distribution/distribution.py +0 -58
- llama_stack/distribution/inspect.py +0 -67
- llama_stack/distribution/request_headers.py +0 -57
- llama_stack/distribution/resolver.py +0 -323
- llama_stack/distribution/routers/__init__.py +0 -48
- llama_stack/distribution/routers/routers.py +0 -158
- llama_stack/distribution/routers/routing_tables.py +0 -173
- llama_stack/distribution/server/endpoints.py +0 -48
- llama_stack/distribution/server/server.py +0 -343
- llama_stack/distribution/start_conda_env.sh +0 -42
- llama_stack/distribution/start_container.sh +0 -64
- llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
- llama_stack/distribution/templates/local-build.yaml +0 -10
- llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
- llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
- llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
- llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
- llama_stack/distribution/templates/local-together-build.yaml +0 -10
- llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
- llama_stack/distribution/utils/exec.py +0 -105
- llama_stack/providers/adapters/agents/sample/sample.py +0 -18
- llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
- llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
- llama_stack/providers/adapters/inference/databricks/config.py +0 -21
- llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
- llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
- llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
- llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
- llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
- llama_stack/providers/adapters/inference/sample/sample.py +0 -23
- llama_stack/providers/adapters/inference/tgi/config.py +0 -43
- llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
- llama_stack/providers/adapters/inference/together/config.py +0 -22
- llama_stack/providers/adapters/inference/together/together.py +0 -143
- llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
- llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
- llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
- llama_stack/providers/adapters/memory/sample/sample.py +0 -23
- llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
- llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
- llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
- llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
- llama_stack/providers/adapters/safety/sample/sample.py +0 -23
- llama_stack/providers/adapters/safety/together/__init__.py +0 -18
- llama_stack/providers/adapters/safety/together/config.py +0 -26
- llama_stack/providers/adapters/safety/together/together.py +0 -101
- llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
- llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
- llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
- llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
- llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
- llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
- llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
- llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
- llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
- llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
- llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
- llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
- llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
- llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
- llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
- llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
- llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
- llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
- llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
- llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
- llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
- llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
- llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
- llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
- llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
- llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
- llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
- llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
- llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
- llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
- llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
- llama_stack/providers/impls/vllm/config.py +0 -35
- llama_stack/providers/impls/vllm/vllm.py +0 -241
- llama_stack/providers/registry/memory.py +0 -78
- llama_stack/providers/registry/telemetry.py +0 -44
- llama_stack/providers/tests/agents/test_agents.py +0 -210
- llama_stack/providers/tests/inference/test_inference.py +0 -257
- llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
- llama_stack/providers/tests/memory/test_memory.py +0 -136
- llama_stack/providers/tests/resolver.py +0 -100
- llama_stack/providers/tests/safety/test_safety.py +0 -77
- llama_stack-0.0.42.dist-info/METADATA +0 -137
- llama_stack-0.0.42.dist-info/RECORD +0 -256
- /llama_stack/{distribution → core}/__init__.py +0 -0
- /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
- /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
- /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
- /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
- /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
- /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
- /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
- /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
- /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
- /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
- /llama_stack/{distribution → core}/utils/serialize.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
- /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
- /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
- /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
- /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
- /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
- /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
- /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0
llama_stack/providers/remote/inference/vllm/vllm.py
@@ -0,0 +1,111 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+from collections.abc import AsyncIterator
+from urllib.parse import urljoin
+
+import httpx
+from openai.types.chat.chat_completion_chunk import (
+    ChatCompletionChunk as OpenAIChatCompletionChunk,
+)
+from pydantic import ConfigDict
+
+from llama_stack.apis.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionRequestWithExtraBody,
+    ToolChoice,
+)
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import (
+    HealthResponse,
+    HealthStatus,
+)
+from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
+
+from .config import VLLMInferenceAdapterConfig
+
+log = get_logger(name=__name__, category="inference::vllm")
+
+
+class VLLMInferenceAdapter(OpenAIMixin):
+    config: VLLMInferenceAdapterConfig
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    provider_data_api_key_field: str = "vllm_api_token"
+
+    def get_api_key(self) -> str | None:
+        if self.config.auth_credential:
+            return self.config.auth_credential.get_secret_value()
+        return "NO KEY REQUIRED"
+
+    def get_base_url(self) -> str:
+        """Get the base URL from config."""
+        if not self.config.url:
+            raise ValueError("No base URL configured")
+        return self.config.url
+
+    async def initialize(self) -> None:
+        if not self.config.url:
+            raise ValueError(
+                "You must provide a URL in run.yaml (or via the VLLM_URL environment variable) to use vLLM."
+            )
+
+    async def health(self) -> HealthResponse:
+        """
+        Performs a health check by verifying connectivity to the remote vLLM server.
+        This method is used by the Provider API to verify
+        that the service is running correctly.
+        Uses the unauthenticated /health endpoint.
+        Returns:
+
+            HealthResponse: A dictionary containing the health status.
+        """
+        try:
+            base_url = self.get_base_url()
+            health_url = urljoin(base_url, "health")
+
+            async with httpx.AsyncClient() as client:
+                response = await client.get(health_url)
+                response.raise_for_status()
+                return HealthResponse(status=HealthStatus.OK)
+        except Exception as e:
+            return HealthResponse(status=HealthStatus.ERROR, message=f"Health check failed: {str(e)}")
+
+    def get_extra_client_params(self):
+        return {"http_client": httpx.AsyncClient(verify=self.config.tls_verify)}
+
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Skip the check when running without authentication.
+        """
+        if not self.config.auth_credential:
+            model_ids = []
+            async for m in self.client.models.list():
+                if m.id == model:  # Found exact match
+                    return True
+                model_ids.append(m.id)
+            raise ValueError(f"Model '{model}' not found. Available models: {model_ids}")
+        log.warning(f"Not checking model availability for {model} as API token may trigger OAuth workflow")
+        return True
+
+    async def openai_chat_completion(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        params = params.model_copy()
+
+        # Apply vLLM-specific defaults
+        if params.max_tokens is None and self.config.max_tokens:
+            params.max_tokens = self.config.max_tokens
+
+        # This is to be consistent with OpenAI API and support vLLM <= v0.6.3
+        # References:
+        # * https://platform.openai.com/docs/api-reference/chat/create#chat-create-tool_choice
+        # * https://github.com/vllm-project/vllm/pull/10000
+        if not params.tools and params.tool_choice is not None:
+            params.tool_choice = ToolChoice.none.value
+
+        return await super().openai_chat_completion(params)
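The adapter's `health()` method above boils down to an unauthenticated GET against the server's `/health` endpoint. For reference, a minimal standalone sketch of the same probe; the `VLLM_URL` default shown here is an illustrative placeholder, not something this diff defines.

```python
# Minimal sketch of the /health probe performed by VLLMInferenceAdapter.health().
# Assumes a vLLM server reachable at VLLM_URL; the fallback URL below is a placeholder.
import asyncio
import os
from urllib.parse import urljoin

import httpx


async def probe_vllm_health() -> str:
    base_url = os.environ.get("VLLM_URL", "http://localhost:8000/v1")
    # urljoin replaces the trailing path segment, so ".../v1" resolves to ".../health",
    # mirroring the adapter code above.
    health_url = urljoin(base_url, "health")
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(health_url)
            response.raise_for_status()
        return "OK"
    except Exception as e:
        return f"ERROR: {e}"


if __name__ == "__main__":
    print(asyncio.run(probe_vllm_health()))
```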
llama_stack/providers/remote/inference/watsonx/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import WatsonXConfig
+
+
+async def get_adapter_impl(config: WatsonXConfig, _deps):
+    # import dynamically so the import is used only when it is needed
+    from .watsonx import WatsonXInferenceAdapter
+
+    adapter = WatsonXInferenceAdapter(config)
+    return adapter
llama_stack/providers/remote/inference/watsonx/config.py
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
+from llama_stack.schema_utils import json_schema_type
+
+
+class WatsonXProviderDataValidator(BaseModel):
+    watsonx_project_id: str | None = Field(
+        default=None,
+        description="IBM WatsonX project ID",
+    )
+    watsonx_api_key: str | None = None
+
+
+@json_schema_type
+class WatsonXConfig(RemoteInferenceProviderConfig):
+    url: str = Field(
+        default_factory=lambda: os.getenv("WATSONX_BASE_URL", "https://us-south.ml.cloud.ibm.com"),
+        description="A base url for accessing the watsonx.ai",
+    )
+    project_id: str | None = Field(
+        default=None,
+        description="The watsonx.ai project ID",
+    )
+    timeout: int = Field(
+        default=60,
+        description="Timeout for the HTTP requests",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+        return {
+            "url": "${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}",
+            "api_key": "${env.WATSONX_API_KEY:=}",
+            "project_id": "${env.WATSONX_PROJECT_ID:=}",
+        }
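The `sample_run_config()` classmethod above supplies the environment-variable placeholders that the distribution templates embed in the watsonx provider's `run.yaml` entry; the `${env.VAR:=default}` values are substituted when the stack starts. A small hedged sketch that simply prints them, assuming only that the `llama_stack` package from this wheel is importable:

```python
# Hedged sketch: show the env-var placeholders WatsonXConfig.sample_run_config()
# returns; these are the values the distribution templates write into run.yaml.
from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig

if __name__ == "__main__":
    for key, value in WatsonXConfig.sample_run_config().items():
        # e.g. url: ${env.WATSONX_BASE_URL:=https://us-south.ml.cloud.ibm.com}
        print(f"{key}: {value}")
```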
@@ -0,0 +1,336 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+from typing import Any
+
+import litellm
+import requests
+
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionUsage,
+    OpenAICompletion,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAIEmbeddingsRequestWithExtraBody,
+    OpenAIEmbeddingsResponse,
+)
+from llama_stack.apis.models import Model
+from llama_stack.apis.models.models import ModelType
+from llama_stack.log import get_logger
+from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
+from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
+from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+from llama_stack.providers.utils.telemetry.tracing import get_current_span
+
+logger = get_logger(name=__name__, category="providers::remote::watsonx")
+
+
+class WatsonXInferenceAdapter(LiteLLMOpenAIMixin):
+    _model_cache: dict[str, Model] = {}
+
+    provider_data_api_key_field: str = "watsonx_api_key"
+
+    def __init__(self, config: WatsonXConfig):
+        self.available_models = None
+        self.config = config
+        api_key = config.auth_credential.get_secret_value() if config.auth_credential else None
+        LiteLLMOpenAIMixin.__init__(
+            self,
+            litellm_provider_name="watsonx",
+            api_key_from_config=api_key,
+            provider_data_api_key_field="watsonx_api_key",
+            openai_compat_api_base=self.get_base_url(),
+        )
+
+    async def openai_chat_completion(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        """
+        Override parent method to add timeout and inject usage object when missing.
+        This works around a LiteLLM defect where usage block is sometimes dropped.
+        """
+
+        # Add usage tracking for streaming when telemetry is active
+        stream_options = params.stream_options
+        if params.stream and get_current_span() is not None:
+            if stream_options is None:
+                stream_options = {"include_usage": True}
+            elif "include_usage" not in stream_options:
+                stream_options = {**stream_options, "include_usage": True}
+
+        model_obj = await self.model_store.get_model(params.model)
+
+        request_params = await prepare_openai_completion_params(
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
+            messages=params.messages,
+            frequency_penalty=params.frequency_penalty,
+            function_call=params.function_call,
+            functions=params.functions,
+            logit_bias=params.logit_bias,
+            logprobs=params.logprobs,
+            max_completion_tokens=params.max_completion_tokens,
+            max_tokens=params.max_tokens,
+            n=params.n,
+            parallel_tool_calls=params.parallel_tool_calls,
+            presence_penalty=params.presence_penalty,
+            response_format=params.response_format,
+            seed=params.seed,
+            stop=params.stop,
+            stream=params.stream,
+            stream_options=stream_options,
+            temperature=params.temperature,
+            tool_choice=params.tool_choice,
+            tools=params.tools,
+            top_logprobs=params.top_logprobs,
+            top_p=params.top_p,
+            user=params.user,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
+            # These are watsonx-specific parameters
+            timeout=self.config.timeout,
+            project_id=self.config.project_id,
+        )
+
+        result = await litellm.acompletion(**request_params)
+
+        # If not streaming, check and inject usage if missing
+        if not params.stream:
+            # Use getattr to safely handle cases where usage attribute might not exist
+            if getattr(result, "usage", None) is None:
+                # Create usage object with zeros
+                usage_obj = OpenAIChatCompletionUsage(
+                    prompt_tokens=0,
+                    completion_tokens=0,
+                    total_tokens=0,
+                )
+                # Use model_copy to create a new response with the usage injected
+                result = result.model_copy(update={"usage": usage_obj})
+            return result
+
+        # For streaming, wrap the iterator to normalize chunks
+        return self._normalize_stream(result)
+
+    def _normalize_chunk(self, chunk: OpenAIChatCompletionChunk) -> OpenAIChatCompletionChunk:
+        """
+        Normalize a chunk to ensure it has all expected attributes.
+        This works around LiteLLM not always including all expected attributes.
+        """
+        # Ensure chunk has usage attribute with zeros if missing
+        if not hasattr(chunk, "usage") or chunk.usage is None:
+            usage_obj = OpenAIChatCompletionUsage(
+                prompt_tokens=0,
+                completion_tokens=0,
+                total_tokens=0,
+            )
+            chunk = chunk.model_copy(update={"usage": usage_obj})
+
+        # Ensure all delta objects in choices have expected attributes
+        if hasattr(chunk, "choices") and chunk.choices:
+            normalized_choices = []
+            for choice in chunk.choices:
+                if hasattr(choice, "delta") and choice.delta:
+                    delta = choice.delta
+                    # Build update dict for missing attributes
+                    delta_updates = {}
+                    if not hasattr(delta, "refusal"):
+                        delta_updates["refusal"] = None
+                    if not hasattr(delta, "reasoning_content"):
+                        delta_updates["reasoning_content"] = None
+
+                    # If we need to update delta, create a new choice with updated delta
+                    if delta_updates:
+                        new_delta = delta.model_copy(update=delta_updates)
+                        new_choice = choice.model_copy(update={"delta": new_delta})
+                        normalized_choices.append(new_choice)
+                    else:
+                        normalized_choices.append(choice)
+                else:
+                    normalized_choices.append(choice)
+
+            # If we modified any choices, create a new chunk with updated choices
+            if any(normalized_choices[i] is not chunk.choices[i] for i in range(len(chunk.choices))):
+                chunk = chunk.model_copy(update={"choices": normalized_choices})
+
+        return chunk
+
+    async def _normalize_stream(
+        self, stream: AsyncIterator[OpenAIChatCompletionChunk]
+    ) -> AsyncIterator[OpenAIChatCompletionChunk]:
+        """
+        Normalize all chunks in the stream to ensure they have expected attributes.
+        This works around LiteLLM sometimes not including expected attributes.
+        """
+        try:
+            async for chunk in stream:
+                # Normalize and yield each chunk immediately
+                yield self._normalize_chunk(chunk)
+        except Exception as e:
+            logger.error(f"Error normalizing stream: {e}", exc_info=True)
+            raise
+
+    async def openai_completion(
+        self,
+        params: OpenAICompletionRequestWithExtraBody,
+    ) -> OpenAICompletion:
+        """
+        Override parent method to add watsonx-specific parameters.
+        """
+        from llama_stack.providers.utils.inference.openai_compat import prepare_openai_completion_params
+
+        model_obj = await self.model_store.get_model(params.model)
+
+        request_params = await prepare_openai_completion_params(
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
+            prompt=params.prompt,
+            best_of=params.best_of,
+            echo=params.echo,
+            frequency_penalty=params.frequency_penalty,
+            logit_bias=params.logit_bias,
+            logprobs=params.logprobs,
+            max_tokens=params.max_tokens,
+            n=params.n,
+            presence_penalty=params.presence_penalty,
+            seed=params.seed,
+            stop=params.stop,
+            stream=params.stream,
+            stream_options=params.stream_options,
+            temperature=params.temperature,
+            top_p=params.top_p,
+            user=params.user,
+            suffix=params.suffix,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
+            # These are watsonx-specific parameters
+            timeout=self.config.timeout,
+            project_id=self.config.project_id,
+        )
+        return await litellm.atext_completion(**request_params)
+
+    async def openai_embeddings(
+        self,
+        params: OpenAIEmbeddingsRequestWithExtraBody,
+    ) -> OpenAIEmbeddingsResponse:
+        """
+        Override parent method to add watsonx-specific parameters.
+        """
+        model_obj = await self.model_store.get_model(params.model)
+
+        # Convert input to list if it's a string
+        input_list = [params.input] if isinstance(params.input, str) else params.input
+
+        # Call litellm embedding function with watsonx-specific parameters
+        response = litellm.embedding(
+            model=self.get_litellm_model_name(model_obj.provider_resource_id),
+            input=input_list,
+            api_key=self.get_api_key(),
+            api_base=self.api_base,
+            dimensions=params.dimensions,
+            # These are watsonx-specific parameters
+            timeout=self.config.timeout,
+            project_id=self.config.project_id,
+        )
+
+        # Convert response to OpenAI format
+        from llama_stack.apis.inference import OpenAIEmbeddingUsage
+        from llama_stack.providers.utils.inference.litellm_openai_mixin import b64_encode_openai_embeddings_response
+
+        data = b64_encode_openai_embeddings_response(response.data, params.encoding_format)
+
+        usage = OpenAIEmbeddingUsage(
+            prompt_tokens=response["usage"]["prompt_tokens"],
+            total_tokens=response["usage"]["total_tokens"],
+        )
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.provider_resource_id,
+            usage=usage,
+        )
+
+    def get_base_url(self) -> str:
+        return self.config.url
+
+    # Copied from OpenAIMixin
+    async def check_model_availability(self, model: str) -> bool:
+        """
+        Check if a specific model is available from the provider's /v1/models.
+
+        :param model: The model identifier to check.
+        :return: True if the model is available dynamically, False otherwise.
+        """
+        if not self._model_cache:
+            await self.list_models()
+        return model in self._model_cache
+
+    async def list_models(self) -> list[Model] | None:
+        self._model_cache = {}
+        models = []
+        for model_spec in self._get_model_specs():
+            functions = [f["id"] for f in model_spec.get("functions", [])]
+            # Format: {"embedding_dimension": 1536, "context_length": 8192}
+
+            # Example of an embedding model:
+            # {'model_id': 'ibm/granite-embedding-278m-multilingual',
+            #  'label': 'granite-embedding-278m-multilingual',
+            #  'model_limits': {'max_sequence_length': 512, 'embedding_dimension': 768},
+            #  ...
+            provider_resource_id = f"{self.__provider_id__}/{model_spec['model_id']}"
+            if "embedding" in functions:
+                embedding_dimension = model_spec.get("model_limits", {}).get("embedding_dimension", 0)
+                context_length = model_spec.get("model_limits", {}).get("max_sequence_length", 0)
+                embedding_metadata = {
+                    "embedding_dimension": embedding_dimension,
+                    "context_length": context_length,
+                }
+                model = Model(
+                    identifier=model_spec["model_id"],
+                    provider_resource_id=provider_resource_id,
+                    provider_id=self.__provider_id__,
+                    metadata=embedding_metadata,
+                    model_type=ModelType.embedding,
+                )
+                self._model_cache[provider_resource_id] = model
+                models.append(model)
+            if "text_chat" in functions:
+                model = Model(
+                    identifier=model_spec["model_id"],
+                    provider_resource_id=provider_resource_id,
+                    provider_id=self.__provider_id__,
+                    metadata={},
+                    model_type=ModelType.llm,
+                )
+                self._model_cache[provider_resource_id] = model
+                models.append(model)
+        return models
+
+    # LiteLLM provides methods to list models for many providers, but not for watsonx.ai.
+    # So we need to implement our own method to list models by calling the watsonx.ai API.
+    def _get_model_specs(self) -> list[dict[str, Any]]:
+        """
+        Retrieves foundation model specifications from the watsonx.ai API.
+        """
+        url = f"{self.config.url}/ml/v1/foundation_model_specs?version=2023-10-25"
+        headers = {
+            # Note that there is no authorization header. Listing models does not require authentication.
+            "Content-Type": "application/json",
+        }
+
+        response = requests.get(url, headers=headers)
+
+        # --- Process the Response ---
+        # Raise an exception for bad status codes (4xx or 5xx)
+        response.raise_for_status()
+
+        # If the request is successful, parse and return the JSON response.
+        # The response should contain a list of model specifications
+        response_data = response.json()
+        if "resources" not in response_data:
+            raise ValueError("Resources not found in response")
+        return response_data["resources"]
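The usage injection and delta normalization above lean on Pydantic's `model_copy(update=...)`, which returns a copy of a model with selected fields replaced. A self-contained sketch of the same defaulting pattern, using toy models instead of the real OpenAI chunk types:

from pydantic import BaseModel


class Usage(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0


class Chunk(BaseModel):
    content: str
    usage: Usage | None = None


def ensure_usage(chunk: Chunk) -> Chunk:
    # Same idea as _normalize_chunk: if the provider drops the usage block,
    # inject a zeroed one so downstream consumers do not break.
    if chunk.usage is None:
        return chunk.model_copy(update={"usage": Usage()})
    return chunk


print(ensure_usage(Chunk(content="hello")).usage)  # zeroed Usage object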
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from .config import NvidiaPostTrainingConfig
+
+
+async def get_adapter_impl(
+    config: NvidiaPostTrainingConfig,
+    _deps,
+):
+    from .post_training import NvidiaPostTrainingAdapter
+
+    if not isinstance(config, NvidiaPostTrainingConfig):
+        raise RuntimeError(f"Unexpected config type: {type(config)}")
+
+    impl = NvidiaPostTrainingAdapter(config)
+    return impl
+
+
+__all__ = ["get_adapter_impl", "NvidiaPostTrainingAdapter"]
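`get_adapter_impl` is the provider factory the stack calls to turn a validated config into a live adapter. A rough usage sketch follows; the import path and the empty deps argument are assumptions for illustration, not part of the published API:

import asyncio

# Assumed import path for illustration; adjust to wherever this provider package lives.
from llama_stack.providers.remote.post_training.nvidia import get_adapter_impl
from llama_stack.providers.remote.post_training.nvidia.config import NvidiaPostTrainingConfig


async def main():
    config = NvidiaPostTrainingConfig()  # fields fall back to NVIDIA_* environment variables
    adapter = await get_adapter_impl(config, {})  # no extra deps assumed here
    print(type(adapter).__name__)


asyncio.run(main())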
@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import os
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+# TODO: add default values for all fields
+
+
+class NvidiaPostTrainingConfig(BaseModel):
+    """Configuration for NVIDIA Post Training implementation."""
+
+    api_key: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
+        description="The NVIDIA API key.",
+    )
+
+    dataset_namespace: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
+        description="The NVIDIA dataset namespace.",
+    )
+
+    project_id: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-example-model@v1"),
+        description="The NVIDIA project ID.",
+    )
+
+    # ToDO: validate this, add default value
+    customizer_url: str | None = Field(
+        default_factory=lambda: os.getenv("NVIDIA_CUSTOMIZER_URL"),
+        description="Base URL for the NeMo Customizer API",
+    )
+
+    timeout: int = Field(
+        default=300,
+        description="Timeout for the NVIDIA Post Training API",
+    )
+
+    max_retries: int = Field(
+        default=3,
+        description="Maximum number of retries for the NVIDIA Post Training API",
+    )
+
+    # ToDo: validate this
+    output_model_dir: str = Field(
+        default_factory=lambda: os.getenv("NVIDIA_OUTPUT_MODEL_DIR", "test-example-model@v1"),
+        description="Directory to save the output model",
+    )
+
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+        return {
+            "api_key": "${env.NVIDIA_API_KEY:=}",
+            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:=default}",
+            "project_id": "${env.NVIDIA_PROJECT_ID:=test-project}",
+            "customizer_url": "${env.NVIDIA_CUSTOMIZER_URL:=http://nemo.test}",
+        }
+
+
+class SFTLoRADefaultConfig(BaseModel):
+    """NVIDIA-specific training configuration with default values."""
+
+    # ToDo: split into SFT and LoRA configs??
+
+    # General training parameters
+    n_epochs: int = 50
+
+    # NeMo customizer specific parameters
+    log_every_n_steps: int | None = None
+    val_check_interval: float = 0.25
+    sequence_packing_enabled: bool = False
+    weight_decay: float = 0.01
+    lr: float = 0.0001
+
+    # SFT specific parameters
+    hidden_dropout: float | None = None
+    attention_dropout: float | None = None
+    ffn_dropout: float | None = None
+
+    # LoRA default parameters
+    lora_adapter_dim: int = 8
+    lora_adapter_dropout: float | None = None
+    lora_alpha: int = 16
+
+    # Data config
+    batch_size: int = 8
+
+    @classmethod
+    def sample_config(cls) -> dict[str, Any]:
+        """Return a sample configuration for NVIDIA training."""
+        return {
+            "n_epochs": 50,
+            "log_every_n_steps": 10,
+            "val_check_interval": 0.25,
+            "sequence_packing_enabled": False,
+            "weight_decay": 0.01,
+            "hidden_dropout": 0.1,
+            "attention_dropout": 0.1,
+            "lora_adapter_dim": 8,
+            "lora_alpha": 16,
+            "data_config": {
+                "dataset_id": "default",
+                "batch_size": 8,
+            },
+            "optimizer_config": {
+                "lr": 0.0001,
+            },
+        }
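Because every field of `NvidiaPostTrainingConfig` either has a literal default or a `default_factory` that reads the environment, the config can be constructed with no arguments. A small sketch of that behaviour (the import path and the customizer URL are assumptions; the other values come from the defaults above):

import os

# Assumed import path for illustration only.
from llama_stack.providers.remote.post_training.nvidia.config import NvidiaPostTrainingConfig

os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"  # illustrative value, not a real endpoint

config = NvidiaPostTrainingConfig()
print(config.dataset_namespace)            # "default" unless NVIDIA_DATASET_NAMESPACE is set
print(config.customizer_url)               # "http://nemo.test"
print(config.timeout, config.max_retries)  # 300 3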
@@ -0,0 +1,27 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+
+from llama_stack.models.llama.sku_types import CoreModelId
+from llama_stack.providers.utils.inference.model_registry import (
+    ProviderModelEntry,
+    build_hf_repo_model_entry,
+)
+
+_MODEL_ENTRIES = [
+    build_hf_repo_model_entry(
+        "meta/llama-3.1-8b-instruct",
+        CoreModelId.llama3_1_8b_instruct.value,
+    ),
+    build_hf_repo_model_entry(
+        "meta/llama-3.2-1b-instruct",
+        CoreModelId.llama3_2_1b_instruct.value,
+    ),
+]
+
+
+def get_model_entries() -> list[ProviderModelEntry]:
+    return _MODEL_ENTRIES