llama-stack 0.0.42__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their respective public registries.
- llama_stack/__init__.py +5 -0
- llama_stack/apis/agents/__init__.py +1 -1
- llama_stack/apis/agents/agents.py +700 -281
- llama_stack/apis/agents/openai_responses.py +1311 -0
- llama_stack/{providers/adapters/memory/sample/config.py → apis/batches/__init__.py} +2 -5
- llama_stack/apis/batches/batches.py +100 -0
- llama_stack/apis/benchmarks/__init__.py +7 -0
- llama_stack/apis/benchmarks/benchmarks.py +108 -0
- llama_stack/apis/common/content_types.py +143 -0
- llama_stack/apis/common/errors.py +103 -0
- llama_stack/apis/common/job_types.py +38 -0
- llama_stack/apis/common/responses.py +36 -0
- llama_stack/apis/common/training_types.py +36 -5
- llama_stack/apis/common/type_system.py +158 -0
- llama_stack/apis/conversations/__init__.py +31 -0
- llama_stack/apis/conversations/conversations.py +286 -0
- llama_stack/apis/datasetio/__init__.py +7 -0
- llama_stack/apis/datasetio/datasetio.py +59 -0
- llama_stack/apis/datasets/__init__.py +7 -0
- llama_stack/apis/datasets/datasets.py +251 -0
- llama_stack/apis/datatypes.py +160 -0
- llama_stack/apis/eval/__init__.py +7 -0
- llama_stack/apis/eval/eval.py +169 -0
- llama_stack/apis/files/__init__.py +7 -0
- llama_stack/apis/files/files.py +199 -0
- llama_stack/apis/inference/__init__.py +1 -1
- llama_stack/apis/inference/inference.py +1169 -113
- llama_stack/apis/inspect/__init__.py +1 -1
- llama_stack/apis/inspect/inspect.py +69 -16
- llama_stack/apis/models/__init__.py +1 -1
- llama_stack/apis/models/models.py +148 -21
- llama_stack/apis/post_training/__init__.py +1 -1
- llama_stack/apis/post_training/post_training.py +265 -120
- llama_stack/{providers/adapters/agents/sample/config.py → apis/prompts/__init__.py} +2 -5
- llama_stack/apis/prompts/prompts.py +204 -0
- llama_stack/apis/providers/__init__.py +7 -0
- llama_stack/apis/providers/providers.py +69 -0
- llama_stack/apis/resource.py +37 -0
- llama_stack/apis/safety/__init__.py +1 -1
- llama_stack/apis/safety/safety.py +95 -12
- llama_stack/apis/scoring/__init__.py +7 -0
- llama_stack/apis/scoring/scoring.py +93 -0
- llama_stack/apis/scoring_functions/__init__.py +7 -0
- llama_stack/apis/scoring_functions/scoring_functions.py +208 -0
- llama_stack/apis/shields/__init__.py +1 -1
- llama_stack/apis/shields/shields.py +76 -33
- llama_stack/apis/synthetic_data_generation/__init__.py +1 -1
- llama_stack/apis/synthetic_data_generation/synthetic_data_generation.py +40 -17
- llama_stack/apis/telemetry/__init__.py +1 -1
- llama_stack/apis/telemetry/telemetry.py +322 -31
- llama_stack/apis/{dataset → tools}/__init__.py +2 -1
- llama_stack/apis/tools/rag_tool.py +218 -0
- llama_stack/apis/tools/tools.py +221 -0
- llama_stack/apis/vector_io/__init__.py +7 -0
- llama_stack/apis/vector_io/vector_io.py +960 -0
- llama_stack/apis/vector_stores/__init__.py +7 -0
- llama_stack/apis/vector_stores/vector_stores.py +51 -0
- llama_stack/apis/version.py +9 -0
- llama_stack/cli/llama.py +13 -5
- llama_stack/cli/stack/_list_deps.py +182 -0
- llama_stack/cli/stack/list_apis.py +1 -1
- llama_stack/cli/stack/list_deps.py +55 -0
- llama_stack/cli/stack/list_providers.py +24 -10
- llama_stack/cli/stack/list_stacks.py +56 -0
- llama_stack/cli/stack/remove.py +115 -0
- llama_stack/cli/stack/run.py +169 -56
- llama_stack/cli/stack/stack.py +18 -4
- llama_stack/cli/stack/utils.py +151 -0
- llama_stack/cli/table.py +23 -61
- llama_stack/cli/utils.py +29 -0
- llama_stack/core/access_control/access_control.py +131 -0
- llama_stack/core/access_control/conditions.py +129 -0
- llama_stack/core/access_control/datatypes.py +107 -0
- llama_stack/core/build.py +164 -0
- llama_stack/core/client.py +205 -0
- llama_stack/core/common.sh +37 -0
- llama_stack/{distribution → core}/configure.py +74 -55
- llama_stack/core/conversations/conversations.py +309 -0
- llama_stack/core/datatypes.py +625 -0
- llama_stack/core/distribution.py +276 -0
- llama_stack/core/external.py +54 -0
- llama_stack/core/id_generation.py +42 -0
- llama_stack/core/inspect.py +86 -0
- llama_stack/core/library_client.py +539 -0
- llama_stack/core/prompts/prompts.py +234 -0
- llama_stack/core/providers.py +137 -0
- llama_stack/core/request_headers.py +115 -0
- llama_stack/core/resolver.py +506 -0
- llama_stack/core/routers/__init__.py +101 -0
- llama_stack/core/routers/datasets.py +73 -0
- llama_stack/core/routers/eval_scoring.py +155 -0
- llama_stack/core/routers/inference.py +645 -0
- llama_stack/core/routers/safety.py +85 -0
- llama_stack/core/routers/tool_runtime.py +91 -0
- llama_stack/core/routers/vector_io.py +442 -0
- llama_stack/core/routing_tables/benchmarks.py +62 -0
- llama_stack/core/routing_tables/common.py +254 -0
- llama_stack/core/routing_tables/datasets.py +91 -0
- llama_stack/core/routing_tables/models.py +163 -0
- llama_stack/core/routing_tables/scoring_functions.py +66 -0
- llama_stack/core/routing_tables/shields.py +61 -0
- llama_stack/core/routing_tables/toolgroups.py +129 -0
- llama_stack/core/routing_tables/vector_stores.py +292 -0
- llama_stack/core/server/auth.py +187 -0
- llama_stack/core/server/auth_providers.py +494 -0
- llama_stack/core/server/quota.py +110 -0
- llama_stack/core/server/routes.py +141 -0
- llama_stack/core/server/server.py +542 -0
- llama_stack/core/server/tracing.py +80 -0
- llama_stack/core/stack.py +546 -0
- llama_stack/core/start_stack.sh +117 -0
- llama_stack/core/storage/datatypes.py +283 -0
- llama_stack/{cli/model → core/store}/__init__.py +1 -1
- llama_stack/core/store/registry.py +199 -0
- llama_stack/core/testing_context.py +49 -0
- llama_stack/core/ui/app.py +55 -0
- llama_stack/core/ui/modules/api.py +32 -0
- llama_stack/core/ui/modules/utils.py +42 -0
- llama_stack/core/ui/page/distribution/datasets.py +18 -0
- llama_stack/core/ui/page/distribution/eval_tasks.py +20 -0
- llama_stack/core/ui/page/distribution/models.py +18 -0
- llama_stack/core/ui/page/distribution/providers.py +27 -0
- llama_stack/core/ui/page/distribution/resources.py +48 -0
- llama_stack/core/ui/page/distribution/scoring_functions.py +18 -0
- llama_stack/core/ui/page/distribution/shields.py +19 -0
- llama_stack/core/ui/page/evaluations/app_eval.py +143 -0
- llama_stack/core/ui/page/evaluations/native_eval.py +253 -0
- llama_stack/core/ui/page/playground/chat.py +130 -0
- llama_stack/core/ui/page/playground/tools.py +352 -0
- llama_stack/core/utils/config.py +30 -0
- llama_stack/{distribution → core}/utils/config_dirs.py +3 -6
- llama_stack/core/utils/config_resolution.py +125 -0
- llama_stack/core/utils/context.py +84 -0
- llama_stack/core/utils/exec.py +96 -0
- llama_stack/{providers/impls/meta_reference/codeshield/config.py → core/utils/image_types.py} +4 -3
- llama_stack/{distribution → core}/utils/model_utils.py +2 -2
- llama_stack/{distribution → core}/utils/prompt_for_config.py +30 -63
- llama_stack/{apis/batch_inference → distributions/dell}/__init__.py +1 -1
- llama_stack/distributions/dell/build.yaml +33 -0
- llama_stack/distributions/dell/dell.py +158 -0
- llama_stack/distributions/dell/run-with-safety.yaml +141 -0
- llama_stack/distributions/dell/run.yaml +132 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +7 -0
- llama_stack/distributions/meta-reference-gpu/build.yaml +32 -0
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +163 -0
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +154 -0
- llama_stack/distributions/meta-reference-gpu/run.yaml +139 -0
- llama_stack/{apis/evals → distributions/nvidia}/__init__.py +1 -1
- llama_stack/distributions/nvidia/build.yaml +29 -0
- llama_stack/distributions/nvidia/nvidia.py +154 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +137 -0
- llama_stack/distributions/nvidia/run.yaml +116 -0
- llama_stack/distributions/open-benchmark/__init__.py +7 -0
- llama_stack/distributions/open-benchmark/build.yaml +36 -0
- llama_stack/distributions/open-benchmark/open_benchmark.py +303 -0
- llama_stack/distributions/open-benchmark/run.yaml +252 -0
- llama_stack/distributions/postgres-demo/__init__.py +7 -0
- llama_stack/distributions/postgres-demo/build.yaml +23 -0
- llama_stack/distributions/postgres-demo/postgres_demo.py +125 -0
- llama_stack/distributions/postgres-demo/run.yaml +115 -0
- llama_stack/{apis/memory → distributions/starter}/__init__.py +1 -1
- llama_stack/distributions/starter/build.yaml +61 -0
- llama_stack/distributions/starter/run-with-postgres-store.yaml +285 -0
- llama_stack/distributions/starter/run.yaml +276 -0
- llama_stack/distributions/starter/starter.py +345 -0
- llama_stack/distributions/starter-gpu/__init__.py +7 -0
- llama_stack/distributions/starter-gpu/build.yaml +61 -0
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +288 -0
- llama_stack/distributions/starter-gpu/run.yaml +279 -0
- llama_stack/distributions/starter-gpu/starter_gpu.py +20 -0
- llama_stack/distributions/template.py +456 -0
- llama_stack/distributions/watsonx/__init__.py +7 -0
- llama_stack/distributions/watsonx/build.yaml +33 -0
- llama_stack/distributions/watsonx/run.yaml +133 -0
- llama_stack/distributions/watsonx/watsonx.py +95 -0
- llama_stack/env.py +24 -0
- llama_stack/log.py +314 -0
- llama_stack/models/llama/checkpoint.py +164 -0
- llama_stack/models/llama/datatypes.py +164 -0
- llama_stack/models/llama/hadamard_utils.py +86 -0
- llama_stack/models/llama/llama3/args.py +74 -0
- llama_stack/models/llama/llama3/chat_format.py +286 -0
- llama_stack/models/llama/llama3/generation.py +376 -0
- llama_stack/models/llama/llama3/interface.py +255 -0
- llama_stack/models/llama/llama3/model.py +304 -0
- llama_stack/models/llama/llama3/multimodal/__init__.py +12 -0
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +180 -0
- llama_stack/models/llama/llama3/multimodal/image_transform.py +409 -0
- llama_stack/models/llama/llama3/multimodal/model.py +1430 -0
- llama_stack/models/llama/llama3/multimodal/utils.py +26 -0
- llama_stack/models/llama/llama3/prompt_templates/__init__.py +22 -0
- llama_stack/models/llama/llama3/prompt_templates/base.py +39 -0
- llama_stack/models/llama/llama3/prompt_templates/system_prompts.py +319 -0
- llama_stack/models/llama/llama3/prompt_templates/tool_response.py +62 -0
- llama_stack/models/llama/llama3/quantization/loader.py +316 -0
- llama_stack/models/llama/llama3/template_data.py +116 -0
- llama_stack/models/llama/llama3/tokenizer.model +128000 -0
- llama_stack/models/llama/llama3/tokenizer.py +198 -0
- llama_stack/models/llama/llama3/tool_utils.py +266 -0
- llama_stack/models/llama/llama3_1/__init__.py +12 -0
- llama_stack/models/llama/llama3_1/prompt_format.md +358 -0
- llama_stack/models/llama/llama3_1/prompts.py +258 -0
- llama_stack/models/llama/llama3_2/prompts_text.py +229 -0
- llama_stack/models/llama/llama3_2/prompts_vision.py +126 -0
- llama_stack/models/llama/llama3_2/text_prompt_format.md +286 -0
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +141 -0
- llama_stack/models/llama/llama3_3/prompts.py +259 -0
- llama_stack/models/llama/llama4/args.py +107 -0
- llama_stack/models/llama/llama4/chat_format.py +317 -0
- llama_stack/models/llama/llama4/datatypes.py +56 -0
- llama_stack/models/llama/llama4/ffn.py +58 -0
- llama_stack/models/llama/llama4/generation.py +313 -0
- llama_stack/models/llama/llama4/model.py +437 -0
- llama_stack/models/llama/llama4/moe.py +214 -0
- llama_stack/models/llama/llama4/preprocess.py +435 -0
- llama_stack/models/llama/llama4/prompt_format.md +304 -0
- llama_stack/models/llama/llama4/prompt_templates/system_prompts.py +136 -0
- llama_stack/models/llama/llama4/prompts.py +279 -0
- llama_stack/models/llama/llama4/quantization/__init__.py +5 -0
- llama_stack/models/llama/llama4/quantization/loader.py +226 -0
- llama_stack/models/llama/llama4/tokenizer.model +200000 -0
- llama_stack/models/llama/llama4/tokenizer.py +263 -0
- llama_stack/models/llama/llama4/vision/__init__.py +5 -0
- llama_stack/models/llama/llama4/vision/embedding.py +210 -0
- llama_stack/models/llama/llama4/vision/encoder.py +412 -0
- llama_stack/models/llama/prompt_format.py +191 -0
- llama_stack/models/llama/quantize_impls.py +316 -0
- llama_stack/models/llama/sku_list.py +1029 -0
- llama_stack/models/llama/sku_types.py +233 -0
- llama_stack/models/llama/tokenizer_utils.py +40 -0
- llama_stack/providers/datatypes.py +136 -107
- llama_stack/providers/inline/__init__.py +5 -0
- llama_stack/providers/inline/agents/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/agents/meta_reference}/__init__.py +12 -5
- llama_stack/providers/inline/agents/meta_reference/agent_instance.py +1024 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +383 -0
- llama_stack/providers/inline/agents/meta_reference/config.py +37 -0
- llama_stack/providers/inline/agents/meta_reference/persistence.py +228 -0
- llama_stack/providers/inline/agents/meta_reference/responses/__init__.py +5 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +423 -0
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +1226 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +449 -0
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +194 -0
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +365 -0
- llama_stack/providers/inline/agents/meta_reference/safety.py +52 -0
- llama_stack/providers/inline/batches/__init__.py +5 -0
- llama_stack/providers/inline/batches/reference/__init__.py +36 -0
- llama_stack/providers/inline/batches/reference/batches.py +679 -0
- llama_stack/providers/inline/batches/reference/config.py +40 -0
- llama_stack/providers/inline/datasetio/__init__.py +5 -0
- llama_stack/providers/inline/datasetio/localfs/__init__.py +20 -0
- llama_stack/providers/inline/datasetio/localfs/config.py +23 -0
- llama_stack/providers/inline/datasetio/localfs/datasetio.py +113 -0
- llama_stack/providers/inline/eval/__init__.py +5 -0
- llama_stack/providers/inline/eval/meta_reference/__init__.py +28 -0
- llama_stack/providers/inline/eval/meta_reference/config.py +23 -0
- llama_stack/providers/inline/eval/meta_reference/eval.py +259 -0
- llama_stack/providers/inline/files/localfs/__init__.py +20 -0
- llama_stack/providers/inline/files/localfs/config.py +31 -0
- llama_stack/providers/inline/files/localfs/files.py +219 -0
- llama_stack/providers/inline/inference/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/__init__.py +4 -4
- llama_stack/providers/inline/inference/meta_reference/common.py +24 -0
- llama_stack/providers/inline/inference/meta_reference/config.py +68 -0
- llama_stack/providers/inline/inference/meta_reference/generators.py +211 -0
- llama_stack/providers/inline/inference/meta_reference/inference.py +158 -0
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +96 -0
- llama_stack/providers/{impls/meta_reference/inference → inline/inference/meta_reference}/parallel_utils.py +56 -73
- llama_stack/providers/inline/inference/sentence_transformers/__init__.py +22 -0
- llama_stack/providers/{impls/meta_reference/agents → inline/inference/sentence_transformers}/config.py +6 -4
- llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py +83 -0
- llama_stack/providers/inline/post_training/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/common/utils.py +35 -0
- llama_stack/providers/inline/post_training/common/validator.py +36 -0
- llama_stack/providers/inline/post_training/huggingface/__init__.py +27 -0
- llama_stack/providers/inline/post_training/huggingface/config.py +83 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +208 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device.py +519 -0
- llama_stack/providers/inline/post_training/huggingface/recipes/finetune_single_device_dpo.py +485 -0
- llama_stack/providers/inline/post_training/huggingface/utils.py +269 -0
- llama_stack/providers/inline/post_training/torchtune/__init__.py +27 -0
- llama_stack/providers/inline/post_training/torchtune/common/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/common/checkpointer.py +240 -0
- llama_stack/providers/inline/post_training/torchtune/common/utils.py +99 -0
- llama_stack/providers/inline/post_training/torchtune/config.py +20 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/format_adapter.py +57 -0
- llama_stack/providers/inline/post_training/torchtune/datasets/sft.py +78 -0
- llama_stack/providers/inline/post_training/torchtune/post_training.py +178 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/__init__.py +5 -0
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +588 -0
- llama_stack/providers/inline/safety/__init__.py +5 -0
- llama_stack/providers/{impls/meta_reference/codeshield → inline/safety/code_scanner}/__init__.py +4 -2
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +128 -0
- llama_stack/providers/{impls/meta_reference/memory → inline/safety/code_scanner}/config.py +5 -3
- llama_stack/providers/inline/safety/llama_guard/__init__.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/config.py +19 -0
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +489 -0
- llama_stack/providers/{adapters/memory/sample → inline/safety/prompt_guard}/__init__.py +4 -4
- llama_stack/providers/inline/safety/prompt_guard/config.py +32 -0
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +131 -0
- llama_stack/providers/inline/scoring/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/__init__.py +25 -0
- llama_stack/providers/{adapters/memory/weaviate → inline/scoring/basic}/config.py +5 -7
- llama_stack/providers/inline/scoring/basic/scoring.py +126 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py +240 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py +41 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py +23 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py +27 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py +71 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py +21 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py +80 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py +66 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py +58 -0
- llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py +38 -0
- llama_stack/providers/inline/scoring/basic/utils/__init__.py +5 -0
- llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py +3319 -0
- llama_stack/providers/inline/scoring/basic/utils/math_utils.py +330 -0
- llama_stack/providers/inline/scoring/braintrust/__init__.py +27 -0
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +230 -0
- llama_stack/providers/inline/scoring/braintrust/config.py +21 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py +23 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py +24 -0
- llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py +24 -0
- llama_stack/providers/inline/scoring/llm_as_judge/__init__.py +21 -0
- llama_stack/providers/inline/scoring/llm_as_judge/config.py +14 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +113 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py +5 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py +96 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py +20 -0
- llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py +81 -0
- llama_stack/providers/inline/telemetry/__init__.py +5 -0
- llama_stack/providers/inline/telemetry/meta_reference/__init__.py +21 -0
- llama_stack/providers/inline/telemetry/meta_reference/config.py +47 -0
- llama_stack/providers/inline/telemetry/meta_reference/telemetry.py +252 -0
- llama_stack/providers/inline/tool_runtime/__init__.py +5 -0
- llama_stack/providers/inline/tool_runtime/rag/__init__.py +19 -0
- llama_stack/providers/{impls/meta_reference/telemetry → inline/tool_runtime/rag}/config.py +5 -3
- llama_stack/providers/inline/tool_runtime/rag/context_retriever.py +77 -0
- llama_stack/providers/inline/tool_runtime/rag/memory.py +332 -0
- llama_stack/providers/inline/vector_io/__init__.py +5 -0
- llama_stack/providers/inline/vector_io/chroma/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/chroma/config.py +30 -0
- llama_stack/providers/inline/vector_io/faiss/__init__.py +21 -0
- llama_stack/providers/inline/vector_io/faiss/config.py +26 -0
- llama_stack/providers/inline/vector_io/faiss/faiss.py +293 -0
- llama_stack/providers/inline/vector_io/milvus/__init__.py +19 -0
- llama_stack/providers/inline/vector_io/milvus/config.py +29 -0
- llama_stack/providers/inline/vector_io/qdrant/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/qdrant/config.py +29 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/__init__.py +20 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/config.py +26 -0
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +483 -0
- llama_stack/providers/registry/agents.py +16 -18
- llama_stack/providers/registry/batches.py +26 -0
- llama_stack/providers/registry/datasetio.py +49 -0
- llama_stack/providers/registry/eval.py +46 -0
- llama_stack/providers/registry/files.py +31 -0
- llama_stack/providers/registry/inference.py +273 -118
- llama_stack/providers/registry/post_training.py +69 -0
- llama_stack/providers/registry/safety.py +46 -41
- llama_stack/providers/registry/scoring.py +51 -0
- llama_stack/providers/registry/tool_runtime.py +87 -0
- llama_stack/providers/registry/vector_io.py +828 -0
- llama_stack/providers/remote/__init__.py +5 -0
- llama_stack/providers/remote/agents/__init__.py +5 -0
- llama_stack/providers/remote/datasetio/__init__.py +5 -0
- llama_stack/providers/{adapters/memory/chroma → remote/datasetio/huggingface}/__init__.py +7 -4
- llama_stack/providers/remote/datasetio/huggingface/config.py +23 -0
- llama_stack/providers/remote/datasetio/huggingface/huggingface.py +99 -0
- llama_stack/providers/remote/datasetio/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/datasetio/nvidia/config.py +61 -0
- llama_stack/providers/remote/datasetio/nvidia/datasetio.py +116 -0
- llama_stack/providers/remote/eval/__init__.py +5 -0
- llama_stack/providers/remote/eval/nvidia/__init__.py +31 -0
- llama_stack/providers/remote/eval/nvidia/config.py +29 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +162 -0
- llama_stack/providers/remote/files/s3/__init__.py +19 -0
- llama_stack/providers/remote/files/s3/config.py +42 -0
- llama_stack/providers/remote/files/s3/files.py +313 -0
- llama_stack/providers/remote/inference/__init__.py +5 -0
- llama_stack/providers/{adapters/safety/sample → remote/inference/anthropic}/__init__.py +4 -6
- llama_stack/providers/remote/inference/anthropic/anthropic.py +36 -0
- llama_stack/providers/remote/inference/anthropic/config.py +28 -0
- llama_stack/providers/{impls/meta_reference/telemetry → remote/inference/azure}/__init__.py +4 -4
- llama_stack/providers/remote/inference/azure/azure.py +25 -0
- llama_stack/providers/remote/inference/azure/config.py +61 -0
- llama_stack/providers/{adapters → remote}/inference/bedrock/__init__.py +18 -17
- llama_stack/providers/remote/inference/bedrock/bedrock.py +142 -0
- llama_stack/providers/{adapters/inference/sample → remote/inference/bedrock}/config.py +3 -4
- llama_stack/providers/remote/inference/bedrock/models.py +29 -0
- llama_stack/providers/remote/inference/cerebras/__init__.py +19 -0
- llama_stack/providers/remote/inference/cerebras/cerebras.py +28 -0
- llama_stack/providers/remote/inference/cerebras/config.py +30 -0
- llama_stack/providers/{adapters → remote}/inference/databricks/__init__.py +4 -5
- llama_stack/providers/remote/inference/databricks/config.py +37 -0
- llama_stack/providers/remote/inference/databricks/databricks.py +44 -0
- llama_stack/providers/{adapters → remote}/inference/fireworks/__init__.py +8 -4
- llama_stack/providers/remote/inference/fireworks/config.py +27 -0
- llama_stack/providers/remote/inference/fireworks/fireworks.py +27 -0
- llama_stack/providers/{adapters/memory/pgvector → remote/inference/gemini}/__init__.py +4 -4
- llama_stack/providers/remote/inference/gemini/config.py +28 -0
- llama_stack/providers/remote/inference/gemini/gemini.py +82 -0
- llama_stack/providers/remote/inference/groq/__init__.py +15 -0
- llama_stack/providers/remote/inference/groq/config.py +34 -0
- llama_stack/providers/remote/inference/groq/groq.py +18 -0
- llama_stack/providers/remote/inference/llama_openai_compat/__init__.py +15 -0
- llama_stack/providers/remote/inference/llama_openai_compat/config.py +34 -0
- llama_stack/providers/remote/inference/llama_openai_compat/llama.py +46 -0
- llama_stack/providers/remote/inference/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/inference/nvidia/config.py +64 -0
- llama_stack/providers/remote/inference/nvidia/nvidia.py +61 -0
- llama_stack/providers/{adapters/safety/sample/config.py → remote/inference/nvidia/utils.py} +3 -4
- llama_stack/providers/{impls/vllm → remote/inference/ollama}/__init__.py +4 -6
- llama_stack/providers/remote/inference/ollama/config.py +25 -0
- llama_stack/providers/remote/inference/ollama/ollama.py +102 -0
- llama_stack/providers/{adapters/telemetry/opentelemetry → remote/inference/openai}/__init__.py +4 -4
- llama_stack/providers/remote/inference/openai/config.py +39 -0
- llama_stack/providers/remote/inference/openai/openai.py +38 -0
- llama_stack/providers/remote/inference/passthrough/__init__.py +23 -0
- llama_stack/providers/remote/inference/passthrough/config.py +34 -0
- llama_stack/providers/remote/inference/passthrough/passthrough.py +122 -0
- llama_stack/providers/remote/inference/runpod/__init__.py +16 -0
- llama_stack/providers/remote/inference/runpod/config.py +32 -0
- llama_stack/providers/remote/inference/runpod/runpod.py +42 -0
- llama_stack/providers/remote/inference/sambanova/__init__.py +16 -0
- llama_stack/providers/remote/inference/sambanova/config.py +34 -0
- llama_stack/providers/remote/inference/sambanova/sambanova.py +28 -0
- llama_stack/providers/{adapters → remote}/inference/tgi/__init__.py +3 -4
- llama_stack/providers/remote/inference/tgi/config.py +76 -0
- llama_stack/providers/remote/inference/tgi/tgi.py +85 -0
- llama_stack/providers/{adapters → remote}/inference/together/__init__.py +8 -4
- llama_stack/providers/remote/inference/together/config.py +27 -0
- llama_stack/providers/remote/inference/together/together.py +102 -0
- llama_stack/providers/remote/inference/vertexai/__init__.py +15 -0
- llama_stack/providers/remote/inference/vertexai/config.py +48 -0
- llama_stack/providers/remote/inference/vertexai/vertexai.py +54 -0
- llama_stack/providers/remote/inference/vllm/__init__.py +22 -0
- llama_stack/providers/remote/inference/vllm/config.py +59 -0
- llama_stack/providers/remote/inference/vllm/vllm.py +111 -0
- llama_stack/providers/remote/inference/watsonx/__init__.py +15 -0
- llama_stack/providers/remote/inference/watsonx/config.py +45 -0
- llama_stack/providers/remote/inference/watsonx/watsonx.py +336 -0
- llama_stack/providers/remote/post_training/__init__.py +5 -0
- llama_stack/providers/remote/post_training/nvidia/__init__.py +23 -0
- llama_stack/providers/remote/post_training/nvidia/config.py +113 -0
- llama_stack/providers/remote/post_training/nvidia/models.py +27 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +430 -0
- llama_stack/providers/remote/post_training/nvidia/utils.py +63 -0
- llama_stack/providers/remote/safety/__init__.py +5 -0
- llama_stack/providers/remote/safety/bedrock/bedrock.py +111 -0
- llama_stack/providers/remote/safety/bedrock/config.py +14 -0
- llama_stack/providers/{adapters/inference/sample → remote/safety/nvidia}/__init__.py +5 -4
- llama_stack/providers/remote/safety/nvidia/config.py +40 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +161 -0
- llama_stack/providers/{adapters/agents/sample → remote/safety/sambanova}/__init__.py +5 -4
- llama_stack/providers/remote/safety/sambanova/config.py +37 -0
- llama_stack/providers/remote/safety/sambanova/sambanova.py +98 -0
- llama_stack/providers/remote/tool_runtime/__init__.py +5 -0
- llama_stack/providers/remote/tool_runtime/bing_search/__init__.py +21 -0
- llama_stack/providers/remote/tool_runtime/bing_search/bing_search.py +112 -0
- llama_stack/providers/remote/tool_runtime/bing_search/config.py +22 -0
- llama_stack/providers/remote/tool_runtime/brave_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/brave_search/brave_search.py +148 -0
- llama_stack/providers/remote/tool_runtime/brave_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/__init__.py +15 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/config.py +20 -0
- llama_stack/providers/remote/tool_runtime/model_context_protocol/model_context_protocol.py +73 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/__init__.py +20 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/config.py +27 -0
- llama_stack/providers/remote/tool_runtime/tavily_search/tavily_search.py +84 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/__init__.py +22 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/config.py +21 -0
- llama_stack/providers/remote/tool_runtime/wolfram_alpha/wolfram_alpha.py +140 -0
- llama_stack/providers/remote/vector_io/__init__.py +5 -0
- llama_stack/providers/remote/vector_io/chroma/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/chroma/chroma.py +215 -0
- llama_stack/providers/remote/vector_io/chroma/config.py +28 -0
- llama_stack/providers/remote/vector_io/milvus/__init__.py +18 -0
- llama_stack/providers/remote/vector_io/milvus/config.py +35 -0
- llama_stack/providers/remote/vector_io/milvus/milvus.py +375 -0
- llama_stack/providers/remote/vector_io/pgvector/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +47 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +460 -0
- llama_stack/providers/remote/vector_io/qdrant/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/qdrant/config.py +37 -0
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +265 -0
- llama_stack/providers/remote/vector_io/weaviate/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/weaviate/config.py +32 -0
- llama_stack/providers/remote/vector_io/weaviate/weaviate.py +393 -0
- llama_stack/providers/utils/bedrock/__init__.py +5 -0
- llama_stack/providers/utils/bedrock/client.py +74 -0
- llama_stack/providers/utils/bedrock/config.py +64 -0
- llama_stack/providers/utils/bedrock/refreshable_boto_session.py +112 -0
- llama_stack/providers/utils/common/__init__.py +5 -0
- llama_stack/providers/utils/common/data_schema_validator.py +103 -0
- llama_stack/providers/utils/datasetio/__init__.py +5 -0
- llama_stack/providers/utils/datasetio/url_utils.py +47 -0
- llama_stack/providers/utils/files/__init__.py +5 -0
- llama_stack/providers/utils/files/form_data.py +69 -0
- llama_stack/providers/utils/inference/__init__.py +8 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +101 -0
- llama_stack/providers/utils/inference/inference_store.py +264 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +336 -0
- llama_stack/providers/utils/inference/model_registry.py +173 -23
- llama_stack/providers/utils/inference/openai_compat.py +1261 -49
- llama_stack/providers/utils/inference/openai_mixin.py +506 -0
- llama_stack/providers/utils/inference/prompt_adapter.py +365 -67
- llama_stack/providers/utils/kvstore/api.py +6 -6
- llama_stack/providers/utils/kvstore/config.py +28 -48
- llama_stack/providers/utils/kvstore/kvstore.py +61 -15
- llama_stack/providers/utils/kvstore/mongodb/__init__.py +9 -0
- llama_stack/providers/utils/kvstore/mongodb/mongodb.py +82 -0
- llama_stack/providers/utils/kvstore/postgres/__init__.py +7 -0
- llama_stack/providers/utils/kvstore/postgres/postgres.py +114 -0
- llama_stack/providers/utils/kvstore/redis/redis.py +33 -9
- llama_stack/providers/utils/kvstore/sqlite/config.py +2 -1
- llama_stack/providers/utils/kvstore/sqlite/sqlite.py +123 -22
- llama_stack/providers/utils/memory/file_utils.py +1 -1
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +1304 -0
- llama_stack/providers/utils/memory/vector_store.py +220 -82
- llama_stack/providers/utils/pagination.py +43 -0
- llama_stack/providers/utils/responses/__init__.py +5 -0
- llama_stack/providers/utils/responses/responses_store.py +292 -0
- llama_stack/providers/utils/scheduler.py +270 -0
- llama_stack/providers/utils/scoring/__init__.py +5 -0
- llama_stack/providers/utils/scoring/aggregation_utils.py +75 -0
- llama_stack/providers/utils/scoring/base_scoring_fn.py +114 -0
- llama_stack/providers/utils/scoring/basic_scoring_utils.py +26 -0
- llama_stack/providers/utils/sqlstore/__init__.py +5 -0
- llama_stack/providers/utils/sqlstore/api.py +128 -0
- llama_stack/providers/utils/sqlstore/authorized_sqlstore.py +319 -0
- llama_stack/providers/utils/sqlstore/sqlalchemy_sqlstore.py +343 -0
- llama_stack/providers/utils/sqlstore/sqlstore.py +70 -0
- llama_stack/providers/utils/telemetry/trace_protocol.py +142 -0
- llama_stack/providers/utils/telemetry/tracing.py +192 -53
- llama_stack/providers/utils/tools/__init__.py +5 -0
- llama_stack/providers/utils/tools/mcp.py +148 -0
- llama_stack/providers/utils/tools/ttl_dict.py +70 -0
- llama_stack/providers/utils/vector_io/__init__.py +5 -0
- llama_stack/providers/utils/vector_io/vector_utils.py +156 -0
- llama_stack/schema_utils.py +118 -0
- llama_stack/strong_typing/__init__.py +19 -0
- llama_stack/strong_typing/auxiliary.py +228 -0
- llama_stack/strong_typing/classdef.py +440 -0
- llama_stack/strong_typing/core.py +46 -0
- llama_stack/strong_typing/deserializer.py +877 -0
- llama_stack/strong_typing/docstring.py +409 -0
- llama_stack/strong_typing/exception.py +23 -0
- llama_stack/strong_typing/inspection.py +1085 -0
- llama_stack/strong_typing/mapping.py +40 -0
- llama_stack/strong_typing/name.py +182 -0
- llama_stack/strong_typing/py.typed +0 -0
- llama_stack/strong_typing/schema.py +792 -0
- llama_stack/strong_typing/serialization.py +97 -0
- llama_stack/strong_typing/serializer.py +500 -0
- llama_stack/strong_typing/slots.py +27 -0
- llama_stack/strong_typing/topological.py +89 -0
- llama_stack/testing/__init__.py +5 -0
- llama_stack/testing/api_recorder.py +956 -0
- llama_stack/ui/node_modules/flatted/python/flatted.py +149 -0
- llama_stack-0.3.4.dist-info/METADATA +261 -0
- llama_stack-0.3.4.dist-info/RECORD +625 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/WHEEL +1 -1
- llama_stack/apis/agents/client.py +0 -292
- llama_stack/apis/agents/event_logger.py +0 -184
- llama_stack/apis/batch_inference/batch_inference.py +0 -72
- llama_stack/apis/common/deployment_types.py +0 -31
- llama_stack/apis/dataset/dataset.py +0 -63
- llama_stack/apis/evals/evals.py +0 -122
- llama_stack/apis/inference/client.py +0 -197
- llama_stack/apis/inspect/client.py +0 -82
- llama_stack/apis/memory/client.py +0 -155
- llama_stack/apis/memory/memory.py +0 -65
- llama_stack/apis/memory_banks/__init__.py +0 -7
- llama_stack/apis/memory_banks/client.py +0 -101
- llama_stack/apis/memory_banks/memory_banks.py +0 -78
- llama_stack/apis/models/client.py +0 -83
- llama_stack/apis/reward_scoring/__init__.py +0 -7
- llama_stack/apis/reward_scoring/reward_scoring.py +0 -55
- llama_stack/apis/safety/client.py +0 -105
- llama_stack/apis/shields/client.py +0 -79
- llama_stack/cli/download.py +0 -340
- llama_stack/cli/model/describe.py +0 -82
- llama_stack/cli/model/download.py +0 -24
- llama_stack/cli/model/list.py +0 -62
- llama_stack/cli/model/model.py +0 -34
- llama_stack/cli/model/prompt_format.py +0 -112
- llama_stack/cli/model/safety_models.py +0 -52
- llama_stack/cli/stack/build.py +0 -299
- llama_stack/cli/stack/configure.py +0 -178
- llama_stack/distribution/build.py +0 -123
- llama_stack/distribution/build_conda_env.sh +0 -136
- llama_stack/distribution/build_container.sh +0 -142
- llama_stack/distribution/common.sh +0 -40
- llama_stack/distribution/configure_container.sh +0 -47
- llama_stack/distribution/datatypes.py +0 -139
- llama_stack/distribution/distribution.py +0 -58
- llama_stack/distribution/inspect.py +0 -67
- llama_stack/distribution/request_headers.py +0 -57
- llama_stack/distribution/resolver.py +0 -323
- llama_stack/distribution/routers/__init__.py +0 -48
- llama_stack/distribution/routers/routers.py +0 -158
- llama_stack/distribution/routers/routing_tables.py +0 -173
- llama_stack/distribution/server/endpoints.py +0 -48
- llama_stack/distribution/server/server.py +0 -343
- llama_stack/distribution/start_conda_env.sh +0 -42
- llama_stack/distribution/start_container.sh +0 -64
- llama_stack/distribution/templates/local-bedrock-conda-example-build.yaml +0 -10
- llama_stack/distribution/templates/local-build.yaml +0 -10
- llama_stack/distribution/templates/local-databricks-build.yaml +0 -10
- llama_stack/distribution/templates/local-fireworks-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-endpoint-build.yaml +0 -10
- llama_stack/distribution/templates/local-hf-serverless-build.yaml +0 -10
- llama_stack/distribution/templates/local-ollama-build.yaml +0 -10
- llama_stack/distribution/templates/local-tgi-build.yaml +0 -10
- llama_stack/distribution/templates/local-together-build.yaml +0 -10
- llama_stack/distribution/templates/local-vllm-build.yaml +0 -10
- llama_stack/distribution/utils/exec.py +0 -105
- llama_stack/providers/adapters/agents/sample/sample.py +0 -18
- llama_stack/providers/adapters/inference/bedrock/bedrock.py +0 -451
- llama_stack/providers/adapters/inference/bedrock/config.py +0 -55
- llama_stack/providers/adapters/inference/databricks/config.py +0 -21
- llama_stack/providers/adapters/inference/databricks/databricks.py +0 -125
- llama_stack/providers/adapters/inference/fireworks/config.py +0 -20
- llama_stack/providers/adapters/inference/fireworks/fireworks.py +0 -130
- llama_stack/providers/adapters/inference/ollama/__init__.py +0 -19
- llama_stack/providers/adapters/inference/ollama/ollama.py +0 -175
- llama_stack/providers/adapters/inference/sample/sample.py +0 -23
- llama_stack/providers/adapters/inference/tgi/config.py +0 -43
- llama_stack/providers/adapters/inference/tgi/tgi.py +0 -200
- llama_stack/providers/adapters/inference/together/config.py +0 -22
- llama_stack/providers/adapters/inference/together/together.py +0 -143
- llama_stack/providers/adapters/memory/chroma/chroma.py +0 -157
- llama_stack/providers/adapters/memory/pgvector/config.py +0 -17
- llama_stack/providers/adapters/memory/pgvector/pgvector.py +0 -211
- llama_stack/providers/adapters/memory/sample/sample.py +0 -23
- llama_stack/providers/adapters/memory/weaviate/__init__.py +0 -15
- llama_stack/providers/adapters/memory/weaviate/weaviate.py +0 -190
- llama_stack/providers/adapters/safety/bedrock/bedrock.py +0 -113
- llama_stack/providers/adapters/safety/bedrock/config.py +0 -16
- llama_stack/providers/adapters/safety/sample/sample.py +0 -23
- llama_stack/providers/adapters/safety/together/__init__.py +0 -18
- llama_stack/providers/adapters/safety/together/config.py +0 -26
- llama_stack/providers/adapters/safety/together/together.py +0 -101
- llama_stack/providers/adapters/telemetry/opentelemetry/config.py +0 -12
- llama_stack/providers/adapters/telemetry/opentelemetry/opentelemetry.py +0 -201
- llama_stack/providers/adapters/telemetry/sample/__init__.py +0 -17
- llama_stack/providers/adapters/telemetry/sample/config.py +0 -12
- llama_stack/providers/adapters/telemetry/sample/sample.py +0 -18
- llama_stack/providers/impls/meta_reference/agents/agent_instance.py +0 -844
- llama_stack/providers/impls/meta_reference/agents/agents.py +0 -161
- llama_stack/providers/impls/meta_reference/agents/persistence.py +0 -84
- llama_stack/providers/impls/meta_reference/agents/rag/context_retriever.py +0 -74
- llama_stack/providers/impls/meta_reference/agents/safety.py +0 -57
- llama_stack/providers/impls/meta_reference/agents/tests/code_execution.py +0 -93
- llama_stack/providers/impls/meta_reference/agents/tests/test_chat_agent.py +0 -305
- llama_stack/providers/impls/meta_reference/agents/tools/base.py +0 -20
- llama_stack/providers/impls/meta_reference/agents/tools/builtin.py +0 -375
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_env_prefix.py +0 -133
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/code_execution.py +0 -256
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/matplotlib_custom_backend.py +0 -87
- llama_stack/providers/impls/meta_reference/agents/tools/ipython_tool/utils.py +0 -21
- llama_stack/providers/impls/meta_reference/agents/tools/safety.py +0 -43
- llama_stack/providers/impls/meta_reference/codeshield/code_scanner.py +0 -58
- llama_stack/providers/impls/meta_reference/inference/config.py +0 -45
- llama_stack/providers/impls/meta_reference/inference/generation.py +0 -376
- llama_stack/providers/impls/meta_reference/inference/inference.py +0 -280
- llama_stack/providers/impls/meta_reference/inference/model_parallel.py +0 -99
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_impls.py +0 -184
- llama_stack/providers/impls/meta_reference/inference/quantization/fp8_txest_disabled.py +0 -76
- llama_stack/providers/impls/meta_reference/inference/quantization/loader.py +0 -97
- llama_stack/providers/impls/meta_reference/inference/quantization/scripts/quantize_checkpoint.py +0 -161
- llama_stack/providers/impls/meta_reference/memory/__init__.py +0 -19
- llama_stack/providers/impls/meta_reference/memory/faiss.py +0 -113
- llama_stack/providers/impls/meta_reference/safety/__init__.py +0 -17
- llama_stack/providers/impls/meta_reference/safety/base.py +0 -57
- llama_stack/providers/impls/meta_reference/safety/config.py +0 -48
- llama_stack/providers/impls/meta_reference/safety/llama_guard.py +0 -268
- llama_stack/providers/impls/meta_reference/safety/prompt_guard.py +0 -145
- llama_stack/providers/impls/meta_reference/safety/safety.py +0 -112
- llama_stack/providers/impls/meta_reference/telemetry/console.py +0 -89
- llama_stack/providers/impls/vllm/config.py +0 -35
- llama_stack/providers/impls/vllm/vllm.py +0 -241
- llama_stack/providers/registry/memory.py +0 -78
- llama_stack/providers/registry/telemetry.py +0 -44
- llama_stack/providers/tests/agents/test_agents.py +0 -210
- llama_stack/providers/tests/inference/test_inference.py +0 -257
- llama_stack/providers/tests/inference/test_prompt_adapter.py +0 -126
- llama_stack/providers/tests/memory/test_memory.py +0 -136
- llama_stack/providers/tests/resolver.py +0 -100
- llama_stack/providers/tests/safety/test_safety.py +0 -77
- llama_stack-0.0.42.dist-info/METADATA +0 -137
- llama_stack-0.0.42.dist-info/RECORD +0 -256
- /llama_stack/{distribution → core}/__init__.py +0 -0
- /llama_stack/{distribution/server → core/access_control}/__init__.py +0 -0
- /llama_stack/{distribution/utils → core/conversations}/__init__.py +0 -0
- /llama_stack/{providers/adapters → core/prompts}/__init__.py +0 -0
- /llama_stack/{providers/adapters/agents → core/routing_tables}/__init__.py +0 -0
- /llama_stack/{providers/adapters/inference → core/server}/__init__.py +0 -0
- /llama_stack/{providers/adapters/memory → core/storage}/__init__.py +0 -0
- /llama_stack/{providers/adapters/safety → core/ui}/__init__.py +0 -0
- /llama_stack/{providers/adapters/telemetry → core/ui/modules}/__init__.py +0 -0
- /llama_stack/{providers/impls → core/ui/page}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference → core/ui/page/distribution}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/rag → core/ui/page/evaluations}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tests → core/ui/page/playground}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools → core/utils}/__init__.py +0 -0
- /llama_stack/{distribution → core}/utils/dynamic.py +0 -0
- /llama_stack/{distribution → core}/utils/serialize.py +0 -0
- /llama_stack/{providers/impls/meta_reference/agents/tools/ipython_tool → distributions}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization → models}/__init__.py +0 -0
- /llama_stack/{providers/impls/meta_reference/inference/quantization/scripts → models/llama}/__init__.py +0 -0
- /llama_stack/{providers/tests → models/llama/llama3}/__init__.py +0 -0
- /llama_stack/{providers/tests/agents → models/llama/llama3/quantization}/__init__.py +0 -0
- /llama_stack/{providers/tests/inference → models/llama/llama3_2}/__init__.py +0 -0
- /llama_stack/{providers/tests/memory → models/llama/llama3_3}/__init__.py +0 -0
- /llama_stack/{providers/tests/safety → models/llama/llama4}/__init__.py +0 -0
- /llama_stack/{scripts → models/llama/llama4/prompt_templates}/__init__.py +0 -0
- /llama_stack/providers/{adapters → remote}/safety/bedrock/__init__.py +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info/licenses}/LICENSE +0 -0
- {llama_stack-0.0.42.dist-info → llama_stack-0.3.4.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,158 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import asyncio
+from collections.abc import AsyncIterator
+
+from llama_stack.apis.inference import (
+    InferenceProvider,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletionRequestWithExtraBody,
+)
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+)
+from llama_stack.apis.models import Model, ModelType
+from llama_stack.log import get_logger
+from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama3.tokenizer import Tokenizer as Llama3Tokenizer
+from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
+from llama_stack.models.llama.llama4.tokenizer import Tokenizer as Llama4Tokenizer
+from llama_stack.models.llama.sku_list import resolve_model
+from llama_stack.models.llama.sku_types import ModelFamily
+from llama_stack.providers.datatypes import ModelsProtocolPrivate
+from llama_stack.providers.utils.inference.embedding_mixin import (
+    SentenceTransformerEmbeddingMixin,
+)
+from llama_stack.providers.utils.inference.model_registry import (
+    ModelRegistryHelper,
+    build_hf_repo_model_entry,
+)
+
+from .config import MetaReferenceInferenceConfig
+from .generators import LlamaGenerator
+from .model_parallel import LlamaModelParallelGenerator
+
+log = get_logger(__name__, category="inference")
+# there's a single model parallel process running serving the model. for now,
+# we don't support multiple concurrent requests to this process.
+SEMAPHORE = asyncio.Semaphore(1)
+
+
+def llama_builder_fn(config: MetaReferenceInferenceConfig, model_id: str, llama_model: Model) -> LlamaGenerator:
+    return LlamaGenerator(config, model_id, llama_model)
+
+
+class MetaReferenceInferenceImpl(
+    SentenceTransformerEmbeddingMixin,
+    InferenceProvider,
+    ModelsProtocolPrivate,
+):
+    def __init__(self, config: MetaReferenceInferenceConfig) -> None:
+        self.config = config
+        self.model_id = None
+        self.llama_model = None
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        if self.config.create_distributed_process_group:
+            self.generator.stop()
+
+    async def openai_completion(
+        self,
+        params: OpenAICompletionRequestWithExtraBody,
+    ) -> OpenAICompletion:
+        raise NotImplementedError("OpenAI completion not supported by meta reference provider")
+
+    async def should_refresh_models(self) -> bool:
+        return False
+
+    async def list_models(self) -> list[Model] | None:
+        return None
+
+    async def unregister_model(self, model_id: str) -> None:
+        pass
+
+    async def register_model(self, model: Model) -> Model:
+        llama_model = (
+            resolve_model(model.metadata["llama_model"])
+            if "llama_model" in model.metadata
+            else resolve_model(model.identifier)
+        )
+        if llama_model is None:
+            raise ValueError(
+                "Please make sure your llama_model in model metadata or model identifier is in Llama SKU list"
+            )
+
+        self.model_registry_helper = ModelRegistryHelper(
+            [
+                build_hf_repo_model_entry(
+                    llama_model.descriptor(),
+                    llama_model.core_model_id.value,
+                )
+            ],
+        )
+        model = await self.model_registry_helper.register_model(model)
+
+        if model.model_type == ModelType.embedding:
+            self._load_sentence_transformer_model(model.provider_resource_id)
+
+        # TODO: what is this?! you can't really specify skipping via model metadata
+        # kill this madness
+        if "skip_load" in model.metadata and model.metadata["skip_load"]:
+            return model
+
+        await self.load_model(model.identifier, llama_model)
+        return model
+
+    async def load_model(self, model_id, llama_model) -> None:
+        log.info(f"Loading model `{model_id}`")
+
+        builder_params = [self.config, model_id, llama_model]
+
+        if self.config.create_distributed_process_group:
+            self.generator = LlamaModelParallelGenerator(
+                model_parallel_size=self.config.model_parallel_size or llama_model.pth_file_count,
+                builder_fn=llama_builder_fn,
+                builder_params=builder_params,
+                formatter=(
+                    Llama4ChatFormat(Llama4Tokenizer.get_instance())
+                    if llama_model.model_family == ModelFamily.llama4
+                    else Llama3ChatFormat(Llama3Tokenizer.get_instance())
+                ),
+            )
+            self.generator.start()
+        else:
+            self.generator = llama_builder_fn(*builder_params)
+
+        self.model_id = model_id
+        self.llama_model = llama_model
+
+        log.info("Warming up...")
+        await self.openai_chat_completion(
+            model=model_id,
+            messages=[{"role": "user", "content": "Hi how are you?"}],
+            max_tokens=20,
+        )
+        log.info("Warmed up!")
+
+    def check_model(self, request) -> None:
+        if self.model_id is None or self.llama_model is None:
+            raise RuntimeError(
+                "No avaible model yet, please register your requested model or add your model in the resouces first"
+            )
+        elif request.model != self.model_id:
+            raise RuntimeError(f"Model mismatch: request model: {request.model} != loaded model: {self.model_id}")
+
+    async def openai_chat_completion(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        raise NotImplementedError("OpenAI chat completion not supported by meta-reference inference provider")
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import Callable, Generator
+from copy import deepcopy
+from functools import partial
+from typing import Any
+
+from llama_stack.models.llama.llama3.chat_format import ChatFormat as Llama3ChatFormat
+from llama_stack.models.llama.llama4.chat_format import ChatFormat as Llama4ChatFormat
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    ChatCompletionRequestWithRawContent,
+    CompletionRequestWithRawContent,
+)
+
+from .parallel_utils import ModelParallelProcessGroup
+
+
+class ModelRunner:
+    def __init__(self, llama):
+        self.llama = llama
+
+    # the `task` object is the same that is sent to `ModelParallelProcessGroup.run_inference()`
+    def __call__(self, task: Any):
+        if task[0] == "chat_completion":
+            return self.llama.chat_completion(task[1])
+        else:
+            raise ValueError(f"Unexpected task type {task[0]}")
+
+
+def init_model_cb(
+    builder_fn: Callable,
+    params: list[Any],
+):
+    llama = builder_fn(*params)
+    return ModelRunner(llama)
+
+
+class LlamaModelParallelGenerator:
+    """
+    This abstraction exists so
+    - we can run model parallel code without needing to run the CLIs via torchrun
+    - this also enables use model parallel code within a notebook context.
+
+    A Context Manager is used to ensure that the model parallel process is started and stopped
+    correctly. This does make the ergonomics a little awkward, because it isn't immediately
+    clear at the callsite why we need to use a context manager.
+    """
+
+    def __init__(
+        self,
+        model_parallel_size: int,
+        builder_fn: Callable,
+        builder_params: list[Any],
+        formatter: Llama3ChatFormat | Llama4ChatFormat,
+    ):
+        self.model_parallel_size = model_parallel_size
+        self.builder_fn = builder_fn
+        self.builder_params = builder_params
+        self.formatter = formatter
+
+    def start(self):
+        self.__enter__()
+
+    def stop(self):
+        self.__exit__(None, None, None)
+
+    def __enter__(self):
+        self.group = ModelParallelProcessGroup(
+            self.model_parallel_size,
+            init_model_cb=partial(init_model_cb, self.builder_fn, self.builder_params),
+        )
+        self.group.start()
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self.group.stop()
+
+    def completion(
+        self,
+        request_batch: list[CompletionRequestWithRawContent],
+    ) -> Generator:
+        req_obj = deepcopy(request_batch)
+        gen = self.group.run_inference(("completion", req_obj))
+        yield from gen
+
+    def chat_completion(
+        self,
+        request_batch: list[ChatCompletionRequestWithRawContent],
+    ) -> Generator:
+        req_obj = deepcopy(request_batch)
+        gen = self.group.run_inference(("chat_completion", req_obj))
+        yield from gen
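`LlamaModelParallelGenerator` is meant to be driven as a context manager (or via the `start()`/`stop()` wrappers), so the worker group is always torn down. A rough usage sketch; the builder, checkpoint path, formatter, and request batch are caller-supplied placeholders, not values from the diff:

```python
# Placeholders for things the caller supplies; none of these come from the diff.
def build_llama(checkpoint_dir: str):
    raise NotImplementedError("illustrative builder; real code constructs the Llama generator here")


fmt = None            # stands in for a Llama3ChatFormat / Llama4ChatFormat instance
request_batch = []    # stands in for a list[ChatCompletionRequestWithRawContent]

generator = LlamaModelParallelGenerator(
    model_parallel_size=2,
    builder_fn=build_llama,
    builder_params=["/path/to/checkpoint"],
    formatter=fmt,
)

with generator:  # __enter__ starts the ModelParallelProcessGroup of workers
    for generation_batch in generator.chat_completion(request_batch):
        pass     # consume streamed GenerationResult batches here
# __exit__ stops the worker group even if the loop above raises
```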
@@ -4,42 +4,41 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
+# Copyright (c) Meta Platforms, IAny, nc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+import copy
 import json
 import multiprocessing
 import os
 import tempfile
 import time
 import uuid
+from collections.abc import Callable, Generator
 from enum import Enum
-from typing import
+from typing import Annotated, Literal
 
 import torch
-
 import zmq
-
 from fairscale.nn.model_parallel.initialize import (
     get_model_parallel_group,
     get_model_parallel_rank,
     get_model_parallel_src_rank,
 )
-
-from llama_models.llama3.api.datatypes import Message, ToolPromptFormat
-
 from pydantic import BaseModel, Field
+from torch.distributed.launcher.api import LaunchConfig, elastic_launch
 
-from
-from
-
-
-
+from llama_stack.log import get_logger
+from llama_stack.models.llama.datatypes import GenerationResult
+from llama_stack.providers.utils.inference.prompt_adapter import (
+    ChatCompletionRequestWithRawContent,
+    CompletionRequestWithRawContent,
+)
 
-
-    messages: List[Message]
-    temperature: float
-    top_p: float
-    max_gen_len: int
-    logprobs: bool
-    tool_prompt_format: ToolPromptFormat
+log = get_logger(name=__name__, category="inference")
 
 
 class ProcessingMessageName(str, Enum):
@@ -53,59 +52,42 @@ class ProcessingMessageName(str, Enum):
 
 
 class ReadyRequest(BaseModel):
-    type: Literal[ProcessingMessageName.ready_request] =
-        ProcessingMessageName.ready_request
-    )
+    type: Literal[ProcessingMessageName.ready_request] = ProcessingMessageName.ready_request
 
 
 class ReadyResponse(BaseModel):
-    type: Literal[ProcessingMessageName.ready_response] =
-        ProcessingMessageName.ready_response
-    )
+    type: Literal[ProcessingMessageName.ready_response] = ProcessingMessageName.ready_response
 
 
 class EndSentinel(BaseModel):
-    type: Literal[ProcessingMessageName.end_sentinel] =
-        ProcessingMessageName.end_sentinel
-    )
+    type: Literal[ProcessingMessageName.end_sentinel] = ProcessingMessageName.end_sentinel
 
 
 class CancelSentinel(BaseModel):
-    type: Literal[ProcessingMessageName.cancel_sentinel] =
-        ProcessingMessageName.cancel_sentinel
-    )
+    type: Literal[ProcessingMessageName.cancel_sentinel] = ProcessingMessageName.cancel_sentinel
 
 
 class TaskRequest(BaseModel):
-    type: Literal[ProcessingMessageName.task_request] =
-
-
-
+    type: Literal[ProcessingMessageName.task_request] = ProcessingMessageName.task_request
+    task: tuple[
+        str,
+        list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
+    ]
 
 
 class TaskResponse(BaseModel):
-    type: Literal[ProcessingMessageName.task_response] =
-
-    )
-    result: TokenResult
+    type: Literal[ProcessingMessageName.task_response] = ProcessingMessageName.task_response
+    result: list[GenerationResult]
 
 
 class ExceptionResponse(BaseModel):
-    type: Literal[ProcessingMessageName.exception_response] =
-        ProcessingMessageName.exception_response
-    )
+    type: Literal[ProcessingMessageName.exception_response] = ProcessingMessageName.exception_response
     error: str
 
 
-ProcessingMessage =
-    ReadyRequest
-
-    EndSentinel,
-    CancelSentinel,
-    TaskRequest,
-    TaskResponse,
-    ExceptionResponse,
-]
+ProcessingMessage = (
+    ReadyRequest | ReadyResponse | EndSentinel | CancelSentinel | TaskRequest | TaskResponse | ExceptionResponse
+)
 
 
 class ProcessingMessageWrapper(BaseModel):
@@ -116,7 +98,7 @@ class ProcessingMessageWrapper(BaseModel):
 
 
 def mp_rank_0() -> bool:
-    return get_model_parallel_rank() == 0
+    return bool(get_model_parallel_rank() == 0)
 
 
 def encode_msg(msg: ProcessingMessage) -> bytes:
@@ -143,7 +125,7 @@ def retrieve_requests(reply_socket_url: str):
         reply_socket.send_multipart([client_id, encode_msg(obj)])
 
     while True:
-        tasks = [None]
+        tasks: list[ProcessingMessage | None] = [None]
         if mp_rank_0():
            client_id, maybe_task_json = maybe_get_work(reply_socket)
            if maybe_task_json is not None:
@@ -170,7 +152,7 @@ def retrieve_requests(reply_socket_url: str):
                break
 
            for obj in out:
-                updates = [None]
+                updates: list[ProcessingMessage | None] = [None]
                if mp_rank_0():
                    _, update_json = maybe_get_work(reply_socket)
                    update = maybe_parse_message(update_json)
@@ -187,16 +169,14 @@ def retrieve_requests(reply_socket_url: str):
                    group=get_model_parallel_group(),
                )
                if isinstance(updates[0], CancelSentinel):
-
+                    log.info("quitting generation loop because request was cancelled")
                    break
 
            if mp_rank_0():
                send_obj(EndSentinel())
        except Exception as e:
-
-            import traceback
+            log.exception("exception in generation loop")
 
-            traceback.print_exc()
            if mp_rank_0():
                send_obj(ExceptionResponse(error=str(e)))
 
@@ -217,20 +197,20 @@ def maybe_get_work(sock: zmq.Socket):
     return client_id, message
 
 
-def maybe_parse_message(maybe_json:
+def maybe_parse_message(maybe_json: str | None) -> ProcessingMessage | None:
     if maybe_json is None:
         return None
     try:
         return parse_message(maybe_json)
     except json.JSONDecodeError:
         return None
-    except ValueError
+    except ValueError:
         return None
 
 
 def parse_message(json_str: str) -> ProcessingMessage:
     data = json.loads(json_str)
-    return ProcessingMessageWrapper(**data).payload
+    return copy.deepcopy(ProcessingMessageWrapper(**data).payload)
 
 
 def worker_process_entrypoint(
@@ -248,15 +228,15 @@ def worker_process_entrypoint(
     while True:
         try:
             task = req_gen.send(result)
-            if isinstance(task,
+            if isinstance(task, EndSentinel):
                 break
 
-            assert isinstance(task, TaskRequest)
+            assert isinstance(task, TaskRequest), task
             result = model(task.task)
         except StopIteration:
             break
 
-
+    log.info("[debug] worker process done")
 
 
 def launch_dist_group(
@@ -265,9 +245,6 @@ def launch_dist_group(
     init_model_cb: Callable,
     **kwargs,
 ) -> None:
-    id = uuid.uuid4().hex
-    dist_url = f"file:///tmp/llama3_{id}_{time.time()}"
-
     with tempfile.TemporaryDirectory() as tmpdir:
         # TODO: track workers and if they terminate, tell parent process about it so cleanup can happen
         launch_config = LaunchConfig(
@@ -301,7 +278,7 @@ def start_model_parallel_process(
 
     main_process_url = request_socket.getsockopt_string(zmq.LAST_ENDPOINT)
 
-    ctx = multiprocessing.get_context("
+    ctx = multiprocessing.get_context("spawn")
     process = ctx.Process(
         target=launch_dist_group,
         args=(
@@ -316,8 +293,8 @@ def start_model_parallel_process(
     # wait until the model is loaded; rank 0 will send a message to indicate it's ready
 
     request_socket.send(encode_msg(ReadyRequest()))
-
-
+    _response = request_socket.recv()
+    log.info("Loaded model...")
 
     return request_socket, process
 
@@ -349,12 +326,18 @@ class ModelParallelProcessGroup:
         self.process.join()
         self.started = False
 
-    def run_inference(
+    def run_inference(
+        self,
+        req: tuple[
+            str,
+            list[CompletionRequestWithRawContent] | list[ChatCompletionRequestWithRawContent],
+        ],
+    ) -> Generator:
         assert not self.running, "inference already running"
 
         self.running = True
-        self.request_socket.send(encode_msg(TaskRequest(task=inference_args)))
         try:
+            self.request_socket.send(encode_msg(TaskRequest(task=req)))
             while True:
                 obj_json = self.request_socket.recv()
                 obj = parse_message(obj_json)
@@ -363,13 +346,13 @@ class ModelParallelProcessGroup:
                     break
 
                 if isinstance(obj, ExceptionResponse):
-
+                    log.error(f"[debug] got exception {obj.error}")
                     raise Exception(obj.error)
 
                 if isinstance(obj, TaskResponse):
                     yield obj.result
 
-        except GeneratorExit
+        except GeneratorExit:
            self.request_socket.send(encode_msg(CancelSentinel()))
            while True:
                obj_json = self.request_socket.send()
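These pydantic models form the wire protocol between the driver and the model-parallel workers: each message is wrapped in `ProcessingMessageWrapper` and shipped as JSON over zmq by `encode_msg`/`parse_message`. A self-contained sketch of the same round-trip pattern, reduced to a single message type for illustration and assuming the pydantic v2 API:

```python
from enum import Enum
from typing import Literal

from pydantic import BaseModel


class MessageName(str, Enum):
    cancel_sentinel = "cancel_sentinel"


class CancelSentinel(BaseModel):
    type: Literal[MessageName.cancel_sentinel] = MessageName.cancel_sentinel


class Wrapper(BaseModel):
    payload: CancelSentinel


# Encode to bytes (what would travel over the zmq socket) and decode it back.
wire_bytes = Wrapper(payload=CancelSentinel()).model_dump_json().encode("utf-8")
decoded = Wrapper.model_validate_json(wire_bytes).payload
assert isinstance(decoded, CancelSentinel)
```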
@@ -0,0 +1,22 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from typing import Any
+
+from llama_stack.providers.inline.inference.sentence_transformers.config import (
+    SentenceTransformersInferenceConfig,
+)
+
+
+async def get_provider_impl(
+    config: SentenceTransformersInferenceConfig,
+    _deps: dict[str, Any],
+):
+    from .sentence_transformers import SentenceTransformersInferenceImpl
+
+    impl = SentenceTransformersInferenceImpl(config)
+    await impl.initialize()
+    return impl
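`get_provider_impl` follows the usual inline-provider factory shape: build a config, await the factory, get an initialized implementation back. A small sketch of calling it, assuming it runs inside an event loop and that no dependencies are needed (the empty deps dict is illustrative):

```python
import asyncio

from llama_stack.providers.inline.inference.sentence_transformers import get_provider_impl
from llama_stack.providers.inline.inference.sentence_transformers.config import (
    SentenceTransformersInferenceConfig,
)


async def main() -> None:
    # This provider takes no required config fields and ignores its dependencies.
    impl = await get_provider_impl(SentenceTransformersInferenceConfig(), {})
    print(type(impl).__name__)  # SentenceTransformersInferenceImpl


asyncio.run(main())
```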
@@ -4,10 +4,12 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from
+from typing import Any
 
-from
+from pydantic import BaseModel
 
 
-class
-
+class SentenceTransformersInferenceConfig(BaseModel):
+    @classmethod
+    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
+        return {}
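Because `sample_run_config` returns an empty dict, the provider needs no settings in a run config; the config model also validates with no fields set. A quick illustration of that assumption:

```python
from llama_stack.providers.inline.inference.sentence_transformers.config import (
    SentenceTransformersInferenceConfig,
)

# The sample run config is empty, and the config model validates with no fields.
assert SentenceTransformersInferenceConfig.sample_run_config() == {}
config = SentenceTransformersInferenceConfig(**SentenceTransformersInferenceConfig.sample_run_config())
```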
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+from collections.abc import AsyncIterator
+
+from llama_stack.apis.inference import (
+    InferenceProvider,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletionRequestWithExtraBody,
+)
+from llama_stack.apis.inference.inference import (
+    OpenAIChatCompletion,
+    OpenAIChatCompletionChunk,
+    OpenAICompletion,
+)
+from llama_stack.apis.models import ModelType
+from llama_stack.log import get_logger
+from llama_stack.providers.datatypes import Model, ModelsProtocolPrivate
+from llama_stack.providers.utils.inference.embedding_mixin import (
+    SentenceTransformerEmbeddingMixin,
+)
+from llama_stack.providers.utils.inference.openai_compat import (
+    OpenAIChatCompletionToLlamaStackMixin,
+)
+
+from .config import SentenceTransformersInferenceConfig
+
+log = get_logger(name=__name__, category="inference")
+
+
+class SentenceTransformersInferenceImpl(
+    OpenAIChatCompletionToLlamaStackMixin,
+    SentenceTransformerEmbeddingMixin,
+    InferenceProvider,
+    ModelsProtocolPrivate,
+):
+    __provider_id__: str
+
+    def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
+        self.config = config
+
+    async def initialize(self) -> None:
+        pass
+
+    async def shutdown(self) -> None:
+        pass
+
+    async def should_refresh_models(self) -> bool:
+        return False
+
+    async def list_models(self) -> list[Model] | None:
+        return [
+            Model(
+                identifier="nomic-ai/nomic-embed-text-v1.5",
+                provider_resource_id="nomic-ai/nomic-embed-text-v1.5",
+                provider_id=self.__provider_id__,
+                metadata={
+                    "embedding_dimension": 768,
+                },
+                model_type=ModelType.embedding,
+            ),
+        ]
+
+    async def register_model(self, model: Model) -> Model:
+        return model
+
+    async def unregister_model(self, model_id: str) -> None:
+        pass
+
+    async def openai_completion(
+        self,
+        params: OpenAICompletionRequestWithExtraBody,
+    ) -> OpenAICompletion:
+        raise NotImplementedError("OpenAI completion not supported by sentence transformers provider")
+
+    async def openai_chat_completion(
+        self,
+        params: OpenAIChatCompletionRequestWithExtraBody,
+    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
+        raise NotImplementedError("OpenAI chat completion not supported by sentence transformers provider")
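This provider only serves embeddings, so both OpenAI-style generation entry points raise `NotImplementedError`, while `list_models` statically advertises one embedding model. A hedged sketch of instantiating it directly; setting `__provider_id__` by hand is an assumption here, since the stack normally assigns it:

```python
import asyncio

from llama_stack.providers.inline.inference.sentence_transformers.config import (
    SentenceTransformersInferenceConfig,
)
from llama_stack.providers.inline.inference.sentence_transformers.sentence_transformers import (
    SentenceTransformersInferenceImpl,
)


async def main() -> None:
    impl = SentenceTransformersInferenceImpl(SentenceTransformersInferenceConfig())
    impl.__provider_id__ = "sentence-transformers"  # normally set by the stack; assumed here
    await impl.initialize()

    models = await impl.list_models()
    print([m.identifier for m in models])  # ['nomic-ai/nomic-embed-text-v1.5']


asyncio.run(main())
```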