llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. llama_stack/core/library_client.py +80 -3
  2. llama_stack/core/routing_tables/common.py +11 -0
  3. llama_stack/core/routing_tables/vector_stores.py +4 -0
  4. llama_stack/core/stack.py +16 -1
  5. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  6. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  7. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  8. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  9. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  10. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  11. llama_stack/distributions/dell/doc_template.md +209 -0
  12. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  13. llama_stack/distributions/nvidia/doc_template.md +170 -0
  14. llama_stack/distributions/oci/doc_template.md +140 -0
  15. llama_stack/models/llama/llama3/dog.jpg +0 -0
  16. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  17. llama_stack/models/llama/resources/dog.jpg +0 -0
  18. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  19. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  20. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
  21. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  22. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  23. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  24. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  25. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  26. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  27. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  28. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  29. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  30. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  31. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  32. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  33. llama_stack/providers/remote/files/s3/README.md +266 -0
  34. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  35. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  36. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  37. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  38. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  39. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  40. llama_stack/providers/utils/responses/responses_store.py +34 -0
  41. llama_stack/providers/utils/tools/mcp.py +258 -16
  42. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  43. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
  44. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
  45. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  46. llama_stack-0.4.2.dist-info/top_level.txt +0 -2
  47. llama_stack_api/__init__.py +0 -945
  48. llama_stack_api/admin/__init__.py +0 -45
  49. llama_stack_api/admin/api.py +0 -72
  50. llama_stack_api/admin/fastapi_routes.py +0 -117
  51. llama_stack_api/admin/models.py +0 -113
  52. llama_stack_api/agents.py +0 -173
  53. llama_stack_api/batches/__init__.py +0 -40
  54. llama_stack_api/batches/api.py +0 -53
  55. llama_stack_api/batches/fastapi_routes.py +0 -113
  56. llama_stack_api/batches/models.py +0 -78
  57. llama_stack_api/benchmarks/__init__.py +0 -43
  58. llama_stack_api/benchmarks/api.py +0 -39
  59. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  60. llama_stack_api/benchmarks/models.py +0 -109
  61. llama_stack_api/common/__init__.py +0 -5
  62. llama_stack_api/common/content_types.py +0 -101
  63. llama_stack_api/common/errors.py +0 -95
  64. llama_stack_api/common/job_types.py +0 -38
  65. llama_stack_api/common/responses.py +0 -77
  66. llama_stack_api/common/training_types.py +0 -47
  67. llama_stack_api/common/type_system.py +0 -146
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/datasets/__init__.py +0 -61
  72. llama_stack_api/datasets/api.py +0 -35
  73. llama_stack_api/datasets/fastapi_routes.py +0 -104
  74. llama_stack_api/datasets/models.py +0 -152
  75. llama_stack_api/datatypes.py +0 -373
  76. llama_stack_api/eval.py +0 -137
  77. llama_stack_api/file_processors/__init__.py +0 -27
  78. llama_stack_api/file_processors/api.py +0 -64
  79. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  80. llama_stack_api/file_processors/models.py +0 -42
  81. llama_stack_api/files/__init__.py +0 -35
  82. llama_stack_api/files/api.py +0 -51
  83. llama_stack_api/files/fastapi_routes.py +0 -124
  84. llama_stack_api/files/models.py +0 -107
  85. llama_stack_api/inference.py +0 -1169
  86. llama_stack_api/inspect_api/__init__.py +0 -37
  87. llama_stack_api/inspect_api/api.py +0 -25
  88. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  89. llama_stack_api/inspect_api/models.py +0 -28
  90. llama_stack_api/internal/__init__.py +0 -9
  91. llama_stack_api/internal/kvstore.py +0 -26
  92. llama_stack_api/internal/sqlstore.py +0 -79
  93. llama_stack_api/llama_stack_api/__init__.py +0 -945
  94. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  95. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  96. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  97. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  98. llama_stack_api/llama_stack_api/agents.py +0 -173
  99. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  100. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  101. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  102. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  103. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  104. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  105. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  106. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  107. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  108. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  109. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  110. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  111. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  112. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  113. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  114. llama_stack_api/llama_stack_api/connectors.py +0 -146
  115. llama_stack_api/llama_stack_api/conversations.py +0 -270
  116. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  117. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  118. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  119. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  120. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  121. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  122. llama_stack_api/llama_stack_api/eval.py +0 -137
  123. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  124. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  125. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  126. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  127. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  128. llama_stack_api/llama_stack_api/files/api.py +0 -51
  129. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  130. llama_stack_api/llama_stack_api/files/models.py +0 -107
  131. llama_stack_api/llama_stack_api/inference.py +0 -1169
  132. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  133. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  134. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  135. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  136. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  137. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
  138. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
  139. llama_stack_api/llama_stack_api/models.py +0 -171
  140. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  141. llama_stack_api/llama_stack_api/post_training.py +0 -370
  142. llama_stack_api/llama_stack_api/prompts.py +0 -203
  143. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  144. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  145. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  146. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  147. llama_stack_api/llama_stack_api/py.typed +0 -0
  148. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  149. llama_stack_api/llama_stack_api/resource.py +0 -37
  150. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  151. llama_stack_api/llama_stack_api/safety.py +0 -132
  152. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  153. llama_stack_api/llama_stack_api/scoring.py +0 -93
  154. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  155. llama_stack_api/llama_stack_api/shields.py +0 -93
  156. llama_stack_api/llama_stack_api/tools.py +0 -226
  157. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  158. llama_stack_api/llama_stack_api/vector_stores.py +0 -51
  159. llama_stack_api/llama_stack_api/version.py +0 -9
  160. llama_stack_api/models.py +0 -171
  161. llama_stack_api/openai_responses.py +0 -1468
  162. llama_stack_api/post_training.py +0 -370
  163. llama_stack_api/prompts.py +0 -203
  164. llama_stack_api/providers/__init__.py +0 -33
  165. llama_stack_api/providers/api.py +0 -16
  166. llama_stack_api/providers/fastapi_routes.py +0 -57
  167. llama_stack_api/providers/models.py +0 -24
  168. llama_stack_api/py.typed +0 -0
  169. llama_stack_api/rag_tool.py +0 -168
  170. llama_stack_api/resource.py +0 -37
  171. llama_stack_api/router_utils.py +0 -160
  172. llama_stack_api/safety.py +0 -132
  173. llama_stack_api/schema_utils.py +0 -208
  174. llama_stack_api/scoring.py +0 -93
  175. llama_stack_api/scoring_functions.py +0 -211
  176. llama_stack_api/shields.py +0 -93
  177. llama_stack_api/tools.py +0 -226
  178. llama_stack_api/vector_io.py +0 -941
  179. llama_stack_api/vector_stores.py +0 -51
  180. llama_stack_api/version.py +0 -9
  181. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  182. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,170 @@
+ ---
+ orphan: true
+ ---
+ # NVIDIA Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+
+ ## Prerequisites
+ ### NVIDIA API Keys
+
+ Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+ ### Deploy NeMo Microservices Platform
+ The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+ ## Supported Services
+ Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+ ### Inference: NVIDIA NIM
+ NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+ 1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+ 2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+ The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+ ### Datasetio API: NeMo Data Store
+ The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+ See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+ ### Eval API: NeMo Evaluator
+ The NeMo Evaluator microservice supports evaluation of LLMs. Launching an evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+ ### Post-Training API: NeMo Customizer
+ The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+ ### Safety API: NeMo Guardrails
+ The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
+ ## Deploying models
+ To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+ Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+ ```sh
+ # URL to NeMo NIM Proxy service
+ export NEMO_URL="http://nemo.test"
+
+ curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+ -H 'accept: application/json' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "name": "llama-3.2-1b-instruct",
+ "namespace": "meta",
+ "config": {
+ "model": "meta/llama-3.2-1b-instruct",
+ "nim_deployment": {
+ "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+ "image_tag": "1.8.3",
+ "pvc_size": "25Gi",
+ "gpu": 1,
+ "additional_envs": {
+ "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+ }
+ }
+ }
+ }'
+ ```
+ This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+ You can also remove a deployed NIM to free up GPU resources, if needed.
+ ```sh
+ export NEMO_URL="http://nemo.test"
+
+ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+ ```
+
+ ## Running Llama Stack with NVIDIA
+
+ You can do this via venv (building from source) or via Docker, which has a pre-built image.
+
+ ### Via Docker
+
+ This method allows you to get started quickly without having to build the distribution code.
+
+ ```bash
+ LLAMA_STACK_PORT=8321
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ ### Via Docker with Custom Run Configuration
+
+ You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+ ```bash
+ # Set the path to your custom config.yaml file
+ CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+ LLAMA_STACK_PORT=8321
+
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+ -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+ {% if run_configs %}
+ Available run configurations for this distribution:
+ {% for config in run_configs %}
+ - `{{ config }}`
+ {% endfor %}
+ {% endif %}
+
+ ### Via venv
+
+ If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+ ```bash
+ INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+ llama stack list-deps nvidia | xargs -L1 uv pip install
+ NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ INFERENCE_MODEL=$INFERENCE_MODEL \
+ llama stack run ./config.yaml \
+ --port 8321
+ ```
+
+ ## Example Notebooks
+ For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
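Editor's aside (not part of the diff): the Data Store section of the template above notes that the NeMo Data Store exposes a Hugging Face Hub-compatible API. A minimal sketch of driving it with `huggingface_hub.HfApi` might look like the following; the endpoint comes from `NVIDIA_DATASETS_URL`, and the repository and file names are placeholders.

```python
# Sketch only: upload a dataset file to the NeMo Data Store through its
# Hugging Face Hub-compatible API. Repo and file names are placeholders.
import os

from huggingface_hub import HfApi

datastore = HfApi(
    endpoint=os.environ["NVIDIA_DATASETS_URL"],  # your NeMo Data Store URL
    token=os.environ.get("NVIDIA_API_KEY", ""),
)

# Create a dataset repository (namespace/name), then upload a JSONL file into it.
datastore.create_repo(repo_id="default/sample-dataset", repo_type="dataset", exist_ok=True)
datastore.upload_file(
    path_or_fileobj="training_data.jsonl",
    path_in_repo="training/training_data.jsonl",
    repo_id="default/sample-dataset",
    repo_type="dataset",
)
```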
@@ -0,0 +1,140 @@
+ ---
+ orphan: true
+ ---
+ # OCI Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+ ## Prerequisites
+ ### Oracle Cloud Infrastructure Setup
+
+ Before using the OCI Generative AI distribution, ensure you have:
+
+ 1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+ 2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+ 3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+ 4. **Authentication**: Configure authentication using either:
+ - **Instance Principal** (recommended for cloud-hosted deployments)
+ - **API Key** (for on-premises or development environments)
+
+ ### Authentication Methods
+
+ #### Instance Principal Authentication (Recommended)
+ Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+ Requirements:
+ - Instance must be running in an Oracle Cloud Infrastructure compartment
+ - Instance must have appropriate IAM policies to access Generative AI services
+
+ #### API Key Authentication
+ For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
+ ### Required IAM Policies
+
+ Ensure your OCI user or instance has the following policy statements:
+
+ ```
+ Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+ Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+ ```
+
+ ## Supported Services
+
+ ### Inference: OCI Generative AI
+ Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+ - **Chat Completions**: Conversational AI with context awareness
+ - **Text Generation**: Complete prompts and generate text content
+
+ #### Available Models
+ OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and more.
+
+ ### Safety: Llama Guard
+ For content safety and moderation, this distribution uses Meta's Llama Guard model through the OCI Generative AI service to provide:
+ - Content filtering and moderation
+ - Policy compliance checking
+ - Harmful content detection
+
+ ### Vector Storage: Multiple Options
+ The distribution supports several vector storage providers:
+ - **FAISS**: Local in-memory vector search
+ - **ChromaDB**: Distributed vector database
+ - **PGVector**: PostgreSQL with vector extensions
+
+ ### Additional Services
+ - **Dataset I/O**: Local filesystem and Hugging Face integration
+ - **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+ - **Evaluation**: Meta reference evaluation framework
+
+ ## Running Llama Stack with OCI
+
+ You can run the OCI distribution via Docker or a local virtual environment.
+
+ ### Via venv
+
+ If you've set up your local development environment, you can also run the distribution using your local virtual environment.
+
+ ```bash
+ OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+ ```
+
+ ### Configuration Examples
+
+ #### Using Instance Principal (Recommended for Production)
+ ```bash
+ export OCI_AUTH_TYPE=instance_principal
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+ ```
+
+ #### Using API Key Authentication (Development)
+ ```bash
+ export OCI_AUTH_TYPE=config_file
+ export OCI_CONFIG_FILE_PATH=~/.oci/config
+ export OCI_CLI_PROFILE=DEFAULT
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+ ```
+
+ ## Regional Endpoints
+
+ OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+ https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ 1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+ 2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+ 3. **Permission Denied**: Check compartment permissions and Generative AI service access
+ 4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+ ### Getting Help
+
+ For additional support:
+ - [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+ - [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
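Editor's aside (not part of the diff): once the OCI distribution above is running on port 8321, a minimal sketch of exercising it with `llama_stack_client` could look like this. The model identifier is a placeholder, and the chat call assumes your installed client version exposes the OpenAI-compatible surface.

```python
# Sketch only: talk to a locally running OCI distribution on port 8321.
# List registered models first, then send a chat request to one of them.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

for model in client.models.list():
    print(model.identifier)

# Placeholder model id; assumes the OpenAI-compatible chat surface is available.
completion = client.chat.completions.create(
    model="<registered-model-id>",
    messages=[{"role": "user", "content": "Say hello from OCI Generative AI."}],
)
print(completion.choices[0].message.content)
```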
Binary file
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.
 
+ import asyncio
  import re
  import time
  import uuid
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.responses.responses_store import (
  ResponsesStore,
  _OpenAIResponseObjectWithInputAndMessages,
  )
+ from llama_stack.providers.utils.tools.mcp import MCPSessionManager
  from llama_stack_api import (
  ConversationItem,
  Conversations,
@@ -322,6 +324,125 @@ class OpenAIResponsesImpl:
  messages=messages,
  )
 
+ def _prepare_input_items_for_storage(
+ self,
+ input: str | list[OpenAIResponseInput],
+ ) -> list[OpenAIResponseInput]:
+ """Prepare input items for storage, adding IDs where needed.
+
+ This method is called once at the start of streaming to prepare input items
+ that will be reused across multiple persistence calls during streaming.
+ """
+ new_input_id = f"msg_{uuid.uuid4()}"
+ input_items_data: list[OpenAIResponseInput] = []
+
+ if isinstance(input, str):
+ input_content = OpenAIResponseInputMessageContentText(text=input)
+ input_content_item = OpenAIResponseMessage(
+ role="user",
+ content=[input_content],
+ id=new_input_id,
+ )
+ input_items_data = [input_content_item]
+ else:
+ for input_item in input:
+ if isinstance(input_item, OpenAIResponseMessage):
+ input_item_dict = input_item.model_dump()
+ if "id" not in input_item_dict:
+ input_item_dict["id"] = new_input_id
+ input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+ else:
+ input_items_data.append(input_item)
+
+ return input_items_data
+
+ async def _persist_streaming_state(
+ self,
+ stream_chunk: OpenAIResponseObjectStream,
+ orchestrator,
+ input_items: list[OpenAIResponseInput],
+ output_items: list,
+ ) -> None:
+ """Persist response state at significant streaming events.
+
+ This enables clients to poll GET /v1/responses/{response_id} during streaming
+ to see in-progress turn state instead of empty results.
+
+ Persistence occurs at:
+ - response.in_progress: Initial INSERT with empty output
+ - response.output_item.done: UPDATE with accumulated output items
+ - response.completed/response.incomplete: Final UPDATE with complete state
+ - response.failed: UPDATE with error state
+
+ :param stream_chunk: The current streaming event.
+ :param orchestrator: The streaming orchestrator (for snapshotting response).
+ :param input_items: Pre-prepared input items for storage.
+ :param output_items: Accumulated output items so far.
+ """
+ try:
+ match stream_chunk.type:
+ case "response.in_progress":
+ # Initial persistence when response starts
+ in_progress_response = stream_chunk.response
+ await self.responses_store.upsert_response_object(
+ response_object=in_progress_response,
+ input=input_items,
+ messages=[],
+ )
+
+ case "response.output_item.done":
+ # Incremental update when an output item completes (tool call, message)
+ current_snapshot = orchestrator._snapshot_response(
+ status="in_progress",
+ outputs=output_items,
+ )
+ # Get current messages (filter out system messages)
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=current_snapshot,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.completed" | "response.incomplete":
+ # Final persistence when response finishes
+ final_response = stream_chunk.response
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=final_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.failed":
+ # Persist failed state so GET shows error
+ failed_response = stream_chunk.response
+ # Preserve any accumulated non-system messages for failed responses
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=failed_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+ except Exception as e:
+ # Best-effort persistence: log error but don't fail the stream
+ logger.warning(f"Failed to persist streaming state for {stream_chunk.type}: {e}")
+
  async def create_openai_response(
  self,
  input: str | list[OpenAIResponseInput],
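Editor's aside (not part of the diff): the incremental persistence added above is what lets a client poll `GET /v1/responses/{response_id}` while a stream is still in flight. A rough client-side sketch of that polling pattern follows, assuming the installed client exposes the OpenAI-compatible Responses surface; the model id is a placeholder.

```python
# Sketch only: consume a streaming response while checking its stored state.
# Assumes an OpenAI-compatible Responses API is exposed by the client.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

stream = client.responses.create(
    model="<registered-model-id>",
    input="Summarize the latest build logs.",
    stream=True,
)

response_id = None
for chunk in stream:
    # Early events carry the response id; from then on, this process (or any
    # other) can retrieve the response and see the in-progress output items
    # persisted by _persist_streaming_state instead of an empty record.
    if response_id is None and getattr(chunk, "response", None) is not None:
        response_id = chunk.response.id
    if chunk.type == "response.output_item.done" and response_id:
        snapshot = client.responses.retrieve(response_id)
        print(f"{len(snapshot.output)} output item(s) persisted so far")
```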
@@ -489,6 +610,19 @@ class OpenAIResponsesImpl:
  response_id = f"resp_{uuid.uuid4()}"
  created_at = int(time.time())
 
+ # Create a per-request MCP session manager for session reuse (fix for #4452)
+ # This avoids redundant tools/list calls when making multiple MCP tool invocations
+ mcp_session_manager = MCPSessionManager()
+
+ # Create a per-request ToolExecutor with the session manager
+ request_tool_executor = ToolExecutor(
+ tool_groups_api=self.tool_groups_api,
+ tool_runtime_api=self.tool_runtime_api,
+ vector_io_api=self.vector_io_api,
+ vector_stores_config=self.tool_executor.vector_stores_config,
+ mcp_session_manager=mcp_session_manager,
+ )
+
  orchestrator = StreamingResponseOrchestrator(
  inference_api=self.inference_api,
  ctx=ctx,
@@ -498,7 +632,7 @@
  text=text,
  max_infer_iters=max_infer_iters,
  parallel_tool_calls=parallel_tool_calls,
- tool_executor=self.tool_executor,
+ tool_executor=request_tool_executor,
  safety_api=self.safety_api,
  guardrail_ids=guardrail_ids,
  instructions=instructions,
@@ -513,41 +647,58 @@
 
  # Type as ConversationItem to avoid list invariance issues
  output_items: list[ConversationItem] = []
- async for stream_chunk in orchestrator.create_response():
- match stream_chunk.type:
- case "response.completed" | "response.incomplete":
- final_response = stream_chunk.response
- case "response.failed":
- failed_response = stream_chunk.response
- case "response.output_item.done":
- item = stream_chunk.item
- output_items.append(item)
- case _:
- pass # Other event types
-
- # Store and sync before yielding terminal events
- # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
- if (
- stream_chunk.type in {"response.completed", "response.incomplete"}
- and final_response
- and failed_response is None
- ):
- messages_to_store = list(
- filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
- )
+
+ # Prepare input items for storage once (used by all persistence calls)
+ input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+
+ try:
+ async for stream_chunk in orchestrator.create_response():
+ match stream_chunk.type:
+ case "response.completed" | "response.incomplete":
+ final_response = stream_chunk.response
+ case "response.failed":
+ failed_response = stream_chunk.response
+ case "response.output_item.done":
+ item = stream_chunk.item
+ output_items.append(item)
+ case _:
+ pass # Other event types
+
+ # Incremental persistence: persist on significant state changes
+ # This enables clients to poll GET /v1/responses/{response_id} during streaming
  if store:
- # TODO: we really should work off of output_items instead of "final_messages"
- await self._store_response(
- response=final_response,
- input=all_input,
- messages=messages_to_store,
+ await self._persist_streaming_state(
+ stream_chunk=stream_chunk,
+ orchestrator=orchestrator,
+ input_items=input_items_for_storage,
+ output_items=output_items,
  )
 
- if conversation:
- await self._sync_response_to_conversation(conversation, input, output_items)
- await self.responses_store.store_conversation_messages(conversation, messages_to_store)
-
- yield stream_chunk
+ # Store and sync before yielding terminal events
+ # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
+ if (
+ stream_chunk.type in {"response.completed", "response.incomplete"}
+ and final_response
+ and failed_response is None
+ ):
+ if conversation:
+ messages_to_store = list(
+ filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
+ )
+ await self._sync_response_to_conversation(conversation, input, output_items)
+ await self.responses_store.store_conversation_messages(conversation, messages_to_store)
+
+ yield stream_chunk
+ finally:
+ # Clean up MCP sessions at the end of the request (fix for #4452)
+ # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
+ # Wrap in try/except as cleanup errors should not mask the original response
+ try:
+ await asyncio.shield(mcp_session_manager.close_all())
+ except BaseException as e:
+ # Debug level - cleanup errors are expected in streaming scenarios where
+ # anyio cancel scopes may be in a different task context
+ logger.debug(f"Error during MCP session cleanup: {e}")
 
  async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
  return await self.responses_store.delete_response_object(response_id)
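Editor's aside (not part of the diff): the hunks above thread a per-request MCP session manager through the tool executor so repeated MCP calls within one request reuse a session instead of re-establishing one each time, and `asyncio.shield` keeps the final cleanup from being cancelled midway. A rough illustration of that pattern follows; this is not the actual `MCPSessionManager` implementation, and `open_mcp_session` plus the `close()` method on sessions are hypothetical stand-ins.

```python
# Sketch only: a per-request cache of MCP sessions keyed by endpoint.
# "open_mcp_session" is a stand-in for whatever actually establishes a session.
import asyncio


class SessionCache:
    def __init__(self, open_mcp_session):
        self._open = open_mcp_session  # async callable: endpoint -> session
        self._sessions: dict[str, object] = {}

    async def get(self, endpoint: str):
        # Reuse an existing session for this endpoint within the request.
        if endpoint not in self._sessions:
            self._sessions[endpoint] = await self._open(endpoint)
        return self._sessions[endpoint]

    async def close_all(self) -> None:
        # Close every cached session; called once at the end of the request.
        for session in self._sessions.values():
            await session.close()
        self._sessions.clear()


async def handle_request(cache: SessionCache, endpoints: list[str]) -> None:
    try:
        for endpoint in endpoints:
            await cache.get(endpoint)  # a repeat endpoint hits the cache
    finally:
        # Shield cleanup so a cancelled request still releases its sessions.
        await asyncio.shield(cache.close_all())
```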
@@ -1200,6 +1200,9 @@ class StreamingResponseOrchestrator:
  "mcp_list_tools_id": list_id,
  }
 
+ # Get session manager from tool_executor if available (fix for #4452)
+ session_manager = getattr(self.tool_executor, "mcp_session_manager", None)
+
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("list_mcp_tools", attributes=attributes):
@@ -1207,6 +1210,7 @@
  endpoint=mcp_tool.server_url,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=session_manager,
  )
 
  # Create the MCP list tools message
@@ -54,11 +54,14 @@ class ToolExecutor:
  tool_runtime_api: ToolRuntime,
  vector_io_api: VectorIO,
  vector_stores_config=None,
+ mcp_session_manager=None,
  ):
  self.tool_groups_api = tool_groups_api
  self.tool_runtime_api = tool_runtime_api
  self.vector_io_api = vector_io_api
  self.vector_stores_config = vector_stores_config
+ # Optional MCPSessionManager for session reuse within a request (fix for #4452)
+ self.mcp_session_manager = mcp_session_manager
 
  async def execute_tool_call(
  self,
@@ -233,6 +236,7 @@
  "document_ids": [r.file_id for r in search_results],
  "chunks": [r.content[0].text if r.content else "" for r in search_results],
  "scores": [r.score for r in search_results],
+ "attributes": [r.attributes or {} for r in search_results],
  "citation_files": citation_files,
  },
  )
@@ -327,12 +331,14 @@
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("invoke_mcp_tool", attributes=attributes):
+ # Pass session_manager for session reuse within request (fix for #4452)
  result = await invoke_mcp_tool(
  endpoint=mcp_tool.server_url,
  tool_name=function_name,
  kwargs=tool_kwargs,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=self.mcp_session_manager,
  )
  elif function_name == "knowledge_search":
  response_file_search_tool = (
@@ -464,16 +470,18 @@
  )
  if result and (metadata := getattr(result, "metadata", None)) and "document_ids" in metadata:
  message.results = []
+ attributes_list = metadata.get("attributes", [])
  for i, doc_id in enumerate(metadata["document_ids"]):
  text = metadata["chunks"][i] if "chunks" in metadata else None
  score = metadata["scores"][i] if "scores" in metadata else None
+ attrs = attributes_list[i] if i < len(attributes_list) else {}
  message.results.append(
  OpenAIResponseOutputMessageFileSearchToolCallResults(
  file_id=doc_id,
  filename=doc_id,
  text=text if text is not None else "",
  score=score if score is not None else 0.0,
- attributes={},
+ attributes=attrs,
  )
  )
  if has_error:
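Editor's aside (not part of the diff): the last two hunks thread per-document `attributes` from the knowledge_search metadata into the file-search results, using position-aligned lists with a bounds check so older stored metadata without an `attributes` entry still works. A tiny stand-alone sketch of that consumption logic, with illustrative data rather than the provider's types:

```python
# Sketch only: align optional per-document attributes with document ids,
# falling back to {} when the attributes list is shorter or missing.
metadata = {
    "document_ids": ["doc-1", "doc-2"],
    "scores": [0.92, 0.41],
    "attributes": [{"source": "wiki"}],  # deliberately shorter than document_ids
}

attributes_list = metadata.get("attributes", [])
for i, doc_id in enumerate(metadata["document_ids"]):
    attrs = attributes_list[i] if i < len(attributes_list) else {}
    print(doc_id, metadata["scores"][i], attrs)
```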
@@ -0,0 +1,9 @@
+ #import <Foundation/Foundation.h>
+
+ //! Project version number for LocalInference.
+ FOUNDATION_EXPORT double LocalInferenceVersionNumber;
+
+ //! Project version string for LocalInference.
+ FOUNDATION_EXPORT const unsigned char LocalInferenceVersionString[];
+
+ // In this header, you should import all the public headers of your framework using statements like #import <LocalInference/PublicHeader.h>