llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. llama_stack/core/library_client.py +80 -3
  2. llama_stack/core/routing_tables/common.py +11 -0
  3. llama_stack/core/routing_tables/vector_stores.py +4 -0
  4. llama_stack/core/stack.py +16 -1
  5. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  6. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  7. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  8. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  9. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  10. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  11. llama_stack/distributions/dell/doc_template.md +209 -0
  12. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  13. llama_stack/distributions/nvidia/doc_template.md +170 -0
  14. llama_stack/distributions/oci/doc_template.md +140 -0
  15. llama_stack/models/llama/llama3/dog.jpg +0 -0
  16. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  17. llama_stack/models/llama/resources/dog.jpg +0 -0
  18. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  19. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  20. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
  21. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  22. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  23. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  24. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  25. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  26. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  27. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  28. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  29. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  30. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  31. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  32. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  33. llama_stack/providers/remote/files/s3/README.md +266 -0
  34. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  35. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  36. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  37. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  38. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  39. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  40. llama_stack/providers/utils/responses/responses_store.py +34 -0
  41. llama_stack/providers/utils/tools/mcp.py +258 -16
  42. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  43. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
  44. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
  45. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  46. llama_stack-0.4.2.dist-info/top_level.txt +0 -2
  47. llama_stack_api/__init__.py +0 -945
  48. llama_stack_api/admin/__init__.py +0 -45
  49. llama_stack_api/admin/api.py +0 -72
  50. llama_stack_api/admin/fastapi_routes.py +0 -117
  51. llama_stack_api/admin/models.py +0 -113
  52. llama_stack_api/agents.py +0 -173
  53. llama_stack_api/batches/__init__.py +0 -40
  54. llama_stack_api/batches/api.py +0 -53
  55. llama_stack_api/batches/fastapi_routes.py +0 -113
  56. llama_stack_api/batches/models.py +0 -78
  57. llama_stack_api/benchmarks/__init__.py +0 -43
  58. llama_stack_api/benchmarks/api.py +0 -39
  59. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  60. llama_stack_api/benchmarks/models.py +0 -109
  61. llama_stack_api/common/__init__.py +0 -5
  62. llama_stack_api/common/content_types.py +0 -101
  63. llama_stack_api/common/errors.py +0 -95
  64. llama_stack_api/common/job_types.py +0 -38
  65. llama_stack_api/common/responses.py +0 -77
  66. llama_stack_api/common/training_types.py +0 -47
  67. llama_stack_api/common/type_system.py +0 -146
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/datasets/__init__.py +0 -61
  72. llama_stack_api/datasets/api.py +0 -35
  73. llama_stack_api/datasets/fastapi_routes.py +0 -104
  74. llama_stack_api/datasets/models.py +0 -152
  75. llama_stack_api/datatypes.py +0 -373
  76. llama_stack_api/eval.py +0 -137
  77. llama_stack_api/file_processors/__init__.py +0 -27
  78. llama_stack_api/file_processors/api.py +0 -64
  79. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  80. llama_stack_api/file_processors/models.py +0 -42
  81. llama_stack_api/files/__init__.py +0 -35
  82. llama_stack_api/files/api.py +0 -51
  83. llama_stack_api/files/fastapi_routes.py +0 -124
  84. llama_stack_api/files/models.py +0 -107
  85. llama_stack_api/inference.py +0 -1169
  86. llama_stack_api/inspect_api/__init__.py +0 -37
  87. llama_stack_api/inspect_api/api.py +0 -25
  88. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  89. llama_stack_api/inspect_api/models.py +0 -28
  90. llama_stack_api/internal/__init__.py +0 -9
  91. llama_stack_api/internal/kvstore.py +0 -26
  92. llama_stack_api/internal/sqlstore.py +0 -79
  93. llama_stack_api/llama_stack_api/__init__.py +0 -945
  94. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  95. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  96. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  97. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  98. llama_stack_api/llama_stack_api/agents.py +0 -173
  99. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  100. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  101. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  102. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  103. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  104. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  105. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  106. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  107. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  108. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  109. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  110. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  111. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  112. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  113. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  114. llama_stack_api/llama_stack_api/connectors.py +0 -146
  115. llama_stack_api/llama_stack_api/conversations.py +0 -270
  116. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  117. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  118. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  119. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  120. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  121. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  122. llama_stack_api/llama_stack_api/eval.py +0 -137
  123. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  124. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  125. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  126. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  127. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  128. llama_stack_api/llama_stack_api/files/api.py +0 -51
  129. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  130. llama_stack_api/llama_stack_api/files/models.py +0 -107
  131. llama_stack_api/llama_stack_api/inference.py +0 -1169
  132. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  133. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  134. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  135. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  136. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  137. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
  138. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
  139. llama_stack_api/llama_stack_api/models.py +0 -171
  140. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  141. llama_stack_api/llama_stack_api/post_training.py +0 -370
  142. llama_stack_api/llama_stack_api/prompts.py +0 -203
  143. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  144. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  145. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  146. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  147. llama_stack_api/llama_stack_api/py.typed +0 -0
  148. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  149. llama_stack_api/llama_stack_api/resource.py +0 -37
  150. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  151. llama_stack_api/llama_stack_api/safety.py +0 -132
  152. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  153. llama_stack_api/llama_stack_api/scoring.py +0 -93
  154. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  155. llama_stack_api/llama_stack_api/shields.py +0 -93
  156. llama_stack_api/llama_stack_api/tools.py +0 -226
  157. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  158. llama_stack_api/llama_stack_api/vector_stores.py +0 -51
  159. llama_stack_api/llama_stack_api/version.py +0 -9
  160. llama_stack_api/models.py +0 -171
  161. llama_stack_api/openai_responses.py +0 -1468
  162. llama_stack_api/post_training.py +0 -370
  163. llama_stack_api/prompts.py +0 -203
  164. llama_stack_api/providers/__init__.py +0 -33
  165. llama_stack_api/providers/api.py +0 -16
  166. llama_stack_api/providers/fastapi_routes.py +0 -57
  167. llama_stack_api/providers/models.py +0 -24
  168. llama_stack_api/py.typed +0 -0
  169. llama_stack_api/rag_tool.py +0 -168
  170. llama_stack_api/resource.py +0 -37
  171. llama_stack_api/router_utils.py +0 -160
  172. llama_stack_api/safety.py +0 -132
  173. llama_stack_api/schema_utils.py +0 -208
  174. llama_stack_api/scoring.py +0 -93
  175. llama_stack_api/scoring_functions.py +0 -211
  176. llama_stack_api/shields.py +0 -93
  177. llama_stack_api/tools.py +0 -226
  178. llama_stack_api/vector_io.py +0 -941
  179. llama_stack_api/vector_stores.py +0 -51
  180. llama_stack_api/version.py +0 -9
  181. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  182. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,170 @@
+ ---
+ orphan: true
+ ---
+ # NVIDIA Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+
+ ## Prerequisites
+ ### NVIDIA API Keys
+
+ Make sure you have access to an NVIDIA API key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+ ### Deploy NeMo Microservices Platform
+ The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+ ## Supported Services
+ Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+ ### Inference: NVIDIA NIM
+ NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+ 1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (requires an API key)
+ 2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+ The deployed platform includes the NIM Proxy microservice, which is the service that provides access to your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+ ### Datasetio API: NeMo Data Store
+ The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposes APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use that client to interact with the Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+ See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+ ### Eval API: NeMo Evaluator
+ The NeMo Evaluator microservice supports evaluation of LLMs. Launching an evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+ ### Post-Training API: NeMo Customizer
+ The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+ ### Safety API: NeMo Guardrails
+ The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+ See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
+ ## Deploying models
+ To use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+ Note: For improved inference speeds, use NIM with the `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+ ```sh
+ # URL to NeMo NIM Proxy service
+ export NEMO_URL="http://nemo.test"
+
+ curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+ -H 'accept: application/json' \
+ -H 'Content-Type: application/json' \
+ -d '{
+ "name": "llama-3.2-1b-instruct",
+ "namespace": "meta",
+ "config": {
+ "model": "meta/llama-3.2-1b-instruct",
+ "nim_deployment": {
+ "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+ "image_tag": "1.8.3",
+ "pvc_size": "25Gi",
+ "gpu": 1,
+ "additional_envs": {
+ "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+ }
+ }
+ }
+ }'
+ ```
+ This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+ You can also remove a deployed NIM to free up GPU resources, if needed.
+ ```sh
+ export NEMO_URL="http://nemo.test"
+
+ curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+ ```
+
+ ## Running Llama Stack with NVIDIA
+
+ You can do this via venv (building from source) or via Docker, which has a pre-built image.
+
+ ### Via Docker
+
+ This method allows you to get started quickly without having to build the distribution code.
+
+ ```bash
+ LLAMA_STACK_PORT=8321
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ ### Via Docker with Custom Run Configuration
+
+ You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+ ```bash
+ # Set the path to your custom config.yaml file
+ CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+ LLAMA_STACK_PORT=8321
+
+ docker run \
+ -it \
+ --pull always \
+ -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+ -v ~/.llama:/root/.llama \
+ -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+ -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+ -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ llamastack/distribution-{{ name }} \
+ --port $LLAMA_STACK_PORT
+ ```
+
+ **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+ {% if run_configs %}
+ Available run configurations for this distribution:
+ {% for config in run_configs %}
+ - `{{ config }}`
+ {% endfor %}
+ {% endif %}
+
+ ### Via venv
+
+ If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+ ```bash
+ INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+ llama stack list-deps nvidia | xargs -L1 uv pip install
+ NVIDIA_API_KEY=$NVIDIA_API_KEY \
+ INFERENCE_MODEL=$INFERENCE_MODEL \
+ llama stack run ./config.yaml \
+ --port 8321
+ ```
+
+ ## Example Notebooks
+ For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
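Editor's aside (not part of the diff): the Data Store section of the template above notes that the NeMo Data Store exposes a Hugging Face Hub-compatible API. A minimal sketch of driving it with `huggingface_hub.HfApi` might look like the following; the endpoint comes from `NVIDIA_DATASETS_URL`, and the repository and file names are placeholders.

```python
# Sketch only: upload a dataset file to the NeMo Data Store through its
# Hugging Face Hub-compatible API. Repo and file names are placeholders.
import os

from huggingface_hub import HfApi

datastore = HfApi(
    endpoint=os.environ["NVIDIA_DATASETS_URL"],  # your NeMo Data Store URL
    token=os.environ.get("NVIDIA_API_KEY", ""),
)

# Create a dataset repository (namespace/name), then upload a JSONL file into it.
datastore.create_repo(repo_id="default/sample-dataset", repo_type="dataset", exist_ok=True)
datastore.upload_file(
    path_or_fileobj="training_data.jsonl",
    path_in_repo="training/training_data.jsonl",
    repo_id="default/sample-dataset",
    repo_type="dataset",
)
```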
@@ -0,0 +1,140 @@
+ ---
+ orphan: true
+ ---
+ # OCI Distribution
+
+ The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+ {{ providers_table }}
+
+ {% if run_config_env_vars %}
+ ### Environment Variables
+
+ The following environment variables can be configured:
+
+ {% for var, (default_value, description) in run_config_env_vars.items() %}
+ - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+ {% endfor %}
+ {% endif %}
+
+ {% if default_models %}
+ ### Models
+
+ The following models are available by default:
+
+ {% for model in default_models %}
+ - `{{ model.model_id }} {{ model.doc_string }}`
+ {% endfor %}
+ {% endif %}
+
+ ## Prerequisites
+ ### Oracle Cloud Infrastructure Setup
+
+ Before using the OCI Generative AI distribution, ensure you have:
+
+ 1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+ 2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+ 3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+ 4. **Authentication**: Configure authentication using either:
+ - **Instance Principal** (recommended for cloud-hosted deployments)
+ - **API Key** (for on-premises or development environments)
+
+ ### Authentication Methods
+
+ #### Instance Principal Authentication (Recommended)
+ Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+ Requirements:
+ - Instance must be running in an Oracle Cloud Infrastructure compartment
+ - Instance must have appropriate IAM policies to access Generative AI services
+
+ #### API Key Authentication
+ For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
+ ### Required IAM Policies
+
+ Ensure your OCI user or instance has the following policy statements:
+
+ ```
+ Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+ Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+ ```
+
+ ## Supported Services
+
+ ### Inference: OCI Generative AI
+ Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+ - **Chat Completions**: Conversational AI with context awareness
+ - **Text Generation**: Complete prompts and generate text content
+
+ #### Available Models
+ OCI Generative AI provides access to models from Meta, Cohere, OpenAI, Grok, and more.
+
+ ### Safety: Llama Guard
+ For content safety and moderation, this distribution uses Meta's Llama Guard model through the OCI Generative AI service to provide:
+ - Content filtering and moderation
+ - Policy compliance checking
+ - Harmful content detection
+
+ ### Vector Storage: Multiple Options
+ The distribution supports several vector storage providers:
+ - **FAISS**: Local in-memory vector search
+ - **ChromaDB**: Distributed vector database
+ - **PGVector**: PostgreSQL with vector extensions
+
+ ### Additional Services
+ - **Dataset I/O**: Local filesystem and Hugging Face integration
+ - **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+ - **Evaluation**: Meta reference evaluation framework
+
+ ## Running Llama Stack with OCI
+
+ You can run the OCI distribution via Docker or a local virtual environment.
+
+ ### Via venv
+
+ If you've set up your local development environment, you can also run the distribution using your local virtual environment.
+
+ ```bash
+ OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+ ```
+
+ ### Configuration Examples
+
+ #### Using Instance Principal (Recommended for Production)
+ ```bash
+ export OCI_AUTH_TYPE=instance_principal
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+ ```
+
+ #### Using API Key Authentication (Development)
+ ```bash
+ export OCI_AUTH_TYPE=config_file
+ export OCI_CONFIG_FILE_PATH=~/.oci/config
+ export OCI_CLI_PROFILE=DEFAULT
+ export OCI_REGION=us-chicago-1
+ export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+ ```
+
+ ## Regional Endpoints
+
+ OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+ https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+ ## Troubleshooting
+
+ ### Common Issues
+
+ 1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+ 2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+ 3. **Permission Denied**: Check compartment permissions and Generative AI service access
+ 4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+ ### Getting Help
+
+ For additional support:
+ - [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+ - [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
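Editor's aside (not part of the diff): once the OCI distribution above is running on port 8321, a minimal sketch of exercising it with `llama_stack_client` could look like this. The model identifier is a placeholder, and the chat call assumes your installed client version exposes the OpenAI-compatible surface.

```python
# Sketch only: talk to a locally running OCI distribution on port 8321.
# List registered models first, then send a chat request to one of them.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

for model in client.models.list():
    print(model.identifier)

# Placeholder model id; assumes the OpenAI-compatible chat surface is available.
completion = client.chat.completions.create(
    model="<registered-model-id>",
    messages=[{"role": "user", "content": "Say hello from OCI Generative AI."}],
)
print(completion.choices[0].message.content)
```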
Binary file
@@ -4,6 +4,7 @@
  # This source code is licensed under the terms described in the LICENSE file in
  # the root directory of this source tree.
 
+ import asyncio
  import re
  import time
  import uuid
@@ -16,6 +17,7 @@ from llama_stack.providers.utils.responses.responses_store import (
  ResponsesStore,
  _OpenAIResponseObjectWithInputAndMessages,
  )
+ from llama_stack.providers.utils.tools.mcp import MCPSessionManager
  from llama_stack_api import (
  ConversationItem,
  Conversations,
@@ -322,6 +324,125 @@ class OpenAIResponsesImpl:
  messages=messages,
  )
 
+ def _prepare_input_items_for_storage(
+ self,
+ input: str | list[OpenAIResponseInput],
+ ) -> list[OpenAIResponseInput]:
+ """Prepare input items for storage, adding IDs where needed.
+
+ This method is called once at the start of streaming to prepare input items
+ that will be reused across multiple persistence calls during streaming.
+ """
+ new_input_id = f"msg_{uuid.uuid4()}"
+ input_items_data: list[OpenAIResponseInput] = []
+
+ if isinstance(input, str):
+ input_content = OpenAIResponseInputMessageContentText(text=input)
+ input_content_item = OpenAIResponseMessage(
+ role="user",
+ content=[input_content],
+ id=new_input_id,
+ )
+ input_items_data = [input_content_item]
+ else:
+ for input_item in input:
+ if isinstance(input_item, OpenAIResponseMessage):
+ input_item_dict = input_item.model_dump()
+ if "id" not in input_item_dict:
+ input_item_dict["id"] = new_input_id
+ input_items_data.append(OpenAIResponseMessage(**input_item_dict))
+ else:
+ input_items_data.append(input_item)
+
+ return input_items_data
+
+ async def _persist_streaming_state(
+ self,
+ stream_chunk: OpenAIResponseObjectStream,
+ orchestrator,
+ input_items: list[OpenAIResponseInput],
+ output_items: list,
+ ) -> None:
+ """Persist response state at significant streaming events.
+
+ This enables clients to poll GET /v1/responses/{response_id} during streaming
+ to see in-progress turn state instead of empty results.
+
+ Persistence occurs at:
+ - response.in_progress: Initial INSERT with empty output
+ - response.output_item.done: UPDATE with accumulated output items
+ - response.completed/response.incomplete: Final UPDATE with complete state
+ - response.failed: UPDATE with error state
+
+ :param stream_chunk: The current streaming event.
+ :param orchestrator: The streaming orchestrator (for snapshotting response).
+ :param input_items: Pre-prepared input items for storage.
+ :param output_items: Accumulated output items so far.
+ """
+ try:
+ match stream_chunk.type:
+ case "response.in_progress":
+ # Initial persistence when response starts
+ in_progress_response = stream_chunk.response
+ await self.responses_store.upsert_response_object(
+ response_object=in_progress_response,
+ input=input_items,
+ messages=[],
+ )
+
+ case "response.output_item.done":
+ # Incremental update when an output item completes (tool call, message)
+ current_snapshot = orchestrator._snapshot_response(
+ status="in_progress",
+ outputs=output_items,
+ )
+ # Get current messages (filter out system messages)
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=current_snapshot,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.completed" | "response.incomplete":
+ # Final persistence when response finishes
+ final_response = stream_chunk.response
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=final_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+
+ case "response.failed":
+ # Persist failed state so GET shows error
+ failed_response = stream_chunk.response
+ # Preserve any accumulated non-system messages for failed responses
+ messages_to_store = list(
+ filter(
+ lambda x: not isinstance(x, OpenAISystemMessageParam),
+ orchestrator.final_messages or orchestrator.ctx.messages,
+ )
+ )
+ await self.responses_store.upsert_response_object(
+ response_object=failed_response,
+ input=input_items,
+ messages=messages_to_store,
+ )
+ except Exception as e:
+ # Best-effort persistence: log error but don't fail the stream
+ logger.warning(f"Failed to persist streaming state for {stream_chunk.type}: {e}")
+
  async def create_openai_response(
  self,
  input: str | list[OpenAIResponseInput],
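Editor's aside (not part of the diff): the incremental persistence added above is what lets a client poll `GET /v1/responses/{response_id}` while a stream is still in flight. A rough client-side sketch of that polling pattern follows, assuming the installed client exposes the OpenAI-compatible Responses surface; the model id is a placeholder.

```python
# Sketch only: consume a streaming response while checking its stored state.
# Assumes an OpenAI-compatible Responses API is exposed by the client.
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

stream = client.responses.create(
    model="<registered-model-id>",
    input="Summarize the latest build logs.",
    stream=True,
)

response_id = None
for chunk in stream:
    # Early events carry the response id; from then on, this process (or any
    # other) can retrieve the response and see the in-progress output items
    # persisted by _persist_streaming_state instead of an empty record.
    if response_id is None and getattr(chunk, "response", None) is not None:
        response_id = chunk.response.id
    if chunk.type == "response.output_item.done" and response_id:
        snapshot = client.responses.retrieve(response_id)
        print(f"{len(snapshot.output)} output item(s) persisted so far")
```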
@@ -489,6 +610,19 @@ class OpenAIResponsesImpl:
  response_id = f"resp_{uuid.uuid4()}"
  created_at = int(time.time())
 
+ # Create a per-request MCP session manager for session reuse (fix for #4452)
+ # This avoids redundant tools/list calls when making multiple MCP tool invocations
+ mcp_session_manager = MCPSessionManager()
+
+ # Create a per-request ToolExecutor with the session manager
+ request_tool_executor = ToolExecutor(
+ tool_groups_api=self.tool_groups_api,
+ tool_runtime_api=self.tool_runtime_api,
+ vector_io_api=self.vector_io_api,
+ vector_stores_config=self.tool_executor.vector_stores_config,
+ mcp_session_manager=mcp_session_manager,
+ )
+
  orchestrator = StreamingResponseOrchestrator(
  inference_api=self.inference_api,
  ctx=ctx,
@@ -498,7 +632,7 @@
  text=text,
  max_infer_iters=max_infer_iters,
  parallel_tool_calls=parallel_tool_calls,
- tool_executor=self.tool_executor,
+ tool_executor=request_tool_executor,
  safety_api=self.safety_api,
  guardrail_ids=guardrail_ids,
  instructions=instructions,
@@ -513,41 +647,58 @@
 
  # Type as ConversationItem to avoid list invariance issues
  output_items: list[ConversationItem] = []
- async for stream_chunk in orchestrator.create_response():
- match stream_chunk.type:
- case "response.completed" | "response.incomplete":
- final_response = stream_chunk.response
- case "response.failed":
- failed_response = stream_chunk.response
- case "response.output_item.done":
- item = stream_chunk.item
- output_items.append(item)
- case _:
- pass # Other event types
-
- # Store and sync before yielding terminal events
- # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
- if (
- stream_chunk.type in {"response.completed", "response.incomplete"}
- and final_response
- and failed_response is None
- ):
- messages_to_store = list(
- filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
- )
+
+ # Prepare input items for storage once (used by all persistence calls)
+ input_items_for_storage = self._prepare_input_items_for_storage(all_input)
+
+ try:
+ async for stream_chunk in orchestrator.create_response():
+ match stream_chunk.type:
+ case "response.completed" | "response.incomplete":
+ final_response = stream_chunk.response
+ case "response.failed":
+ failed_response = stream_chunk.response
+ case "response.output_item.done":
+ item = stream_chunk.item
+ output_items.append(item)
+ case _:
+ pass # Other event types
+
+ # Incremental persistence: persist on significant state changes
+ # This enables clients to poll GET /v1/responses/{response_id} during streaming
  if store:
- # TODO: we really should work off of output_items instead of "final_messages"
- await self._store_response(
- response=final_response,
- input=all_input,
- messages=messages_to_store,
+ await self._persist_streaming_state(
+ stream_chunk=stream_chunk,
+ orchestrator=orchestrator,
+ input_items=input_items_for_storage,
+ output_items=output_items,
  )
 
- if conversation:
- await self._sync_response_to_conversation(conversation, input, output_items)
- await self.responses_store.store_conversation_messages(conversation, messages_to_store)
-
- yield stream_chunk
+ # Store and sync before yielding terminal events
+ # This ensures the storage/syncing happens even if the consumer breaks after receiving the event
+ if (
+ stream_chunk.type in {"response.completed", "response.incomplete"}
+ and final_response
+ and failed_response is None
+ ):
+ if conversation:
+ messages_to_store = list(
+ filter(lambda x: not isinstance(x, OpenAISystemMessageParam), orchestrator.final_messages)
+ )
+ await self._sync_response_to_conversation(conversation, input, output_items)
+ await self.responses_store.store_conversation_messages(conversation, messages_to_store)
+
+ yield stream_chunk
+ finally:
+ # Clean up MCP sessions at the end of the request (fix for #4452)
+ # Use shield() to prevent cancellation from interrupting cleanup and leaking resources
+ # Wrap in try/except as cleanup errors should not mask the original response
+ try:
+ await asyncio.shield(mcp_session_manager.close_all())
+ except BaseException as e:
+ # Debug level - cleanup errors are expected in streaming scenarios where
+ # anyio cancel scopes may be in a different task context
+ logger.debug(f"Error during MCP session cleanup: {e}")
 
  async def delete_openai_response(self, response_id: str) -> OpenAIDeleteResponseObject:
  return await self.responses_store.delete_response_object(response_id)
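Editor's aside (not part of the diff): the hunks above thread a per-request MCP session manager through the tool executor so repeated MCP calls within one request reuse a session instead of re-establishing one each time, and `asyncio.shield` keeps the final cleanup from being cancelled midway. A rough illustration of that pattern follows; this is not the actual `MCPSessionManager` implementation, and `open_mcp_session` plus the `close()` method on sessions are hypothetical stand-ins.

```python
# Sketch only: a per-request cache of MCP sessions keyed by endpoint.
# "open_mcp_session" is a stand-in for whatever actually establishes a session.
import asyncio


class SessionCache:
    def __init__(self, open_mcp_session):
        self._open = open_mcp_session  # async callable: endpoint -> session
        self._sessions: dict[str, object] = {}

    async def get(self, endpoint: str):
        # Reuse an existing session for this endpoint within the request.
        if endpoint not in self._sessions:
            self._sessions[endpoint] = await self._open(endpoint)
        return self._sessions[endpoint]

    async def close_all(self) -> None:
        # Close every cached session; called once at the end of the request.
        for session in self._sessions.values():
            await session.close()
        self._sessions.clear()


async def handle_request(cache: SessionCache, endpoints: list[str]) -> None:
    try:
        for endpoint in endpoints:
            await cache.get(endpoint)  # a repeat endpoint hits the cache
    finally:
        # Shield cleanup so a cancelled request still releases its sessions.
        await asyncio.shield(cache.close_all())
```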
@@ -1200,6 +1200,9 @@ class StreamingResponseOrchestrator:
  "mcp_list_tools_id": list_id,
  }
 
+ # Get session manager from tool_executor if available (fix for #4452)
+ session_manager = getattr(self.tool_executor, "mcp_session_manager", None)
+
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("list_mcp_tools", attributes=attributes):
@@ -1207,6 +1210,7 @@
  endpoint=mcp_tool.server_url,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=session_manager,
  )
 
  # Create the MCP list tools message
@@ -54,11 +54,14 @@ class ToolExecutor:
  tool_runtime_api: ToolRuntime,
  vector_io_api: VectorIO,
  vector_stores_config=None,
+ mcp_session_manager=None,
  ):
  self.tool_groups_api = tool_groups_api
  self.tool_runtime_api = tool_runtime_api
  self.vector_io_api = vector_io_api
  self.vector_stores_config = vector_stores_config
+ # Optional MCPSessionManager for session reuse within a request (fix for #4452)
+ self.mcp_session_manager = mcp_session_manager
 
  async def execute_tool_call(
  self,
@@ -233,6 +236,7 @@
  "document_ids": [r.file_id for r in search_results],
  "chunks": [r.content[0].text if r.content else "" for r in search_results],
  "scores": [r.score for r in search_results],
+ "attributes": [r.attributes or {} for r in search_results],
  "citation_files": citation_files,
  },
  )
@@ -327,12 +331,14 @@
  # TODO: follow semantic conventions for Open Telemetry tool spans
  # https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/#execute-tool-span
  with tracer.start_as_current_span("invoke_mcp_tool", attributes=attributes):
+ # Pass session_manager for session reuse within request (fix for #4452)
  result = await invoke_mcp_tool(
  endpoint=mcp_tool.server_url,
  tool_name=function_name,
  kwargs=tool_kwargs,
  headers=mcp_tool.headers,
  authorization=mcp_tool.authorization,
+ session_manager=self.mcp_session_manager,
  )
  elif function_name == "knowledge_search":
  response_file_search_tool = (
@@ -464,16 +470,18 @@
  )
  if result and (metadata := getattr(result, "metadata", None)) and "document_ids" in metadata:
  message.results = []
+ attributes_list = metadata.get("attributes", [])
  for i, doc_id in enumerate(metadata["document_ids"]):
  text = metadata["chunks"][i] if "chunks" in metadata else None
  score = metadata["scores"][i] if "scores" in metadata else None
+ attrs = attributes_list[i] if i < len(attributes_list) else {}
  message.results.append(
  OpenAIResponseOutputMessageFileSearchToolCallResults(
  file_id=doc_id,
  filename=doc_id,
  text=text if text is not None else "",
  score=score if score is not None else 0.0,
- attributes={},
+ attributes=attrs,
  )
  )
  if has_error:
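Editor's aside (not part of the diff): the last two hunks thread per-document `attributes` from the knowledge_search metadata into the file-search results, using position-aligned lists with a bounds check so older stored metadata without an `attributes` entry still works. A tiny stand-alone sketch of that consumption logic, with illustrative data rather than the provider's types:

```python
# Sketch only: align optional per-document attributes with document ids,
# falling back to {} when the attributes list is shorter or missing.
metadata = {
    "document_ids": ["doc-1", "doc-2"],
    "scores": [0.92, 0.41],
    "attributes": [{"source": "wiki"}],  # deliberately shorter than document_ids
}

attributes_list = metadata.get("attributes", [])
for i, doc_id in enumerate(metadata["document_ids"]):
    attrs = attributes_list[i] if i < len(attributes_list) else {}
    print(doc_id, metadata["scores"][i], attrs)
```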
@@ -0,0 +1,9 @@
+ #import <Foundation/Foundation.h>
+
+ //! Project version number for LocalInference.
+ FOUNDATION_EXPORT double LocalInferenceVersionNumber;
+
+ //! Project version string for LocalInference.
+ FOUNDATION_EXPORT const unsigned char LocalInferenceVersionString[];
+
+ // In this header, you should import all the public headers of your framework using statements like #import <LocalInference/PublicHeader.h>